Predict whether a customer is likely to return

Model ranking by minority category (customer returns) F1 score  

Decision Tree: 27%  
Naive Bayes: 24%  
KNN: 22%  
Random Forest: 21%  
Logistic Regression: 14%  
XGBoost: 18%  
CatBoost: 15%  
SVM: 10%  

Best Decision Tree F1 with Tuning:  36%  
Best Parameters: 'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2

In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix


In [38]:
data = pd.read_csv("../data/TechCorner_Sales_converted.csv", index_col=0)

In [39]:
data.columns

Index(['Date', 'Cus_Location', 'Age', 'Gender', 'SellPrice', 'from_FB',
       'follows_page', 'bought_before', 'heard_of_shop', 'is_local', 'is_male',
       'Mobile Name_Galaxy M35 5G 8/128',
       'Mobile Name_Galaxy S24 Ultra 12/256', 'Mobile Name_Moto G85 5G 8/128',
       'Mobile Name_Narzo N53 4/64', 'Mobile Name_Note 11S 6/128',
       'Mobile Name_Note 14 Pro 5G 8/256', 'Mobile Name_Pixel 7a 8/128',
       'Mobile Name_Pixel 8 Pro 12/256', 'Mobile Name_R-70 Turbo 5G 6/128',
       'Mobile Name_Redmi Note 12 Pro 8/128', 'Mobile Name_Vivo T3x 5G 8/128',
       'Mobile Name_Vivo Y200 5G 6/128', 'Mobile Name_iPhone 16 Pro 256GB',
       'Mobile Name_iPhone 16 Pro Max 1TB',
       'Mobile Name_iQOO Neo 9 Pro 5G 12/256', 'Mobile Name_iQOO Z7 5G 6/128'],
      dtype='object')

Define Target and Features

In [40]:
# Target column: binary encoded version of returning customer status
y = data['bought_before']

# Drop target and irrelevant columns
X = data.drop(columns=['Date', 'Cus_Location', 'Gender', 'bought_before'])

Train-Test Split

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7, stratify=y
)

Check class imbalance, add a SMOTE version dataset

In [42]:
# Check class imbalance
print(y_train.value_counts())

bought_before
0    5341
1    1755
Name: count, dtype: int64


In [43]:
# Apply SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='auto', random_state=7)    
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [44]:
# Check class imbalance
print(y_train_sm.value_counts())

bought_before
0    5341
1    5341
Name: count, dtype: int64


Scale age and SellPrice on both versions of datasets

In [45]:
sc = StandardScaler()
X_train_scaled = X_train.copy()
X_train_sm_scaled = X_train_sm.copy()

X_test_scaled = X_test.copy()

X_train_scaled[['Age', 'SellPrice']] = sc.fit_transform(X_train_scaled[['Age', 'SellPrice']])
X_train_sm_scaled[['Age', 'SellPrice']] = sc.fit_transform(X_train_sm_scaled[['Age', 'SellPrice']])

X_test_scaled[['Age', 'SellPrice']] = sc.fit_transform(X_test_scaled[['Age', 'SellPrice']])

Train and compare classifier models' confusion matrices on both dataset versions  
Focus on best F1 and Recall for the minority class (which customers returned)

In [46]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

***Decision Tree***

In [47]:
# Decision Tree basic
model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.76      0.76      1336
           1       0.26      0.25      0.25       439

    accuracy                           0.64      1775
   macro avg       0.51      0.51      0.51      1775
weighted avg       0.63      0.64      0.63      1775

Confusion Matrix:
 [[1021  315]
 [ 330  109]]


In [48]:
# Decision Tree scaled
model = tree.DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.75      0.75      1336
           1       0.26      0.27      0.27       439

    accuracy                           0.63      1775
   macro avg       0.51      0.51      0.51      1775
weighted avg       0.63      0.63      0.63      1775

Confusion Matrix:
 [[997 339]
 [319 120]]


In [49]:
# Decision Tree smote
model = tree.DecisionTreeClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.65      0.70      1336
           1       0.24      0.33      0.27       439

    accuracy                           0.57      1775
   macro avg       0.49      0.49      0.49      1775
weighted avg       0.62      0.57      0.59      1775

Confusion Matrix:
 [[871 465]
 [295 144]]


***Naive Bayes***

In [50]:
# Naive Bayes basic
model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.98      0.85      1336
           1       0.15      0.01      0.02       439

    accuracy                           0.74      1775
   macro avg       0.45      0.50      0.43      1775
weighted avg       0.60      0.74      0.65      1775

Confusion Matrix:
 [[1314   22]
 [ 435    4]]


In [51]:
# Naive Bayes scaled
model = GaussianNB()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81      1336
           1       0.22      0.10      0.14       439

    accuracy                           0.69      1775
   macro avg       0.48      0.49      0.47      1775
weighted avg       0.62      0.69      0.64      1775

Confusion Matrix:
 [[1183  153]
 [ 396   43]]


In [52]:
# Naive Bayes smote
model = GaussianNB()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.77      0.76      1336
           1       0.25      0.23      0.24       439

    accuracy                           0.64      1775
   macro avg       0.50      0.50      0.50      1775
weighted avg       0.63      0.64      0.63      1775

Confusion Matrix:
 [[1030  306]
 [ 337  102]]


***KNN***

In [53]:
# KNN basic
model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82      1336
           1       0.24      0.10      0.14       439

    accuracy                           0.70      1775
   macro avg       0.50      0.50      0.48      1775
weighted avg       0.63      0.70      0.65      1775

Confusion Matrix:
 [[1198  138]
 [ 395   44]]


In [54]:
# KNN scaled
model = KNeighborsClassifier()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81      1336
           1       0.22      0.10      0.14       439

    accuracy                           0.69      1775
   macro avg       0.49      0.49      0.48      1775
weighted avg       0.62      0.69      0.65      1775

Confusion Matrix:
 [[1184  152]
 [ 395   44]]


***Random Forest***

In [55]:
# KNN smote
model = KNeighborsClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.78      0.76      1336
           1       0.24      0.21      0.22       439

    accuracy                           0.64      1775
   macro avg       0.49      0.49      0.49      1775
weighted avg       0.62      0.64      0.63      1775

Confusion Matrix:
 [[1043  293]
 [ 348   91]]


In [56]:
# Random Forest basic
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.93      0.83      1336
           1       0.21      0.05      0.08       439

    accuracy                           0.72      1775
   macro avg       0.48      0.49      0.46      1775
weighted avg       0.62      0.72      0.65      1775

Confusion Matrix:
 [[1247   89]
 [ 416   23]]


In [57]:
# Random Forest scaled
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.94      0.84      1336
           1       0.21      0.05      0.08       439

    accuracy                           0.72      1775
   macro avg       0.48      0.50      0.46      1775
weighted avg       0.62      0.72      0.65      1775

Confusion Matrix:
 [[1262   74]
 [ 419   20]]


In [58]:
# Random Forest smote
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77      1336
           1       0.24      0.21      0.22       439

    accuracy                           0.65      1775
   macro avg       0.50      0.50      0.50      1775
weighted avg       0.63      0.65      0.63      1775

Confusion Matrix:
 [[1056  280]
 [ 349   90]]


***Logistic Regression***

In [59]:
# Logistic Regression smote (non-smote had no minority class predictions)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82      1336
           1       0.24      0.10      0.14       439

    accuracy                           0.70      1775
   macro avg       0.50      0.50      0.48      1775
weighted avg       0.63      0.70      0.65      1775

Confusion Matrix:
 [[1200  136]
 [ 396   43]]


***XGBoost***

In [60]:
# XGBoost basic
model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.95      0.84      1336
           1       0.24      0.05      0.08       439

    accuracy                           0.73      1775
   macro avg       0.50      0.50      0.46      1775
weighted avg       0.63      0.73      0.65      1775

Confusion Matrix:
 [[1273   63]
 [ 419   20]]


In [61]:
# XGBoost scaled
model = XGBClassifier()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.93      0.83      1336
           1       0.21      0.05      0.08       439

    accuracy                           0.72      1775
   macro avg       0.48      0.49      0.46      1775
weighted avg       0.62      0.72      0.65      1775

Confusion Matrix:
 [[1248   88]
 [ 416   23]]


In [62]:
# XGBoost smote
model = XGBClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.84      0.79      1336
           1       0.23      0.14      0.18       439

    accuracy                           0.67      1775
   macro avg       0.49      0.49      0.48      1775
weighted avg       0.62      0.67      0.64      1775

Confusion Matrix:
 [[1123  213]
 [ 376   63]]


***CatBoost***

In [63]:
# CatBoost smote (non-smote had no minority class predictions)
model = CatBoostClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

Learning rate set to 0.028325
0:	learn: 0.6889038	total: 136ms	remaining: 2m 15s
1:	learn: 0.6849917	total: 143ms	remaining: 1m 11s
2:	learn: 0.6813245	total: 151ms	remaining: 50.1s
3:	learn: 0.6781989	total: 158ms	remaining: 39.3s
4:	learn: 0.6751468	total: 166ms	remaining: 33.1s
5:	learn: 0.6719160	total: 173ms	remaining: 28.6s
6:	learn: 0.6690170	total: 181ms	remaining: 25.6s
7:	learn: 0.6661455	total: 188ms	remaining: 23.3s
8:	learn: 0.6635457	total: 195ms	remaining: 21.5s
9:	learn: 0.6615584	total: 203ms	remaining: 20.1s
10:	learn: 0.6590377	total: 209ms	remaining: 18.8s
11:	learn: 0.6570732	total: 217ms	remaining: 17.9s
12:	learn: 0.6548574	total: 224ms	remaining: 17s
13:	learn: 0.6528645	total: 231ms	remaining: 16.3s
14:	learn: 0.6510706	total: 237ms	remaining: 15.6s
15:	learn: 0.6495648	total: 245ms	remaining: 15s
16:	learn: 0.6479943	total: 254ms	remaining: 14.7s
17:	learn: 0.6462455	total: 260ms	remaining: 14.2s
18:	learn: 0.6445215	total: 268ms	remaining: 13.8s
19:	learn: 0.

In [64]:
# CatBoost
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81      1336
           1       0.23      0.11      0.15       439

    accuracy                           0.69      1775
   macro avg       0.49      0.50      0.48      1775
weighted avg       0.62      0.69      0.65      1775

Confusion Matrix:
 [[1183  153]
 [ 392   47]]


***SVM***

In [65]:
# SVM smote (non-smote had no minority class predictions)
model = SVC(probability = True)
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.94      0.83      1336
           1       0.24      0.06      0.10       439

    accuracy                           0.72      1775
   macro avg       0.50      0.50      0.47      1775
weighted avg       0.63      0.72      0.65      1775

Confusion Matrix:
 [[1252   84]
 [ 412   27]]


Tune the Decision Tree with GridSearchCV

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [67]:
# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [68]:
# Set up GridSearchCV
grid = GridSearchCV(
    estimator = tree.DecisionTreeClassifier(),
    param_grid = param_grid,
    cv = 5,
    scoring = 'f1',
    n_jobs = -1,
    verbose = 2
    )

In [69]:
# Fit to training data
grid.fit(X_train_sm_scaled, y_train_sm)


Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [70]:
# Best model
best_tree = grid.best_estimator_

# Predict
y_pred = best_tree.predict(X_test_scaled)

# Evaluate
print("Best Parameters:", grid.best_params_)
print("F1 Score:", f1_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
F1 Score: 0.35918367346938773
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.43      0.55      1336
           1       0.26      0.60      0.36       439

    accuracy                           0.47      1775
   macro avg       0.51      0.51      0.45      1775
weighted avg       0.64      0.47      0.50      1775



Export trained model

In [71]:
import joblib

joblib.dump(best_tree, 'phone_customer_again.pkl')
joblib.dump(sc, 'scaler.pkl')

['scaler.pkl']