Predict whether a customer is likely to return

Model ranking by minority category (customer returns) F1 score  

Decision Tree: 27%  
Naive Bayes: 24%  
KNN: 22%  
Random Forest: 21%  
Logistic Regression: 14%  
XGBoost: 18%  
CatBoost: 15%  
SVM: 10%  

Best Decision Tree F1 with Tuning:  36%  
Best Parameters: 'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
data = pd.read_csv("../data/TechCorner_Sales_converted.csv", index_col=0)

In [34]:
data.columns

Index(['Date', 'Cus. Location', 'Age', 'Gender', 'Sell Price', 'from FB',
       'follows page', 'bought before', 'heard of shop', 'is_local', 'is_male',
       'Mobile Name_Galaxy M35 5G 8/128',
       'Mobile Name_Galaxy S24 Ultra 12/256', 'Mobile Name_Moto G85 5G 8/128',
       'Mobile Name_Narzo N53 4/64', 'Mobile Name_Note 11S 6/128',
       'Mobile Name_Note 14 Pro 5G 8/256', 'Mobile Name_Pixel 7a 8/128',
       'Mobile Name_Pixel 8 Pro 12/256', 'Mobile Name_R-70 Turbo 5G 6/128',
       'Mobile Name_Redmi Note 12 Pro 8/128', 'Mobile Name_Vivo T3x 5G 8/128',
       'Mobile Name_Vivo Y200 5G 6/128', 'Mobile Name_iPhone 16 Pro 256GB',
       'Mobile Name_iPhone 16 Pro Max 1TB',
       'Mobile Name_iQOO Neo 9 Pro 5G 12/256', 'Mobile Name_iQOO Z7 5G 6/128'],
      dtype='object')

Define Target and Features

In [3]:
# Target column: binary encoded version of returning customer status
y = data['bought before']

# Drop target and irrelevant columns
X = data.drop(columns=['Date', 'Cus. Location', 'Gender', 'bought before'])

Train-Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7, stratify=y
)

Check class imbalance, add a SMOTE version dataset

In [5]:
# Check class imbalance
print(y_train.value_counts())

bought before
0    5341
1    1755
Name: count, dtype: int64


In [6]:
# Apply SMOTE to oversample the minority class
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='auto', random_state=7)    
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [7]:
# Check class imbalance
print(y_train_sm.value_counts())

bought before
0    5341
1    5341
Name: count, dtype: int64


Scale age and sell price on both versions of datasets

In [8]:
sc = StandardScaler()
X_train_scaled = X_train.copy()
X_train_sm_scaled = X_train_sm.copy()

X_test_scaled = X_test.copy()

X_train_scaled[['Age', 'Sell Price']] = sc.fit_transform(X_train_scaled[['Age', 'Sell Price']])
X_train_sm_scaled[['Age', 'Sell Price']] = sc.fit_transform(X_train_sm_scaled[['Age', 'Sell Price']])

X_test_scaled[['Age', 'Sell Price']] = sc.fit_transform(X_test_scaled[['Age', 'Sell Price']])

Train and compare classifier models' confusion matrices on both dataset versions  
Focus on best F1 and Recall for the minority class (which customers returned)

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

***Decision Tree***

In [14]:
# Decision Tree basic
model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.76      0.76      1336
           1       0.25      0.24      0.24       439

    accuracy                           0.63      1775
   macro avg       0.50      0.50      0.50      1775
weighted avg       0.63      0.63      0.63      1775

Confusion Matrix:
 [[1013  323]
 [ 334  105]]


In [15]:
# Decision Tree scaled
model = tree.DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.74      0.75      1336
           1       0.25      0.27      0.26       439

    accuracy                           0.62      1775
   macro avg       0.50      0.50      0.50      1775
weighted avg       0.63      0.62      0.63      1775

Confusion Matrix:
 [[983 353]
 [319 120]]


In [17]:
# Decision Tree smote
model = tree.DecisionTreeClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.66      0.70      1336
           1       0.23      0.32      0.27       439

    accuracy                           0.57      1775
   macro avg       0.49      0.49      0.48      1775
weighted avg       0.62      0.57      0.59      1775

Confusion Matrix:
 [[878 458]
 [299 140]]


***Naive Bayes***

In [26]:
# Naive Bayes basic
model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.98      0.85      1336
           1       0.15      0.01      0.02       439

    accuracy                           0.74      1775
   macro avg       0.45      0.50      0.43      1775
weighted avg       0.60      0.74      0.65      1775

Confusion Matrix:
 [[1314   22]
 [ 435    4]]


In [27]:
# Naive Bayes scaled
model = GaussianNB()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81      1336
           1       0.22      0.10      0.14       439

    accuracy                           0.69      1775
   macro avg       0.48      0.49      0.47      1775
weighted avg       0.62      0.69      0.64      1775

Confusion Matrix:
 [[1183  153]
 [ 396   43]]


In [28]:
# Naive Bayes smote
model = GaussianNB()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.77      0.76      1336
           1       0.25      0.23      0.24       439

    accuracy                           0.64      1775
   macro avg       0.50      0.50      0.50      1775
weighted avg       0.63      0.64      0.63      1775

Confusion Matrix:
 [[1030  306]
 [ 337  102]]


***KNN***

In [23]:
# KNN basic
model = KNeighborsClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82      1336
           1       0.24      0.10      0.14       439

    accuracy                           0.70      1775
   macro avg       0.50      0.50      0.48      1775
weighted avg       0.63      0.70      0.65      1775

Confusion Matrix:
 [[1198  138]
 [ 395   44]]


In [24]:
# KNN scaled
model = KNeighborsClassifier()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81      1336
           1       0.22      0.10      0.14       439

    accuracy                           0.69      1775
   macro avg       0.49      0.49      0.48      1775
weighted avg       0.62      0.69      0.65      1775

Confusion Matrix:
 [[1184  152]
 [ 395   44]]


***Random Forest***

In [25]:
# KNN smote
model = KNeighborsClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.78      0.76      1336
           1       0.24      0.21      0.22       439

    accuracy                           0.64      1775
   macro avg       0.49      0.49      0.49      1775
weighted avg       0.62      0.64      0.63      1775

Confusion Matrix:
 [[1043  293]
 [ 348   91]]


In [10]:
# Random Forest basic
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.94      0.83      1336
           1       0.21      0.05      0.08       439

    accuracy                           0.72      1775
   macro avg       0.48      0.49      0.46      1775
weighted avg       0.62      0.72      0.65      1775

Confusion Matrix:
 [[1252   84]
 [ 417   22]]


In [11]:
# Random Forest scaled
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.93      0.83      1336
           1       0.19      0.05      0.08       439

    accuracy                           0.72      1775
   macro avg       0.47      0.49      0.45      1775
weighted avg       0.61      0.72      0.65      1775

Confusion Matrix:
 [[1249   87]
 [ 418   21]]


In [13]:
# Random Forest smote
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.77      0.76      1336
           1       0.23      0.20      0.21       439

    accuracy                           0.63      1775
   macro avg       0.49      0.49      0.49      1775
weighted avg       0.62      0.63      0.62      1775

Confusion Matrix:
 [[1035  301]
 [ 351   88]]


***Logistic Regression***

In [22]:
# Logistic Regression smote (non-smote had no minority class predictions)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.90      0.82      1336
           1       0.24      0.10      0.14       439

    accuracy                           0.70      1775
   macro avg       0.50      0.50      0.48      1775
weighted avg       0.63      0.70      0.65      1775

Confusion Matrix:
 [[1200  136]
 [ 396   43]]


***XGBoost***

In [None]:
# XGBoost basic
model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.95      0.84      1336
           1       0.24      0.05      0.08       439

    accuracy                           0.73      1775
   macro avg       0.50      0.50      0.46      1775
weighted avg       0.63      0.73      0.65      1775

Confusion Matrix:
 [[1273   63]
 [ 419   20]]


In [31]:
# XGBoost scaled
model = XGBClassifier()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.93      0.83      1336
           1       0.21      0.05      0.08       439

    accuracy                           0.72      1775
   macro avg       0.48      0.49      0.46      1775
weighted avg       0.62      0.72      0.65      1775

Confusion Matrix:
 [[1248   88]
 [ 416   23]]


In [32]:
# XGBoost smote
model = XGBClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.84      0.79      1336
           1       0.23      0.14      0.18       439

    accuracy                           0.67      1775
   macro avg       0.49      0.49      0.48      1775
weighted avg       0.62      0.67      0.64      1775

Confusion Matrix:
 [[1123  213]
 [ 376   63]]


***CatBoost***

In [35]:
# CatBoost smote (non-smote had no minority class predictions)
model = CatBoostClassifier()
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

Learning rate set to 0.028325
0:	learn: 0.6889038	total: 21.4ms	remaining: 21.4s
1:	learn: 0.6849917	total: 32.7ms	remaining: 16.3s
2:	learn: 0.6813245	total: 41.2ms	remaining: 13.7s
3:	learn: 0.6781989	total: 51.6ms	remaining: 12.9s
4:	learn: 0.6751468	total: 58.9ms	remaining: 11.7s
5:	learn: 0.6719160	total: 68.4ms	remaining: 11.3s
6:	learn: 0.6690170	total: 77.6ms	remaining: 11s
7:	learn: 0.6661455	total: 89.4ms	remaining: 11.1s
8:	learn: 0.6635457	total: 99.1ms	remaining: 10.9s
9:	learn: 0.6615584	total: 110ms	remaining: 10.9s
10:	learn: 0.6590377	total: 123ms	remaining: 11.1s
11:	learn: 0.6570732	total: 133ms	remaining: 10.9s
12:	learn: 0.6548574	total: 146ms	remaining: 11.1s
13:	learn: 0.6528645	total: 155ms	remaining: 10.9s
14:	learn: 0.6510706	total: 162ms	remaining: 10.6s
15:	learn: 0.6495648	total: 170ms	remaining: 10.5s
16:	learn: 0.6479943	total: 179ms	remaining: 10.3s
17:	learn: 0.6462455	total: 187ms	remaining: 10.2s
18:	learn: 0.6445215	total: 198ms	remaining: 10.2s
19:	

In [36]:
# CatBoost
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.89      0.81      1336
           1       0.23      0.11      0.15       439

    accuracy                           0.69      1775
   macro avg       0.49      0.50      0.48      1775
weighted avg       0.62      0.69      0.65      1775

Confusion Matrix:
 [[1183  153]
 [ 392   47]]


***SVM***

In [39]:
# SVM smote (non-smote had no minority class predictions)
model = SVC(probability = True)
model.fit(X_train_sm_scaled, y_train_sm)

y_pred = model.predict(X_test_scaled)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.94      0.83      1336
           1       0.24      0.06      0.10       439

    accuracy                           0.72      1775
   macro avg       0.50      0.50      0.47      1775
weighted avg       0.63      0.72      0.65      1775

Confusion Matrix:
 [[1252   84]
 [ 412   27]]


Tune the Decision Tree with GridSearchCV

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

In [41]:
# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [43]:
# Set up GridSearchCV
grid = GridSearchCV(
    estimator = tree.DecisionTreeClassifier(),
    param_grid = param_grid,
    cv = 5,
    scoring = 'f1',
    n_jobs = -1,
    verbose = 2
    )

In [44]:
# Fit to training data
grid.fit(X_train_sm_scaled, y_train_sm)


Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [45]:
# Best model
best_tree = grid.best_estimator_

# Predict
y_pred = best_tree.predict(X_test_scaled)

# Evaluate
print("Best Parameters:", grid.best_params_)
print("F1 Score:", f1_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
F1 Score: 0.35918367346938773
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.43      0.55      1336
           1       0.26      0.60      0.36       439

    accuracy                           0.47      1775
   macro avg       0.51      0.51      0.45      1775
weighted avg       0.64      0.47      0.50      1775

