# Classification algorithms

Compare Random Forest and XGBoost for classifying customers into clusters. For new customers, XGBoost with SMOTE+Tomek is also tried.

In [None]:
# Import necessary libraries
import pickle
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTENC
from xgboost import XGBRegressor, XGBClassifier

## Recurring customers

### Random forest

Random search:

In [None]:
# Import data
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_recurring_clusters.pkl', 'rb') as file:
    data_recurring_clusters = pickle.load(file)

In [None]:
# Sample up to 100 000 rows to perform a random search
# random.sample raises ValueError when asked for more items than exist,
# so the sample size is capped at the number of available rows.
n_sample = min(100000, len(data_recurring_clusters))
index_sample = random.sample(range(len(data_recurring_clusters)), n_sample)
# Keep only the sampled rows (positional indexing)
data_recurring_clusters = data_recurring_clusters.iloc[index_sample]

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_recurring_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_recurring_clusters["Cluster"]
# Hold out 25 % of the rows as a test set
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# Candidate hyperparameter values for the random forest random search
# number of decision trees in the forest
n_estimators = list(range(40, 81, 10))
# split-quality criterion
criterion = ["gini", "entropy"]
# rule for the number of features considered at every split
max_features = ["log2", "sqrt"]
# maximum number of levels in each tree
max_depth = [3, 5, 10, 15, 20]
# minimum share of observations required to split a node (floats, i.e. fractions)
min_samples_split = [0.0005, 0.001, 0.0025, 0.005]
# build trees on bootstrap samples
bootstrap = [True]

# Collect the candidate values into the grid used by the random search
random_grid = dict(
    n_estimators = n_estimators,
    criterion = criterion,
    max_features = max_features,
    max_depth = max_depth,
    min_samples_split = min_samples_split,
    bootstrap = bootstrap,
)

In [None]:
# Base estimator whose hyperparameters are tuned
rf = RandomForestClassifier()
# Random search: 50 sampled combinations (out of 400 possible ones), 2-fold
# cross-validation, scored with the weighted one-vs-rest ROC AUC, all cores
random_search = RandomizedSearchCV(
    rf,
    random_grid,
    n_iter = 50,
    scoring = "roc_auc_ovr_weighted",
    cv = 2,
    n_jobs = -1,
    verbose = 2,
)

In [None]:
# Run the random search on the training split
random_search.fit(X = train_X, y = train_y)

In [None]:
# Show the hyperparameter combination that scored best in the random search
random_search.best_params_

Model:

In [None]:
# Import data (full data set, replacing the sample used for the random search)
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_recurring_clusters.pkl', 'rb') as file:
    data_recurring_clusters = pickle.load(file)

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_recurring_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_recurring_clusters["Cluster"]
# Hold out 25 % of the rows as a test set
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# Random forest configured with the hyperparameters selected for recurring customers
rf = RandomForestClassifier(
    n_estimators = 80,
    criterion = "entropy",
    max_depth = 20,
    max_features = "log2",
    min_samples_split = 0.0005,
    bootstrap = True,
)

In [None]:
# Train the random forest on the training split
rf.fit(X = train_X, y = train_y)

In [None]:
# Class probabilities and hard class predictions for the test set
pred_prob = rf.predict_proba(test_X)
pred = rf.predict(test_X)
# Accuracy of the hard predictions
accuracy = accuracy_score(y_true = test_y, y_pred = pred)
# Weighted one-vs-rest ROC AUC computed from the class probabilities
auc = roc_auc_score(y_true = test_y, y_score = pred_prob, average = "weighted", multi_class = "ovr")

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")

In [None]:
# Confusion matrix
cm = confusion_matrix(test_y, pred)
# NOTE(review): the labels assume the sorted cluster values correspond to Low, Medium, High - confirm
cm_display = ConfusionMatrixDisplay(cm, display_labels = ["Low", "Medium", "High"])
# Create the one figure the plot is drawn on; no separate plt.figure() call is
# needed (an extra call would only leave an unused empty figure open)
fig, ax = plt.subplots(figsize=(8,6))
cm_display.plot(cmap = "Blues", values_format = "d", colorbar = False, ax = ax)
# Enlarge the cell counts, axis labels and tick labels for readability
for text in cm_display.text_.ravel():
    text.set_fontsize(14)
ax.set_xlabel("Predicted label", fontsize=14)
ax.set_ylabel("True label", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
# Save this specific figure, then display it
fig.savefig("CM_RF_Recurring.pdf", bbox_inches='tight')
plt.show()

### XGBoost

Random search:

In [None]:
# Import data
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_recurring_clusters.pkl', 'rb') as file:
    data_recurring_clusters = pickle.load(file)

In [None]:
# Sample up to 1 000 000 rows to perform a random search
# random.sample raises ValueError when asked for more items than exist,
# so the sample size is capped at the number of available rows.
n_sample = min(1000000, len(data_recurring_clusters))
index_sample = random.sample(range(len(data_recurring_clusters)), n_sample)
# Keep only the sampled rows (positional indexing)
data_recurring_clusters = data_recurring_clusters.iloc[index_sample]

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_recurring_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_recurring_clusters["Cluster"]
# Hold out 25 % of the rows as a test set
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# Candidate hyperparameter values for the XGBoost random search
params = dict(
    n_estimators = [70, 100, 200],
    min_child_weight = [1, 2, 3],
    gamma = [0.0, 0.1, 0.2],
    subsample = [0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree = [0.6, 0.7, 0.8, 0.9, 1.0],
    max_depth = list(range(2, 8)),
    objective = ['multi:softmax'],
    booster = ['gbtree'],
    eta = [0.3, 0.4, 0.5],
)

In [None]:
# Base XGBoost classifier whose hyperparameters are tuned
cla = XGBClassifier(nthread=-1)
# Number of parameter combinations the random search samples
n_iter_search = 50
# Random search with 5-fold cross-validation, scored with the weighted one-vs-rest ROC AUC
random_search = RandomizedSearchCV(
    estimator=cla,
    param_distributions=params,
    n_iter=n_iter_search,
    scoring='roc_auc_ovr_weighted',
    cv=5,
)

In [None]:
# Run the random search on the training split
random_search.fit(X = train_X, y = train_y)

In [None]:
# Show the hyperparameter combination that scored best in the random search
random_search.best_params_

Model:

In [None]:
# Import data (full data set, replacing the sample used for the random search)
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_recurring_clusters.pkl', 'rb') as file:
    data_recurring_clusters = pickle.load(file)

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_recurring_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_recurring_clusters["Cluster"]
# Hold out 25 % of the rows as a test set
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# XGBoost classifier configured with the hyperparameters selected for recurring customers
model_xgb = XGBClassifier(
    booster = 'gbtree',
    objective = 'multi:softmax',
    n_estimators = 200,
    eta = 0.3,
    max_depth = 6,
    min_child_weight = 3,
    gamma = 0.2,
    subsample = 0.9,
    colsample_bytree = 0.6,
)

In [None]:
# Train the XGBoost classifier on the training split
model_xgb.fit(X = train_X, y = train_y)

In [None]:
# Class probabilities and hard class predictions for the test set
pred_prob = model_xgb.predict_proba(test_X)
pred = model_xgb.predict(test_X)
# Accuracy of the hard predictions
accuracy = accuracy_score(y_true = test_y, y_pred = pred)
# Weighted one-vs-rest ROC AUC computed from the class probabilities
auc = roc_auc_score(y_true = test_y, y_score = pred_prob, average = "weighted", multi_class = "ovr")

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")

In [None]:
# Confusion matrix
cm = confusion_matrix(test_y, pred)
# NOTE(review): the labels assume the sorted cluster values correspond to Low, Medium, High - confirm
cm_display = ConfusionMatrixDisplay(cm, display_labels = ["Low", "Medium", "High"])
# Create the one figure the plot is drawn on; no separate plt.figure() call is
# needed (an extra call would only leave an unused empty figure open)
fig, ax = plt.subplots(figsize=(8,6))
cm_display.plot(cmap = "Blues", values_format = "d", colorbar = False, ax = ax)
# Enlarge the cell counts, axis labels and tick labels for readability
for text in cm_display.text_.ravel():
    text.set_fontsize(14)
ax.set_xlabel("Predicted label", fontsize=14)
ax.set_ylabel("True label", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
# Save this specific figure, then display it
fig.savefig("CM_XGBoost_Recurring.pdf", bbox_inches='tight')
plt.show()

## New customers

### Random forest 

Random search:

In [None]:
# Import data
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Sample up to 100 000 rows to perform a random search
# random.sample raises ValueError when asked for more items than exist,
# so the sample size is capped at the number of available rows.
n_sample = min(100000, len(data_new_clusters))
index_sample = random.sample(range(len(data_new_clusters)), n_sample)
# Keep only the sampled rows (positional indexing)
data_new_clusters = data_new_clusters.iloc[index_sample]

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_new_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_new_clusters["Cluster"]
# Hold out 25 % of the rows as a test set
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# Candidate hyperparameter values for the random forest random search
# number of decision trees in the forest
n_estimators = list(range(40, 81, 10))
# split-quality criterion
criterion = ["gini", "entropy"]
# rule for the number of features considered at every split
max_features = ["log2", "sqrt"]
# maximum number of levels in each tree
max_depth = [3, 5, 10, 15, 20]
# minimum share of observations required to split a node (floats, i.e. fractions)
min_samples_split = [0.0005, 0.001, 0.0025, 0.005]
# build trees on bootstrap samples
bootstrap = [True]

# Collect the candidate values into the grid used by the random search
random_grid = dict(
    n_estimators = n_estimators,
    criterion = criterion,
    max_features = max_features,
    max_depth = max_depth,
    min_samples_split = min_samples_split,
    bootstrap = bootstrap,
)

In [None]:
# Base estimator whose hyperparameters are tuned
rf = RandomForestClassifier()
# Random search: 50 sampled combinations (out of 400 possible ones), 2-fold
# cross-validation, scored with the weighted one-vs-rest ROC AUC, all cores
random_search = RandomizedSearchCV(
    rf,
    random_grid,
    n_iter = 50,
    scoring = "roc_auc_ovr_weighted",
    cv = 2,
    n_jobs = -1,
    verbose = 2,
)

In [None]:
# Run the random search on the training split
random_search.fit(X = train_X, y = train_y)

In [None]:
# Show the hyperparameter combination that scored best in the random search
random_search.best_params_

Model:

In [None]:
# Import data (full data set, replacing the sample used for the random search)
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Divide data into dependent and independent variables
y = data_new_clusters["Cluster"]
# The revenue column is named "Total revenue" (with a space), as in every other
# cell that reads this data set; it is dropped together with the target so it
# is not used as a feature.
X = data_new_clusters.drop(["Total revenue", "Cluster"], axis = "columns")
# Divide data into train and test data sets (25 % held out for testing)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# Random forest configured with the hyperparameters selected for new customers
rf = RandomForestClassifier(
    n_estimators = 80,
    criterion = "entropy",
    max_depth = 10,
    max_features = "sqrt",
    min_samples_split = 0.0005,
    bootstrap = True,
)

In [None]:
# Fit the random forest on the training split
rf.fit(X = train_X, y = train_y)

In [None]:
# Class probabilities and hard class predictions for the test set
pred_prob = rf.predict_proba(test_X)
pred = rf.predict(test_X)
# Accuracy of the hard predictions
accuracy = accuracy_score(y_true = test_y, y_pred = pred)
# Weighted one-vs-rest ROC AUC computed from the class probabilities
auc = roc_auc_score(y_true = test_y, y_score = pred_prob, average = "weighted", multi_class = "ovr")

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")

In [None]:
# Confusion matrix
cm = confusion_matrix(test_y, pred)
# NOTE(review): the labels assume the sorted cluster values correspond to Low, Medium, High - confirm
cm_display = ConfusionMatrixDisplay(cm, display_labels = ["Low", "Medium", "High"])
# Create the one figure the plot is drawn on; no separate plt.figure() call is
# needed (an extra call would only leave an unused empty figure open)
fig, ax = plt.subplots(figsize=(8,6))
cm_display.plot(cmap = "Blues", values_format = "d", colorbar = False, ax = ax)
# Enlarge the cell counts, axis labels and tick labels for readability
for text in cm_display.text_.ravel():
    text.set_fontsize(14)
ax.set_xlabel("Predicted label", fontsize=14)
ax.set_ylabel("True label", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
# Save this specific figure, then display it
fig.savefig("CM_RF_New.pdf", bbox_inches='tight')
plt.show()

### XGBoost

Random search:

In [None]:
# Import data
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_new_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_new_clusters["Cluster"]
# Hold out 25 % of the rows as a test set
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# Set parameters that will be checked for a random search
params = {
    'n_estimators':[70, 100, 200],
    'min_child_weight':[1, 2, 3],
    'gamma':[i/10.0 for i in range(0,3)],
    'subsample':[i/10.0 for i in range(6,11)],
    'colsample_bytree':[i/10.0 for i in range(6,11)],
    # Depth 5 is included so the candidate range 2-7 has no gap
    'max_depth': [2,3,4,5,6,7],
    'objective': ['multi:softmax'],
    # NOTE(review): tree-specific settings (e.g. max_depth) presumably have no
    # effect when the sampled booster is 'gblinear' - confirm
    'booster': ['gbtree', 'gblinear', 'dart'],
    'eta': [i/10.0 for i in range(3,6)]
}

In [None]:
# Base XGBoost classifier whose hyperparameters are tuned
cla = XGBClassifier(nthread=-1)
# Number of parameter combinations the random search samples
n_iter_search = 50
# Random search with 5-fold cross-validation, scored with the weighted one-vs-rest ROC AUC
random_search = RandomizedSearchCV(
    estimator=cla,
    param_distributions=params,
    n_iter=n_iter_search,
    scoring='roc_auc_ovr_weighted',
    cv=5,
)

In [None]:
# Run the random search on the training split
random_search.fit(X = train_X, y = train_y)

In [None]:
# Show the hyperparameter combination that scored best in the random search
random_search.best_params_

Model:

In [None]:
# Import data
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_new_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_new_clusters["Cluster"]
# Hold out 25 % of the rows as a test set
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.25)

In [None]:
# XGBoost classifier configured with the hyperparameters selected for new customers
model_xgb = XGBClassifier(
    booster = 'gbtree',
    objective = 'multi:softmax',
    n_estimators = 200,
    eta = 0.5,
    max_depth = 5,
    min_child_weight = 2,
    gamma = 0.2,
    subsample = 1,
    colsample_bytree = 0.9,
)

In [None]:
# Train the XGBoost classifier on the training split
model_xgb.fit(X = train_X, y = train_y)

In [None]:
# Class probabilities and hard class predictions for the test set
pred_prob = model_xgb.predict_proba(test_X)
pred = model_xgb.predict(test_X)
# Accuracy of the hard predictions
accuracy = accuracy_score(y_true = test_y, y_pred = pred)
# Weighted one-vs-rest ROC AUC computed from the class probabilities
auc = roc_auc_score(y_true = test_y, y_score = pred_prob, average = "weighted", multi_class = "ovr")

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")

In [None]:
# Confusion matrix
cm = confusion_matrix(test_y, pred)
# NOTE(review): the labels assume the sorted cluster values correspond to Low, Medium, High - confirm
cm_display = ConfusionMatrixDisplay(cm, display_labels = ["Low", "Medium", "High"])
# Create the one figure the plot is drawn on; no separate plt.figure() call is
# needed (an extra call would only leave an unused empty figure open)
fig, ax = plt.subplots(figsize=(8,6))
cm_display.plot(cmap = "Blues", values_format = "d", colorbar = False, ax = ax)
# Enlarge the cell counts, axis labels and tick labels for readability
for text in cm_display.text_.ravel():
    text.set_fontsize(14)
ax.set_xlabel("Predicted label", fontsize=14)
ax.set_ylabel("True label", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
# Save this specific figure, then display it
fig.savefig("CM_XGBoost_New.pdf", bbox_inches='tight')
plt.show()

### XGBoost + SMOTE-Tomek

SMOTE-Tomek resampling:

In [None]:
# Import data
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Independent variables: every column except "Total revenue" and "Cluster"
X = data_new_clusters.drop(columns = ["Total revenue", "Cluster"])
# Dependent variable: the cluster label
y = data_new_clusters["Cluster"]

In [None]:
# Boolean mask over the feature columns: True where the column is one of the
# categorical variables listed here
cat_vars = X.columns.isin([
    "Insurance",
    "Co-applicant",
    "Gender",
    "Quarter_2",
    "Quarter_3",
    "Quarter_4",
])

In [None]:
# Hold out 25 % of the rows as a test set; the remainder is the training set
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size = 0.25,
)

In [None]:
# Resample the training data with SMOTE-Tomek links
# Oversampler: SMOTENC, told which columns are categorical via cat_vars.
# NOTE(review): SMOTETomek appears to use a user-supplied SMOTE object as-is
# rather than forwarding its own random_state to it, so the seed is also set
# here to make the oversampling reproducible - confirm against the installed
# imbalanced-learn version.
smote_nc = SMOTENC(categorical_features = cat_vars, sampling_strategy = "not majority",
                   k_neighbors = 5, random_state = 3)
# Combined resampler: SMOTENC oversampling plus Tomek-link removal across all classes
resample = SMOTETomek(smote = smote_nc,
                      tomek = TomekLinks(sampling_strategy = "all"),
                      random_state = 3)
# fit_resample returns the resampled features and the resampled target
resampled_data = resample.fit_resample(train_X, train_y)
train_X_resampled, train_y_resampled = resampled_data

Model:

In [None]:
# Import data
# A forward slash keeps the relative path valid on Windows, Linux and macOS.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('Data clustering/data_new_clusters.pkl', 'rb') as file:
    data_new_clusters = pickle.load(file)

In [None]:
# Define a model with the best parameters
model_xgb = XGBClassifier(subsample = 1, objective = 'multi:softmax', n_estimators = 200, min_child_weight = 2, 
                    max_depth = 5, gamma = 0.2, eta = 0.5, colsample_bytree = 0.9, booster = 'gbtree')
# Fit the model on the SMOTE-Tomek resampled training data; the classifier
# must be fitted before it can produce predictions
model_xgb.fit(train_X_resampled, train_y_resampled)

In [None]:
# Class probabilities and hard class predictions for the test set
pred_prob = model_xgb.predict_proba(test_X)
pred = model_xgb.predict(test_X)
# Accuracy of the hard predictions
accuracy = accuracy_score(y_true = test_y, y_pred = pred)
# Weighted one-vs-rest ROC AUC computed from the class probabilities
auc = roc_auc_score(y_true = test_y, y_score = pred_prob, average = "weighted", multi_class = "ovr")

print(f"Accuracy: {accuracy}")
print(f"AUC: {auc}")

In [None]:
# Confusion matrix
cm = confusion_matrix(test_y, pred)
# NOTE(review): the labels assume the sorted cluster values correspond to Low, Medium, High - confirm
cm_display = ConfusionMatrixDisplay(cm, display_labels = ["Low", "Medium", "High"])
# Create the one figure the plot is drawn on; no separate plt.figure() call is
# needed (an extra call would only leave an unused empty figure open)
fig, ax = plt.subplots(figsize=(8,6))
cm_display.plot(cmap = "Blues", values_format = "d", colorbar = False, ax = ax)
# Enlarge the cell counts, axis labels and tick labels for readability
for text in cm_display.text_.ravel():
    text.set_fontsize(14)
ax.set_xlabel("Predicted label", fontsize=14)
ax.set_ylabel("True label", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
# Save this specific figure, then display it
fig.savefig("CM_XGBoost_ST_New.pdf", bbox_inches='tight')
plt.show()