# ML Modeling

## Libraries

In [1]:
# main libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score, make_scorer

# ML classifier models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# model selection (CV)
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint

## Machine Learning

### Separating the DF into X and Y

In [2]:
bank = pd.read_csv("../data/bank_processed_data.csv", index_col=0)
bank.head()

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,...,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_encoded,Income_Category_encoded,Card_Category_encoded,x0_Married,x0_Single,x0_Unknown,x1_Existing Customer,x2_M
0,45,3,39,5,1,3,12691.0,777,1.335,1144,...,1.625,0.061,2.0,3.0,0.0,1.0,0.0,0.0,1.0,1.0
1,49,5,44,6,1,2,8256.0,864,1.541,1291,...,3.714,0.105,4.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,51,3,36,4,1,0,3418.0,0,2.594,1887,...,2.333,0.0,4.0,4.0,0.0,1.0,0.0,0.0,1.0,1.0
3,40,4,34,3,4,1,3313.0,2517,1.405,1171,...,2.333,0.76,2.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,40,3,21,5,1,0,4716.0,0,2.175,816,...,2.5,0.0,1.0,3.0,0.0,1.0,0.0,0.0,1.0,1.0


In [3]:
# separating X and y
X = bank.drop(columns="x1_Existing Customer")
y = bank["x1_Existing Customer"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [4]:
# Scaling the data

scaler = StandardScaler() # initialize the scaler

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Choosing the Models

The column that I want to identify is **x1_Existing Customer**, being **1** if the customer is still a customer, and **0** otherwise. In this case, as it's a True/False decission, the models that fit better for this type of Supervised ML are the Classifiers.

I will start checking the different models, without parameter tuning, for identify which are the models that perform better.

In [5]:
# Initializing the models

neigh = KNeighborsClassifier()
tree = DecisionTreeClassifier()
gradient = GradientBoostingClassifier()
RF = RandomForestClassifier()
adaboost = AdaBoostClassifier()
extra_tree = ExtraTreesClassifier()
support_vector = SVC()


models = [neigh, tree, RF, adaboost, gradient, extra_tree, support_vector]
model_names = ["KNeighbors", "DecisionTree", "RandomForest", "AdaBoost", 
               "GradientBoost", "ExtraTress", "SVC"]

The data per each category is not balanced, as customers represent 83.8% of the sample, the accuracy here is not relevant. 

In this scenario, I will focus more on *recall*, to ensure that the model classifies correctly the labels, and the precision. Mention that, the main focus will be on the label **0.0**, as it is the customers that already churned the bank, and we want to focus on that part to ensure that our model is able to predict possible future cases and act before churn happens.

Last, but not least, *macro avg* will also be taken into consideration, as we want to ensure that **0.0** are classified correctly, but we want that the amount of **1.0** are good too. I would have to find the perfect balance between those metrics.

#### Finding the best classification model

In [6]:
time_to_train = []
macro_precision = []
macro_recall = []
macro_F1 = []
report_dict = []

for model in models:    
    start = time.time()
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # metrics
    recall = recall_score(y_test, y_pred, average="macro")
    precision = precision_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")
    
    clasf_report = classification_report(y_test, y_pred)
    clasf_report_dict = classification_report(y_test, y_pred, output_dict=True)
    
    print(f"Classification Report of {model} | Precision {round(precision,2)} | Recall {round(recall,2)} | F1 {round(f1,2)}:")
    print(f"{clasf_report}")
    print(f"Training time of {time.time() - start}\n")
    
    # appending to empty lists
    time_to_train.append((time.time() - start))
    macro_precision.append(round(precision,2))
    macro_recall.append(round(recall,2))
    macro_F1.append(round(f1,2))
    report_dict.append(clasf_report_dict)

Classification Report of KNeighborsClassifier() | Precision 0.86 | Recall 0.73 | F1 0.77:
              precision    recall  f1-score   support

         0.0       0.81      0.49      0.61       327
         1.0       0.91      0.98      0.94      1699

    accuracy                           0.90      2026
   macro avg       0.86      0.73      0.77      2026
weighted avg       0.89      0.90      0.89      2026

Training time of 0.801854133605957

Classification Report of DecisionTreeClassifier() | Precision 0.87 | Recall 0.87 | F1 0.87:
              precision    recall  f1-score   support

         0.0       0.79      0.77      0.78       327
         1.0       0.96      0.96      0.96      1699

    accuracy                           0.93      2026
   macro avg       0.87      0.87      0.87      2026
weighted avg       0.93      0.93      0.93      2026

Training time of 0.07579708099365234

Classification Report of RandomForestClassifier() | Precision 0.94 | Recall 0.9 | F1 0.92:

In [7]:
precision_0 = []
recall_0 = []
f1_0 = []
precision_1 = []
recall_1 = []
f1_1 = []

for report in report_dict:
    # Info of churn label
    precision_0.append(round(report["0.0"]["precision"],2))
    recall_0.append(round(report["0.0"]["recall"],2))
    f1_0.append(round(report["0.0"]["f1-score"],2))
    
    # Info of current customers
    precision_1.append(round(report["1.0"]["precision"],2))
    recall_1.append(round(report["1.0"]["recall"],2))
    f1_1.append(round(report["1.0"]["f1-score"],2))

With all the information, I will create a DF for better visualization of the different models, beign able to identify which ones will provide best results for identiying churned customers.

In [8]:
best_models_DF = pd.DataFrame({"model":model_names,
                               "training_time":time_to_train,
                               "precision_macro":macro_precision,
                               "recall_macro":macro_recall,
                               "f1_macro":macro_F1,
                               "precision_0":precision_0,
                               "recall_0":recall_0,
                               "f1_0":f1_0,
                               "precision_1":precision_1,
                               "recall_1":recall_1,
                               "f1_1":f1_1
                               })

In [9]:
top3 = best_models_DF.sort_values(by=["recall_0", "precision_0", "recall_macro"], ascending=False).reset_index(drop=True).iloc[:3]

In [10]:
top3

Unnamed: 0,model,training_time,precision_macro,recall_macro,f1_macro,precision_0,recall_0,f1_0,precision_1,recall_1,f1_1
0,GradientBoost,1.850053,0.95,0.91,0.93,0.94,0.84,0.88,0.97,0.99,0.98
1,AdaBoost,0.49368,0.92,0.9,0.91,0.87,0.82,0.84,0.97,0.98,0.97
2,RandomForest,1.12898,0.94,0.9,0.92,0.93,0.8,0.86,0.96,0.99,0.98


The three models that does the best selection for the churned customers and also for the actual customers are **GradientBoost**, **AdaBoost** and **RandomForest**.

Now that we have in mind which models work best, let's start tuning them for improve their results

### Tuning Models

#### GradientBoost

In [38]:
# start_time = time.time()

# gradient_params = {"loss":["deviance", "exponential"],
#                   "criterion":["friedman_mse", "mse", "mae"],
#                   "max_features":["auto", "sqrt", "log2"],
#                   "n_estimators":randint(low=50, high=300),
#                   "max_depth":randint(low=2, high=8),
#                   "max_leaf_nodes":randint(low=5, high=15)}

# scorers = {"precision_score": make_scorer(precision_score, average="macro"),
#            "recall_score": make_scorer(recall_score, average="macro"),
#            "f1_score": make_scorer(f1_score, average="macro")
#           }

# gradient_search = RandomizedSearchCV(gradient,
#                                      gradient_params,
#                                      n_iter=10,
#                                      n_jobs=-1,
#                                      cv=10,
#                                      scoring=scorers,
#                                      refit=False,
#                                      random_state=42)

# gradient_search.fit(X_train_scaled, y_train)

# print("--- %s seconds ---" % (time.time() - start_time))

--- 811.6379034519196 seconds ---


After the results obtained, I will create a DF for visualize which are the scores for each scoring. After that, I will pick the parameters that performed better for passing it to GridSearchCV.

In [43]:
gradient_results = pd.DataFrame(gradient_search.cv_results_)

gradient_results = gradient_results[["params", "mean_test_precision_score", "rank_test_precision_score",
                                     "mean_test_recall_score", "rank_test_recall_score",
                                     "mean_test_f1_score", "rank_test_f1_score"]]

gradient_results

Unnamed: 0,params,mean_test_precision_score,rank_test_precision_score,mean_test_recall_score,rank_test_recall_score,mean_test_f1_score,rank_test_f1_score
0,"{'criterion': 'mae', 'loss': 'exponential', 'm...",0.926291,6,0.846002,6,0.877009,6
1,"{'criterion': 'friedman_mse', 'loss': 'devianc...",0.953618,4,0.90978,4,0.929682,4
2,"{'criterion': 'mae', 'loss': 'exponential', 'm...",0.910651,7,0.784373,8,0.829159,8
3,"{'criterion': 'mse', 'loss': 'exponential', 'm...",0.9603,3,0.927914,3,0.942923,3
4,"{'criterion': 'friedman_mse', 'loss': 'devianc...",0.962142,2,0.937456,2,0.949083,2
5,"{'criterion': 'friedman_mse', 'loss': 'devianc...",0.962713,1,0.938994,1,0.950104,1
6,"{'criterion': 'mae', 'loss': 'exponential', 'm...",0.909723,8,0.790855,7,0.831012,7
7,"{'criterion': 'mae', 'loss': 'exponential', 'm...",0.877599,10,0.684356,10,0.730526,10
8,"{'criterion': 'mae', 'loss': 'deviance', 'max_...",0.888624,9,0.747059,9,0.791634,9
9,"{'criterion': 'friedman_mse', 'loss': 'exponen...",0.947826,5,0.884469,5,0.912026,5


In [46]:
gradient_results["params"][5]

{'criterion': 'friedman_mse',
 'loss': 'deviance',
 'max_depth': 4,
 'max_features': 'log2',
 'max_leaf_nodes': 14,
 'n_estimators': 269}

In [49]:
start_time = time.time()

gradient_params = {"loss":["deviance"],
                  "criterion":["friedman_mse"],
                  "max_features":["log2"],
                  "n_estimators":[269],
                  "max_depth":[4],
                  "max_leaf_nodes":[14]}

scorers = {"precision_score": make_scorer(precision_score, average="macro"),
           "recall_score": make_scorer(recall_score, average="macro"),
           "f1_score": make_scorer(f1_score, average="macro")
          }

best_gradient_search = GridSearchCV(gradient,
                               gradient_params,
                               n_jobs=-1,
                               cv=50,
                               scoring=scorers,
                               refit=False)

best_gradient_search.fit(X_train_scaled, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

--- 15.88581919670105 seconds ---


After fitting the gradient with the best parameters, we obtain the training time, which is considerable low. Then, we can start creating the **best_gradient** and predict which is going to be the overall *precision*, *recall* and *F1 Score* for the model.

In [54]:
best_gradient = GradientBoostingClassifier(loss="deviance", criterion="friedman_mse",
                                          max_features="log2", n_estimators=269,
                                          max_depth=4, max_leaf_nodes=14)

In [55]:
best_gradient.fit(X_train_scaled, y_train)

GradientBoostingClassifier(max_depth=4, max_features='log2', max_leaf_nodes=14,
                           n_estimators=269)

In [58]:
y_pred_gradient = best_gradient.predict(X_test_scaled)

In [137]:
gradient_precision = round(precision_score(y_test, y_pred_gradient),4)
gradient_recall = round(recall_score(y_test, y_pred_gradient), 4)
gradient_f1_score = round(f1_score(y_test, y_pred_gradient), 4)

print(f"Best Gradient Precision: {round(precision_score(y_test, y_pred_gradient),4)}")
print(f"Best Gradient Recall: {round(recall_score(y_test, y_pred_gradient), 4)}")
print(f"Best Gradient F1 Score: {round(f1_score(y_test, y_pred_gradient), 4)}")

Best Gradient Precision: 0.9744
Best Gradient Recall: 0.9871
Best Gradient F1 Score: 0.9807


#### AdaBoost

In [96]:
# start_time = time.time()

# adaboost_params = {"algorithm":["SAMME", "SAMME.R"],
#                   "n_estimators":randint(low=10, high=200)
#                   }

# scorers = {"precision_score": make_scorer(precision_score, average="macro"),
#            "recall_score": make_scorer(recall_score, average="macro"),
#            "f1_score": make_scorer(f1_score, average="macro")
#           }

# adaboost_search = RandomizedSearchCV(adaboost,
#                                      adaboost_params,
#                                      n_iter=10,
#                                      n_jobs=-1,
#                                      cv=10,
#                                      scoring=scorers,
#                                      refit=False,
#                                      random_state=42)

# adaboost_search.fit(X_train_scaled, y_train)

# print("--- %s seconds ---" % (time.time() - start_time))

--- 14.541921854019165 seconds ---


In [99]:
adaboost_results = pd.DataFrame(adaboost_search.cv_results_)

adaboost_results = adaboost_results[["params", "mean_test_precision_score", "rank_test_precision_score",
                                     "mean_test_recall_score", "rank_test_recall_score",
                                     "mean_test_f1_score", "rank_test_f1_score"]]

adaboost_results

Unnamed: 0,params,mean_test_precision_score,rank_test_precision_score,mean_test_recall_score,rank_test_recall_score,mean_test_f1_score,rank_test_f1_score
0,"{'algorithm': 'SAMME', 'n_estimators': 189}",0.936063,1,0.912258,2,0.923323,2
1,"{'algorithm': 'SAMME', 'n_estimators': 24}",0.906809,10,0.833152,10,0.863271,10
2,"{'algorithm': 'SAMME', 'n_estimators': 81}",0.930603,7,0.888706,8,0.907679,8
3,"{'algorithm': 'SAMME', 'n_estimators': 30}",0.911296,9,0.846614,9,0.873831,9
4,"{'algorithm': 'SAMME', 'n_estimators': 131}",0.933799,6,0.905646,4,0.918695,4
5,"{'algorithm': 'SAMME', 'n_estimators': 84}",0.928539,8,0.890884,7,0.907993,7
6,"{'algorithm': 'SAMME', 'n_estimators': 97}",0.933977,5,0.896619,6,0.913639,6
7,"{'algorithm': 'SAMME', 'n_estimators': 109}",0.934408,4,0.902094,5,0.916889,5
8,"{'algorithm': 'SAMME.R', 'n_estimators': 161}",0.934634,3,0.921834,1,0.927895,1
9,"{'algorithm': 'SAMME', 'n_estimators': 159}",0.935324,2,0.911031,3,0.9224,3


In [100]:
adaboost_results["params"][8]

{'algorithm': 'SAMME.R', 'n_estimators': 161}

In [101]:
start_time = time.time()

adaboost_params = {"algorithm":["SAMME.R"],
                  "n_estimators":[161]
                  }

scorers = {"precision_score": make_scorer(precision_score, average="macro"),
           "recall_score": make_scorer(recall_score, average="macro"),
           "f1_score": make_scorer(f1_score, average="macro")
          }

best_adaboost_search = GridSearchCV(adaboost,
                               adaboost_params,
                               n_jobs=-1,
                               cv=50,
                               scoring=scorers,
                               refit=False)

best_adaboost_search.fit(X_train_scaled, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

--- 12.782952070236206 seconds ---


In [102]:
best_adaboost = AdaBoostClassifier(algorithm="SAMME.R", n_estimators=161)

In [103]:
best_adaboost.fit(X_train_scaled, y_train)

AdaBoostClassifier(n_estimators=161)

In [104]:
y_pred_adaboost = best_adaboost.predict(X_test_scaled)

In [142]:
adaboost_precision = round(precision_score(y_test, y_pred_adaboost),4)
adaboost_recall = round(recall_score(y_test, y_pred_adaboost), 4)
adaboost_f1_score = round(f1_score(y_test, y_pred_adaboost), 4)

print(f"Best AdaBoost Precision: {round(precision_score(y_test, y_pred_adaboost),4)}")
print(f"Best AdaBoost Recall: {round(recall_score(y_test, y_pred_adaboost), 4)}")
print(f"Best AdaBoost F1 Score: {round(f1_score(y_test, y_pred_adaboost), 4)}")

Best AdaBoost Precision: 0.9691
Best AdaBoost Recall: 0.9794
Best AdaBoost F1 Score: 0.9742


#### RandomForest

In [120]:
# start_time = time.time()

# RF_params = {"criterion":["gini", "entropy"],
#              "max_features":["auto", "sqrt", "log2"],
#              "class_weight":["balanced", "balanced_subsample"],          
#              "n_estimators":randint(low=10, high=400),
#              "max_depth":randint(low=2, high=20),
#              "min_samples_split":randint(low=2, high=40)
#             }

# scorers = {"precision_score": make_scorer(precision_score, average="macro"),
#            "recall_score": make_scorer(recall_score, average="macro"),
#            "f1_score": make_scorer(f1_score, average="macro")
#           }

# RF_search = RandomizedSearchCV(RF,
#                                RF_params,
#                                n_iter=10,
#                                n_jobs=-1,
#                                cv=50,
#                                scoring=scorers,
#                                refit=False,
#                                random_state=42)

# RF_search.fit(X_train_scaled, y_train)

# print("--- %s seconds ---" % (time.time() - start_time))

--- 178.9842643737793 seconds ---


In [121]:
RF_results = pd.DataFrame(RF_search.cv_results_)

RF_results = RF_results[["params", "mean_test_precision_score", "rank_test_precision_score",
                         "mean_test_recall_score", "rank_test_recall_score",
                         "mean_test_f1_score", "rank_test_f1_score"]]

RF_results

Unnamed: 0,params,mean_test_precision_score,rank_test_precision_score,mean_test_recall_score,rank_test_recall_score,mean_test_f1_score,rank_test_f1_score
0,"{'class_weight': 'balanced', 'criterion': 'ent...",0.937418,2,0.922512,6,0.928718,2
1,"{'class_weight': 'balanced', 'criterion': 'gin...",0.912491,5,0.934996,1,0.922076,5
2,"{'class_weight': 'balanced_subsample', 'criter...",0.789057,10,0.884323,10,0.821278,10
3,"{'class_weight': 'balanced_subsample', 'criter...",0.869792,8,0.926401,4,0.893274,8
4,"{'class_weight': 'balanced_subsample', 'criter...",0.909635,6,0.932632,3,0.91965,7
5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.948699,1,0.907981,8,0.925835,3
6,"{'class_weight': 'balanced', 'criterion': 'gin...",0.90883,7,0.934426,2,0.919668,6
7,"{'class_weight': 'balanced', 'criterion': 'ent...",0.936011,3,0.925385,5,0.929491,1
8,"{'class_weight': 'balanced', 'criterion': 'ent...",0.935234,4,0.916906,7,0.924709,4
9,"{'class_weight': 'balanced_subsample', 'criter...",0.803499,9,0.892898,9,0.83512,9


In [128]:
RF_results["params"][6]

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'log2',
 'min_samples_split': 19,
 'n_estimators': 397}

In [129]:
start_time = time.time()

RF_params = {"class_weight":["balanced"],
             "criterion":["gini"],
             "max_features":["log2"],
             "max_depth":[10],
             "min_samples_split":[19],
             "n_estimators":[397]
            }

scorers = {"precision_score": make_scorer(precision_score, average="macro"),
           "recall_score": make_scorer(recall_score, average="macro"),
           "f1_score": make_scorer(f1_score, average="macro")
          }

best_RF_search = GridSearchCV(RF,
                              RF_params,
                              n_jobs=-1,
                              cv=50,
                              scoring=scorers,
                              refit=False)

best_RF_search.fit(X_train_scaled, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

--- 30.186686992645264 seconds ---


In [130]:
best_RF = RandomForestClassifier(class_weight="balanced", criterion="gini", max_features="log2",
                                 max_depth=10, min_samples_split=19, n_estimators=397)

In [131]:
best_RF.fit(X_train_scaled, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=10,
                       max_features='log2', min_samples_split=19,
                       n_estimators=397)

In [132]:
y_pred_RF = best_RF.predict(X_test_scaled)

In [140]:
RF_precision = round(precision_score(y_test, y_pred_RF),4)
RF_recall = round(recall_score(y_test, y_pred_RF), 4)
RF_f1_score = round(f1_score(y_test, y_pred_RF), 4)

print(f"Best RF Precision: {round(precision_score(y_test, y_pred_RF),4)}")
print(f"Best RF Recall: {round(recall_score(y_test, y_pred_RF), 4)}")
print(f"Best RF F1 Score: {round(f1_score(y_test, y_pred_RF), 4)}")

Best RF Precision: 0.9792
Best RF Recall: 0.9682
Best RF F1 Score: 0.9737


### Analyzing the results

In [None]:
best_models = ["GradientBoost", "AdaBoost", "RandomForest"]
best_precisions = [gradient_precision, adaboost_precision, RF_precision]
best_recalls = [gradient_recall, adaboost_recall, RF_recall]
best_f1s = [gradient_f1_score, adaboost_f1_score, RF_f1_score]

top_3_models_comparision = pd.DataFrame({"model":best_models,
                                        "precision":best_precisions,
                                        "recall":best_recalls,
                                        "f1_score":best_f1s})