In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, f1_score, precision_score, recall_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



In [2]:
df = pd.read_csv("./../clean_data.csv")

numerical_columns = df.select_dtypes(include=['float64']).columns

x = df[numerical_columns]
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# Balancing data
ros = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_oversampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_undersampled).value_counts())

X_oversampled


Class distribution after oversampling:
No     3273
Yes    3273
Name: churn, dtype: int64

Class distribution after undersampling:
No     519
Yes    519
Name: churn, dtype: int64


Unnamed: 0,accountlength,numbervmailmessages,totaldayminutes,totaldaycalls,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,0.384259,0.000000,0.537786,0.396552,0.658120,0.890114,0.657114,0.316239,0.3,0.270455,0.0
1,0.459605,0.291667,0.541863,0.379310,0.410256,0.119732,0.597468,0.666667,0.2,0.515909,0.4
2,0.532407,0.479167,0.732204,0.310345,0.675214,0.287909,0.758747,0.529915,0.2,0.595455,0.4
3,0.680556,0.000000,0.661963,0.431034,0.470085,0.444269,0.443519,0.316239,0.3,0.250000,0.4
4,0.277778,0.000000,0.674506,0.724138,0.290598,0.494683,0.403199,0.333333,0.1,0.368182,0.0
...,...,...,...,...,...,...,...,...,...,...,...
6541,0.375000,0.000000,0.973032,0.267241,0.666667,0.520284,0.723092,0.512821,0.5,0.520455,0.0
6542,0.662037,0.000000,0.132016,0.646552,0.367521,0.080740,0.409530,0.487179,0.6,0.372727,0.8
6543,0.509259,0.000000,0.438696,0.043103,0.094017,0.724301,0.432189,0.521368,0.1,0.406818,0.4
6544,0.680556,0.000000,0.671057,0.336207,0.247863,0.450965,0.240586,0.470085,0.1,0.459091,0.0


In [52]:
# Random forest

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_oversampled, y_oversampled)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nRandom Forest - Recall:", recall_score(y_test, y_pred_rf, pos_label='Yes'))
print("Random Forest - Precision:", precision_score(y_test, y_pred_rf, pos_label='Yes'))
print("Random Forest - F1:", f1_score(y_test, y_pred_rf, pos_label='Yes'))


Random Forest - Accuracy: 0.9156118143459916

Random Forest - Confusion Matrix:
 [[803  18]
 [ 62  65]]

Random Forest - Recall: 0.5118110236220472
Random Forest - Precision: 0.7831325301204819
Random Forest - F1: 0.6190476190476191


In [48]:
# GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_oversampled, y_oversampled)
y_pred_gb = gb_model.predict(X_test)

print("GradientBoostingClassifier - Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nGradientBoostingClassifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("\nGradientBoostingClassifier - Recall:", recall_score(y_test, y_pred_gb, pos_label='Yes'))
print("GradientBoostingClassifier - Precision:", precision_score(y_test, y_pred_gb, pos_label='Yes'))
print("GradientBoostingClassifier - F1:", f1_score(y_test, y_pred_gb, pos_label='Yes'))


GradientBoostingClassifier - Accuracy: 0.8544303797468354

GradientBoostingClassifier - Confusion Matrix:
 [[730  91]
 [ 47  80]]

GradientBoostingClassifier - Recall: 0.6299212598425197
GradientBoostingClassifier - Precision: 0.4678362573099415
GradientBoostingClassifier - F1: 0.5369127516778524


In [49]:
# HistGradientBoostingClassifier

hgbc_model = HistGradientBoostingClassifier(random_state=42)
hgbc_model.fit(X_oversampled, y_oversampled)
y_pred_hgbc = hgbc_model.predict(X_test)

print("HistGradientBoostingClassifier - Accuracy:", accuracy_score(y_test, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Classification Report:\n", classification_report(y_test, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Recall:", recall_score(y_test, y_pred_hgbc, pos_label='Yes'))
print("HistGradientBoostingClassifier - Precision:", precision_score(y_test, y_pred_hgbc, pos_label='Yes'))
print("HistGradientBoostingClassifier - F1:", f1_score(y_test, y_pred_hgbc, pos_label='Yes'))


HistGradientBoostingClassifier - Accuracy: 0.8966244725738397

HistGradientBoostingClassifier - Confusion Matrix:
 [[781  40]
 [ 58  69]]

HistGradientBoostingClassifier - Classification Report:
               precision    recall  f1-score   support

          No       0.93      0.95      0.94       821
         Yes       0.63      0.54      0.58       127

    accuracy                           0.90       948
   macro avg       0.78      0.75      0.76       948
weighted avg       0.89      0.90      0.89       948


HistGradientBoostingClassifier - Recall: 0.5433070866141733
HistGradientBoostingClassifier - Precision: 0.6330275229357798
HistGradientBoostingClassifier - F1: 0.5847457627118645


In [19]:
# Cross-Validation

cv_scores_rf = cross_val_score(rf_model, X_oversampled, y_oversampled, cv=3)
cv_scores_gb = cross_val_score(gb_model, X_oversampled, y_oversampled, cv=3)

print("Cross-Validation Scores - Random Forest:", cv_scores_rf)
print("Random Forest Mean Accuracy:", np.mean(cv_scores_rf))
print("Cross-Validation Scores - Gradient Boosting:", cv_scores_gb)
print("Gradient Boosting Mean Accuracy:", np.mean(cv_scores_gb))

Cross-Validation Scores - Random Forest: [0.98350137 0.98945921 0.98716774]
Random Forest Mean Accuracy: 0.9867094408799266
Cross-Validation Scores - Gradient Boosting: [0.85059578 0.86480293 0.85701192]
Gradient Boosting Mean Accuracy: 0.8574702108157654


In [38]:
# Hyperparameter spaces for different classifiers
param_grid_bagging = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
    'base_estimator__criterion': ['gini', 'entropy'],
    'base_estimator__max_depth': [None, 10, 20],
}

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None, 1, 2, 3],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'algorithm': ['SAMME', 'SAMME.R'],
}

param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'subsample': [0.8, 1.0],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_oversampled)

In [31]:
# BaggingClassifier
bagging_model = BaggingClassifier()
grid_bagging = GridSearchCV(estimator=bagging_model, param_grid=param_grid_bagging, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_bagging.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_bagging = grid_bagging.best_estimator_.predict(X_test_imputed)
acc_bagging = grid_bagging.best_estimator_.score(X_test_imputed, y_test)


f1_scorer = make_scorer(f1_score, pos_label='Yes')
grid_bagging_f1 = GridSearchCV(estimator=bagging_model, param_grid=param_grid_bagging, cv=5, n_jobs=-1, verbose=1, scoring=f1_scorer)
grid_bagging_f1.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_bagging_f1 = grid_bagging_f1.best_estimator_.predict(X_test_imputed)

# Calculate F1 score for the best model
f1_bagging_best = f1_score(y_test, y_pred_bagging_f1, pos_label='Yes', average='binary')


print("\nBest Hyperparameters for Bagging:", grid_bagging.best_params_)
print("Best Bagging Accuracy:", acc_bagging)
print("Best Hyperparameters for Bagging (F1):", grid_bagging_f1.best_params_)
print("Best Bagging F1 Score:", f1_bagging_best)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Hyperparameters for Bagging: {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 200}
Best Bagging Accuracy: 0.8839662447257384
Best Hyperparameters for Bagging (F1): {'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100}
Best Bagging F1 Score: 0.25675675675675674


In [40]:
# RandomForestClassifier
rf_model = RandomForestClassifier()
grid_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_rf.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_rf = grid_rf.best_estimator_.predict(X_test_imputed)
acc_rf = grid_rf.best_estimator_.score(X_test_imputed, y_test)

grid_rf_f1 = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=1, scoring=f1_scorer)
grid_rf_f1.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_rf_f1 = grid_rf_f1.best_estimator_.predict(X_test_imputed)

# Calculate F1 score for the best model
f1_rf_best = f1_score(y_test, y_pred_rf_f1, pos_label='Yes', average='binary')

print("\nBest Hyperparameters for RandomForest:", grid_rf.best_params_)
print("Best RandomForest Accuracy:", acc_rf)
print("Best Hyperparameters for RandomForest (F1):", grid_rf_f1.best_params_)
print("Best RadnomForest F1 Score:", f1_rf_best)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 54 candidates, totalling 270 fits

Best Hyperparameters for RandomForest: {'max_depth': None, 'max_features': 1, 'n_estimators': 200}
Best RandomForest Accuracy: 0.9008438818565401
Best Hyperparameters for RandomForest (F1): {'max_depth': None, 'max_features': 1, 'n_estimators': 200}
Best RadnomForest F1 Score: 0.46327683615819204


In [39]:
# AdaBoostClassifier
ada_model = AdaBoostClassifier()
grid_ada = GridSearchCV(estimator=ada_model, param_grid=param_grid_ada, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_ada.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_ada = grid_ada.best_estimator_.predict(X_test_imputed)
acc_ada = grid_ada.best_estimator_.score(X_test_imputed, y_test)

grid_ada_f1 = GridSearchCV(estimator=ada_model, param_grid=param_grid_ada, cv=5, n_jobs=-1, verbose=1, scoring=f1_scorer)
grid_ada_f1.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_ada_f1 = grid_ada_f1.best_estimator_.predict(X_test_imputed)

# Calculate F1 score for the best model
f1_ada_best = f1_score(y_test, y_pred_ada_f1, pos_label='Yes', average='binary')


print("Best Hyperparameters for AdaBoost:", grid_ada.best_params_)
print("Best AdaBoost Accuracy:", acc_ada)
print("Best Hyperparameters for AdaBoost (F1):", grid_ada_f1.best_params_)
print("Best AdaBoost F1 Score:", f1_ada_best)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Hyperparameters for AdaBoost: {'n_estimators': 200}
Best AdaBoost Accuracy: 0.7943037974683544
Best Hyperparameters for AdaBoost (F1): {'n_estimators': 200}
Best AdaBoost F1 Score: 0.44126074498567336


In [41]:
# GradientBoostingClassifier
gb_model = GradientBoostingClassifier()
grid_gb = GridSearchCV(estimator=gb_model, param_grid=param_grid_gb, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
grid_gb.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_gb= grid_gb.best_estimator_.predict(X_test_imputed)
acc_gb = grid_gb.best_estimator_.score(X_test_imputed, y_test)

X_test_imputed = imputer.transform(X_test)
y_pred_gb = grid_gb.best_estimator_.predict(X_test_imputed)
acc_gb = grid_gb.best_estimator_.score(X_test_imputed, y_test)

f1_scorer = make_scorer(f1_score, pos_label='Yes')
grid_gb_f1 = GridSearchCV(estimator=gb_model, param_grid=param_grid_gb, cv=5, n_jobs=-1, verbose=1, scoring=f1_scorer)
grid_gb_f1.fit(X_imputed, y_oversampled)

X_test_imputed = imputer.transform(X_test)
y_pred_gb_f1 = grid_gb_f1.best_estimator_.predict(X_test_imputed)

# Calculate F1 score for the best model
f1_gb_best = f1_score(y_test, y_pred_gb_f1, pos_label='Yes', average='binary')

print("Best Hyperparameters for GradientBoosting:", grid_gb.best_params_)
print("Best GradientBoosting Accuracy:", acc_gb)
print("Best Hyperparameters for GradientBoosting (F1):", grid_gb_f1.best_params_)
print("Best GradientBoosting F1 Score:", f1_gb_best)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Hyperparameters for GradientBoosting: {'learning_rate': 0.2, 'n_estimators': 200}
Best GradientBoosting Accuracy: 0.8744725738396625
Best Hyperparameters for GradientBoosting (F1): {'learning_rate': 0.2, 'n_estimators': 200}
Best GradientBoosting F1 Score: 0.5703971119133574


In [46]:
# AdaBoostingClassifier

ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_oversampled, y_oversampled)
y_pred_ada = ada_model.predict(X_test)

print("AdaBoostingClassifier - Accuracy:", accuracy_score(y_test, y_pred_ada))
print("\nAdaBoostingClassifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_ada))
print("\nAdaBoostingClassifier - Recall:", recall_score(y_test, y_pred_ada, pos_label='Yes'))
print("AdaBoostingClassifier - Precision:", precision_score(y_test, y_pred_ada, pos_label='Yes'))
print("AdaBoostingClassifier - F1:", f1_score(y_test, y_pred_ada, pos_label='Yes'))


# BaggingClassifier
bagging_model = BaggingClassifier(random_state=42)
bagging_model.fit(X_oversampled, y_oversampled)
y_pred_bagging = bagging_model.predict(X_test)

print("BaggingClassifier - Accuracy:", accuracy_score(y_test, y_pred_bagging))
print("\nBaggingClassifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bagging))
print("\nBaggingClassifier - Recall:", recall_score(y_test, y_pred_bagging, pos_label='Yes'))
print("BaggingClassifier - Precision:", precision_score(y_test, y_pred_bagging, pos_label='Yes'))
print("BaggingClassifier - F1:", f1_score(y_test, y_pred_bagging, pos_label='Yes'))

AdaBoostingClassifier - Accuracy: 0.7943037974683544

AdaBoostingClassifier - Confusion Matrix:
 [[667 154]
 [ 41  86]]

AdaBoostingClassifier - Recall: 0.6771653543307087
AdaBoostingClassifier - Precision: 0.35833333333333334
AdaBoostingClassifier - F1: 0.4686648501362398
BaggingClassifier - Accuracy: 0.9050632911392406

BaggingClassifier - Confusion Matrix:
 [[795  26]
 [ 64  63]]

BaggingClassifier - Recall: 0.49606299212598426
BaggingClassifier - Precision: 0.7078651685393258
BaggingClassifier - F1: 0.5833333333333334
