In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer, f1_score, precision_score, recall_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np



In [2]:
df = pd.read_csv("./../clean_data.csv")

numerical_columns = df.select_dtypes(include=['float64']).columns

x = df[numerical_columns]
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# Balancing data
ros = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_oversampled).value_counts())

# Apply RandomUnderSampler to address class imbalance
rus = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

# Display class distribution after undersampling
print("\nClass distribution after undersampling:")
print(pd.Series(y_undersampled).value_counts())

X_oversampled


Class distribution after oversampling:
No     3273
Yes    3273
Name: churn, dtype: int64

Class distribution after undersampling:
No     519
Yes    519
Name: churn, dtype: int64


Unnamed: 0,accountlength,numbervmailmessages,totaldayminutes,totaldaycalls,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,0.384259,0.000000,0.537786,0.396552,0.658120,0.890114,0.657114,0.316239,0.3,0.270455,0.0
1,0.459605,0.291667,0.541863,0.379310,0.410256,0.119732,0.597468,0.666667,0.2,0.515909,0.4
2,0.532407,0.479167,0.732204,0.310345,0.675214,0.287909,0.758747,0.529915,0.2,0.595455,0.4
3,0.680556,0.000000,0.661963,0.431034,0.470085,0.444269,0.443519,0.316239,0.3,0.250000,0.4
4,0.277778,0.000000,0.674506,0.724138,0.290598,0.494683,0.403199,0.333333,0.1,0.368182,0.0
...,...,...,...,...,...,...,...,...,...,...,...
6541,0.375000,0.000000,0.973032,0.267241,0.666667,0.520284,0.723092,0.512821,0.5,0.520455,0.0
6542,0.662037,0.000000,0.132016,0.646552,0.367521,0.080740,0.409530,0.487179,0.6,0.372727,0.8
6543,0.509259,0.000000,0.438696,0.043103,0.094017,0.724301,0.432189,0.521368,0.1,0.406818,0.4
6544,0.680556,0.000000,0.671057,0.336207,0.247863,0.450965,0.240586,0.470085,0.1,0.459091,0.0


In [14]:
# Random forest

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_oversampled, y_oversampled)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nRandom Forest - Recall:", accuracy_score(y_test, y_pred_rf))
print("Random Forest - Precision:", precision_score(y_test, y_pred_rf, pos_label='Yes'))
print("Random Forest - F1:", f1_score(y_test, y_pred_rf, pos_label='Yes'))


Random Forest - Accuracy: 0.9156118143459916

Random Forest - Confusion Matrix:
 [[803  18]
 [ 62  65]]

Random Forest - Recall: 0.9156118143459916
Random Forest - Precision: 0.7831325301204819
Random Forest - F1: 0.6190476190476191


In [17]:
# GradientBoostingClassifier

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_oversampled, y_oversampled)
y_pred_gb = gb_model.predict(X_test)

print("GradientBoostingClassifier - Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nGradientBoostingClassifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("\nGradientBoostingClassifier - Recall:", accuracy_score(y_test, y_pred_gb))
print("GradientBoostingClassifier - Precision:", precision_score(y_test, y_pred_gb, pos_label='Yes'))
print("GradientBoostingClassifier - F1:", f1_score(y_test, y_pred_gb, pos_label='Yes'))


GradientBoostingClassifier - Accuracy: 0.8544303797468354

GradientBoostingClassifier - Confusion Matrix:
 [[730  91]
 [ 47  80]]

GradientBoostingClassifier - Recall: 0.8544303797468354
GradientBoostingClassifier - Precision: 0.4678362573099415
GradientBoostingClassifier - F1: 0.5369127516778524


In [18]:
# HistGradientBoostingClassifier

hgbc_model = HistGradientBoostingClassifier(random_state=42)
hgbc_model.fit(X_oversampled, y_oversampled)
y_pred_hgbc = hgbc_model.predict(X_test)

print("HistGradientBoostingClassifier - Accuracy:", accuracy_score(y_test, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Classification Report:\n", classification_report(y_test, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Recall:", accuracy_score(y_test, y_pred_hgbc))
print("HistGradientBoostingClassifier - Precision:", precision_score(y_test, y_pred_hgbc, pos_label='Yes'))
print("HistGradientBoostingClassifier - F1:", f1_score(y_test, y_pred_hgbc, pos_label='Yes'))


HistGradientBoostingClassifier - Accuracy: 0.8966244725738397

HistGradientBoostingClassifier - Confusion Matrix:
 [[781  40]
 [ 58  69]]

HistGradientBoostingClassifier - Classification Report:
               precision    recall  f1-score   support

          No       0.93      0.95      0.94       821
         Yes       0.63      0.54      0.58       127

    accuracy                           0.90       948
   macro avg       0.78      0.75      0.76       948
weighted avg       0.89      0.90      0.89       948


HistGradientBoostingClassifier - Recall: 0.8966244725738397
HistGradientBoostingClassifier - Precision: 0.6330275229357798
HistGradientBoostingClassifier - F1: 0.5847457627118645


In [19]:
# Cross-Validation

cv_scores_rf = cross_val_score(rf_model, X_oversampled, y_oversampled, cv=3)
cv_scores_gb = cross_val_score(gb_model, X_oversampled, y_oversampled, cv=3)

print("Cross-Validation Scores - Random Forest:", cv_scores_rf)
print("Random Forest Mean Accuracy:", np.mean(cv_scores_rf))
print("Cross-Validation Scores - Gradient Boosting:", cv_scores_gb)
print("Gradient Boosting Mean Accuracy:", np.mean(cv_scores_gb))

Cross-Validation Scores - Random Forest: [0.98350137 0.98945921 0.98716774]
Random Forest Mean Accuracy: 0.9867094408799266
Cross-Validation Scores - Gradient Boosting: [0.85059578 0.86480293 0.85701192]
Gradient Boosting Mean Accuracy: 0.8574702108157654


In [20]:
# Hyperparameter spaces for different classifiers
param_grid_bagging = {
    'baggingclassifier__n_estimators': [50, 100, 200],
}

param_grid_rf = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20],  
    'randomforestclassifier__max_features': ['sqrt', 'log2', None, 1, 2, 3],  
}

param_grid_ada = {
    'adaboostclassifier__n_estimators': [50, 100, 200],
}

param_grid_gb = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2],
}


imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train)

In [41]:
# BaggingClassifier
bagging_model = BaggingClassifier()
bagging_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), bagging_model)
grid_search_bagging = GridSearchCV(bagging_model_with_imputer, param_grid_bagging, cv=5)
grid_search_bagging.fit(X_imputed, y_train_encoded)


X_test_imputed = imputer.transform(X_test)
y_pred_bagging = grid_search_bagging.best_estimator_.predict(X_test_imputed)
acc_bagging = grid_search_bagging.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for Bagging:", grid_search_bagging.best_params_)
print("Best Bagging Accuracy:", acc_bagging)

Best Hyperparameters for Bagging: {'baggingclassifier__n_estimators': 200}
Best Bagging Accuracy: 0.8551236749116607


In [42]:
# RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), rf_model)
grid_search_rf = GridSearchCV(rf_model_with_imputer, param_grid_rf, cv=5)
grid_search_rf.fit(X_imputed, y_train_encoded)

X_test_imputed = imputer.transform(X_test)
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test_imputed)
acc_rf = grid_search_rf.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for RandomForest:", grid_search_rf.best_params_)
print("Best RandomForest Accuracy:", acc_rf)

Best Hyperparameters for RandomForest: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_features': None, 'randomforestclassifier__n_estimators': 200}
Best RandomForest Accuracy: 0.8515901060070671


In [43]:
# AdaBoostClassifier
ada_model = AdaBoostClassifier()
ada_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), ada_model)
grid_search_ada = GridSearchCV(ada_model_with_imputer, param_grid_ada, cv=5)
grid_search_ada.fit(X_imputed, y_train_encoded)

X_test_imputed = imputer.transform(X_test)
y_pred_ada = grid_search_ada.best_estimator_.predict(X_test_imputed)
acc_ada = grid_search_ada.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for AdaBoost:", grid_search_ada.best_params_)
print("Best AdaBoost Accuracy:", acc_ada)

Best Hyperparameters for AdaBoost: {'adaboostclassifier__n_estimators': 50}
Best AdaBoost Accuracy: 0.7950530035335689


In [44]:
# GradientBoostingClassifier
gb_model = GradientBoostingClassifier()
gb_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), gb_model)
grid_search_gb = GridSearchCV(gb_model_with_imputer, param_grid_gb, cv=5)
grid_search_gb.fit(X_imputed, y_train_encoded)

X_test_imputed = imputer.transform(X_test)
y_pred_gb= grid_search_gb.best_estimator_.predict(X_test_imputed)
acc_gb = grid_search_gb.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for GradientBoosting:", grid_search_gb.best_params_)
print("Best GradientBoosting Accuracy:", acc_gb)


Best Hyperparameters for GradientBoosting: {'gradientboostingclassifier__learning_rate': 0.1, 'gradientboostingclassifier__n_estimators': 100}
Best GradientBoosting Accuracy: 0.8798586572438163
