In [33]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [34]:
# data pre-processing
df = pd.read_csv("./../processed_data.csv")

X = df.drop('churn', axis=1)
y = df['churn']

# ros = RandomOverSampler(random_state=42)
# X_balanced, y_balanced = ros.fit_resample(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.replace({'yes': 1, 'no': 0}, inplace=True)
X_test.replace({'yes': 1, 'no': 0}, inplace=True)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [35]:
# Random forest

rf_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), RandomForestClassifier(random_state=42))
rf_model_with_imputer.fit(X_train, y_train_encoded)
y_pred_rf_imputed = rf_model_with_imputer.predict(X_test)

print("Random Forest with Imputer - Accuracy:", accuracy_score(y_test_encoded, y_pred_rf_imputed))
print("\nRandom Forest with Imputer - Confusion Matrix:\n", confusion_matrix(y_test_encoded, y_pred_rf_imputed))
print("\nRandom Forest with Imputer - Classification Report:\n", classification_report(y_test_encoded, y_pred_rf_imputed))


Random Forest with Imputer - Accuracy: 0.872791519434629

Random Forest with Imputer - Confusion Matrix:
 [[126  14]
 [ 22 121]]

Random Forest with Imputer - Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.90      0.88       140
           1       0.90      0.85      0.87       143

    accuracy                           0.87       283
   macro avg       0.87      0.87      0.87       283
weighted avg       0.87      0.87      0.87       283



In [36]:
# GradientBoostingClassifier

gb_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), GradientBoostingClassifier(random_state=42))
gb_model_with_imputer.fit(X_train, y_train_encoded)
y_pred_gb_imputed = gb_model_with_imputer.predict(X_test)

print("GradientBoostingClassifier with Imputer - Accuracy:", accuracy_score(y_test_encoded, y_pred_gb_imputed))
print("\nGradientBoostingClassifier with Imputer - Confusion Matrix:\n", confusion_matrix(y_test_encoded, y_pred_gb_imputed))
print("\nGradientBoostingClassifier with Imputer - Classification Report:\n", classification_report(y_test_encoded, y_pred_gb_imputed))


GradientBoostingClassifier with Imputer - Accuracy: 0.8798586572438163

GradientBoostingClassifier with Imputer - Confusion Matrix:
 [[128  12]
 [ 22 121]]

GradientBoostingClassifier with Imputer - Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88       140
           1       0.91      0.85      0.88       143

    accuracy                           0.88       283
   macro avg       0.88      0.88      0.88       283
weighted avg       0.88      0.88      0.88       283



In [37]:
# HistGradientBoostingClassifier

hgbc_model = HistGradientBoostingClassifier(random_state=42)
hgbc_model.fit(X_train, y_train_encoded)
y_pred_hgbc = hgbc_model.predict(X_test)

print("HistGradientBoostingClassifier - Accuracy:", accuracy_score(y_test_encoded, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Confusion Matrix:\n", confusion_matrix(y_test_encoded, y_pred_hgbc))
print("\nHistGradientBoostingClassifier - Classification Report:\n", classification_report(y_test_encoded, y_pred_hgbc))


HistGradientBoostingClassifier - Accuracy: 0.8621908127208481

HistGradientBoostingClassifier - Confusion Matrix:
 [[125  15]
 [ 24 119]]

HistGradientBoostingClassifier - Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.87       140
           1       0.89      0.83      0.86       143

    accuracy                           0.86       283
   macro avg       0.86      0.86      0.86       283
weighted avg       0.86      0.86      0.86       283



In [38]:
# Cross-Validation

cv_scores_rf = cross_val_score(rf_model_with_imputer, X_train, y_train_encoded, cv=3)
cv_scores_gb = cross_val_score(gb_model_with_imputer, X_train, y_train_encoded, cv=3)

print("Cross-Validation Scores - Random Forest:", cv_scores_rf)
print("Random Forest Mean Accuracy:", np.mean(cv_scores_rf))
print("Cross-Validation Scores - Gradient Boosting:", cv_scores_gb)
print("Gradient Boosting Mean Accuracy:", np.mean(cv_scores_gb))

Cross-Validation Scores - Random Forest: [0.83819629 0.8806366  0.90185676]
Random Forest Mean Accuracy: 0.8735632183908045
Cross-Validation Scores - Gradient Boosting: [0.86472149 0.87533156 0.88859416]
Gradient Boosting Mean Accuracy: 0.8762157382847038


In [39]:
# # Feature importance
# feature_importance = rf_model.feature_importances_

# # Visualizing feature importance
# plt.figure(figsize=(10, 6))
# plt.barh(X.columns, feature_importance)
# plt.title("Random Forest - Feature Importance")
# plt.xlabel("Importance")
# plt.show()

In [40]:
# Hyperparameter spaces for different classifiers
param_grid_bagging = {
    'baggingclassifier__n_estimators': [50, 100, 200],
}

param_grid_rf = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20],  
    'randomforestclassifier__max_features': ['sqrt', 'log2', None, 1, 2, 3],  
}

param_grid_ada = {
    'adaboostclassifier__n_estimators': [50, 100, 200],
}

param_grid_gb = {
    'gradientboostingclassifier__n_estimators': [50, 100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.2],
}


imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_train)

In [41]:
# BaggingClassifier
bagging_model = BaggingClassifier()
bagging_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), bagging_model)
grid_search_bagging = GridSearchCV(bagging_model_with_imputer, param_grid_bagging, cv=5)
grid_search_bagging.fit(X_imputed, y_train_encoded)


X_test_imputed = imputer.transform(X_test)
y_pred_bagging = grid_search_bagging.best_estimator_.predict(X_test_imputed)
acc_bagging = grid_search_bagging.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for Bagging:", grid_search_bagging.best_params_)
print("Best Bagging Accuracy:", acc_bagging)

Best Hyperparameters for Bagging: {'baggingclassifier__n_estimators': 200}
Best Bagging Accuracy: 0.8551236749116607


In [42]:
# RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), rf_model)
grid_search_rf = GridSearchCV(rf_model_with_imputer, param_grid_rf, cv=5)
grid_search_rf.fit(X_imputed, y_train_encoded)

X_test_imputed = imputer.transform(X_test)
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test_imputed)
acc_rf = grid_search_rf.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for RandomForest:", grid_search_rf.best_params_)
print("Best RandomForest Accuracy:", acc_rf)

Best Hyperparameters for RandomForest: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_features': None, 'randomforestclassifier__n_estimators': 200}
Best RandomForest Accuracy: 0.8515901060070671


In [43]:
# AdaBoostClassifier
ada_model = AdaBoostClassifier()
ada_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), ada_model)
grid_search_ada = GridSearchCV(ada_model_with_imputer, param_grid_ada, cv=5)
grid_search_ada.fit(X_imputed, y_train_encoded)

X_test_imputed = imputer.transform(X_test)
y_pred_ada = grid_search_ada.best_estimator_.predict(X_test_imputed)
acc_ada = grid_search_ada.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for AdaBoost:", grid_search_ada.best_params_)
print("Best AdaBoost Accuracy:", acc_ada)

Best Hyperparameters for AdaBoost: {'adaboostclassifier__n_estimators': 50}
Best AdaBoost Accuracy: 0.7950530035335689


In [44]:
# GradientBoostingClassifier
gb_model = GradientBoostingClassifier()
gb_model_with_imputer = make_pipeline(SimpleImputer(strategy='mean'), gb_model)
grid_search_gb = GridSearchCV(gb_model_with_imputer, param_grid_gb, cv=5)
grid_search_gb.fit(X_imputed, y_train_encoded)

X_test_imputed = imputer.transform(X_test)
y_pred_gb= grid_search_gb.best_estimator_.predict(X_test_imputed)
acc_gb = grid_search_gb.best_estimator_.score(X_test_imputed, y_test_encoded)

print("Best Hyperparameters for GradientBoosting:", grid_search_gb.best_params_)
print("Best GradientBoosting Accuracy:", acc_gb)


Best Hyperparameters for GradientBoosting: {'gradientboostingclassifier__learning_rate': 0.1, 'gradientboostingclassifier__n_estimators': 100}
Best GradientBoosting Accuracy: 0.8798586572438163
