In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
df = pd.read_csv('tel_churn.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [3]:
df = df.drop('Unnamed: 0',axis=1)

In [4]:
x= df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [5]:
y = df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [6]:
#Divide train and test data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

Handling imbalanced data with SMOTE-ENN

In [7]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [8]:
x_train,x_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size = 0.2)

In [9]:
#Implementing random forest classifier
from sklearn.ensemble import RandomForestClassifier

In [10]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [11]:
model_rf_smote.fit(x_train,y_train)

In [12]:
y_predict = model_rf_smote.predict(x_test)
model_score = model_rf_smote.score(x_test, y_test)

In [13]:
print(model_score)
print(metrics.classification_report(y_test, y_predict))

0.9390862944162437
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       548
           1       0.92      0.97      0.94       634

    accuracy                           0.94      1182
   macro avg       0.94      0.94      0.94      1182
weighted avg       0.94      0.94      0.94      1182



In [60]:
#implementing xgboost
import xgboost as xgb
from sklearn.metrics import accuracy_score
xgb_1 = xgb.XGBClassifier()
xgb_1.fit(x_train, y_train)

In [61]:
pred_xgb = xgb_1.predict(x_test)

In [62]:
acc = accuracy_score(y_test,pred_xgb)
print ("Accuracy with xgb :", acc)

Accuracy with xgb : 0.9602368866328257


# Hyperparameter tuning

In [63]:
from sklearn.model_selection import GridSearchCV

In [72]:
param_grid = {
    'max_depth':[5],
    'learning_rate':[0,0.01,0.05,0.1],
    'gamma':[1,5,10],
    'scale_pos_weight':[2,5,10,20],
    'subsample':[1],
    'colsample_bytree':[1]
}

xgb_2= xgb.XGBClassifier(objective='binary:logistic')
grid_cv = GridSearchCV(xgb_2, param_grid, n_jobs=-1,cv=3, scoring='roc_auc')
grid_cv.fit(x_train,y_train)


In [71]:
print('Best score:',grid_cv.best_score_)
print('Best Params:',grid_cv.best_params_)

Best score: 0.9901602735757812
Best Params: {'colsample_bytree': 1, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 5, 'scale_pos_weight': 2, 'subsample': 1}


In [74]:
final_classifier = xgb.XGBClassifier(
    **grid_cv.best_params_, objective = 'binary:logistic'
)
final_model = final_classifier.fit(x_train,y_train)

In [77]:
pred = final_model.predict(x_test)
acc = accuracy_score(y_test,pred)
print('accuracy after hyperparameter tuning:', acc)

accuracy after hyperparameter tuning: 0.9619289340101523




*   The xgboost classifier is able to achieve a high accuracy of 96.19 % thus it the final model for classifiction.

