Importing Libraries

In [99]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN 

In [100]:
df = pd.read_csv("/content/tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [101]:
df = df.drop('Unnamed: 0',axis = 1)

In [102]:
x = df.drop('Churn',axis = 1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [103]:
y = df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

Train Test Split

In [104]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size= 0.2)

Decision Tree Classifier

In [105]:
model_dt = DecisionTreeClassifier(criterion="gini",random_state=100,max_depth=6,min_samples_leaf=8)

In [106]:
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [107]:
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [108]:
model_dt.score(x_test,y_test)


0.7725657427149965

In [109]:
print(classification_report(y_test,y_pred,labels = [0,1]))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84      1015
           1       0.59      0.58      0.59       392

    accuracy                           0.77      1407
   macro avg       0.72      0.71      0.71      1407
weighted avg       0.77      0.77      0.77      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN

In [110]:
sm = SMOTEENN()
X_resampled,y_resampled = sm.fit_resample(x,y)

In [111]:
xr_train,xr_test,yr_train,yr_test = train_test_split(X_resampled,y_resampled,test_size=0.2)

In [112]:
model_dt_smote = DecisionTreeClassifier(criterion="gini",random_state=100,max_depth=6,min_samples_leaf=8)

In [113]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test,yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test,yr_predict))

0.9368061485909479
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       549
           1       0.92      0.96      0.94       622

    accuracy                           0.94      1171
   macro avg       0.94      0.94      0.94      1171
weighted avg       0.94      0.94      0.94      1171



In [114]:
print(metrics.confusion_matrix(yr_test,yr_predict))

[[499  50]
 [ 24 598]]


Now we can see quite better results, i.e. Accuracy: 93 %, and a very good recall, precision & f1 score for minority class.

Random Forest Classifier

In [115]:
from sklearn.ensemble import RandomForestClassifier


In [116]:
model_rf = RandomForestClassifier(n_estimators=100,criterion='gini',random_state=100,min_samples_leaf=8,max_depth=6)

In [117]:
model_rf.fit(x_train,y_train)


RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [118]:
y_pred = model_rf.predict(x_test)

In [119]:
model_rf.score(x_test,y_test)

0.7896233120113717

In [120]:
print(classification_report(y_test,y_pred,labels = [0,1]))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      1015
           1       0.68      0.45      0.55       392

    accuracy                           0.79      1407
   macro avg       0.75      0.69      0.70      1407
weighted avg       0.78      0.79      0.77      1407



As accuracy is low , We again used SMOTEENN

In [121]:
sm = SMOTEENN()
x_resampled1,y_resampled1 = sm.fit_resample(x,y)

In [122]:
xr_train1,xr_test1,yr_train1,yr_test1 = train_test_split(x_resampled1,y_resampled1,test_size= 0.2)

In [123]:
model_rf_smote = RandomForestClassifier(n_estimators=100,criterion="gini",random_state=100,max_depth=6,min_samples_leaf=8)

In [124]:
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [125]:
yr_predict1 = model_rf_smote.predict(xr_test1)


In [126]:
model_score_r1 = model_rf_smote.score(xr_test1,yr_test1)

In [127]:
model_score_r1

0.9525862068965517

In [128]:
print(metrics.classification_report(yr_test1,yr_predict1))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       524
           1       0.94      0.97      0.96       636

    accuracy                           0.95      1160
   macro avg       0.95      0.95      0.95      1160
weighted avg       0.95      0.95      0.95      1160



In [129]:
print(metrics.confusion_matrix(yr_test1,yr_predict1))

[[486  38]
 [ 17 619]]


With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.


In [130]:
#from sklearn.decomposition import PCA
#pca = PCA()
#xr_train_pca = pca.fit_transform(xr_train1)
#xr_test_pca = pca.transform(xr_test1)
#explained_variance = pca.explained_variance_ratio_


In [131]:
#model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8) 

In [132]:
#model.fit(xr_train_pca,yr_train1)

In [133]:
#yr_predict_pca = model.predict(xr_test_pca)

In [134]:
#model_score_r_pca = model.score(xr_test_pca,yr_test1)

In [135]:
#print(model_score_r_pca)

In [136]:
#print(metrics.classification_report(yr_test1,yr_predict_pca))

In [137]:
#print(metrics.confusion_matrix(yr_test1,yr_predict_pca))

Pickling the model

In [138]:
import pickle

In [139]:
filename = 'model.sav'

In [140]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [141]:
load_model = pickle.load(open(filename,'rb'))


In [142]:
model_score_r1 = load_model.score(xr_test1,yr_test1)

In [143]:
model_score_r1

0.9525862068965517