In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [3]:

df=df.drop('Unnamed: 0',axis=1)

In [4]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [5]:

y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

Train Test Split

In [7]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

Decision Tree Classifier

In [9]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [10]:
model_dt.fit(x_train,y_train)

In [11]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [12]:
model_dt.score(x_test,y_test)

0.7931769722814499

In [13]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.87      1045
           1       0.63      0.48      0.54       362

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.78      0.79      0.78      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [15]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [16]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [17]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [18]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.931740614334471
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       513
           1       0.93      0.96      0.94       659

    accuracy                           0.93      1172
   macro avg       0.93      0.93      0.93      1172
weighted avg       0.93      0.93      0.93      1172



In [19]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[462  51]
 [ 29 630]]


Now we can see quite better results, i.e. Accuracy: 93.87
%, and a very good recall, precision & f1 score for minority class.
Let's try with some other classifier.

Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier


In [23]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [24]:
model_rf.fit(x_train,y_train)

In [25]:
y_pred=model_rf.predict(x_test)

In [26]:
model_rf.score(x_test,y_test)

0.7924662402274343

In [27]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1045
           1       0.65      0.41      0.51       362

    accuracy                           0.79      1407
   macro avg       0.74      0.67      0.69      1407
weighted avg       0.78      0.79      0.78      1407



using SMOTEEN for Random forest classifier

In [29]:

sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [30]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [31]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [32]:
model_rf_smote.fit(xr_train1,yr_train1)

In [33]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [34]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [35]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9355385920271416
              precision    recall  f1-score   support

           0       0.95      0.90      0.93       530
           1       0.92      0.96      0.94       649

    accuracy                           0.94      1179
   macro avg       0.94      0.93      0.93      1179
weighted avg       0.94      0.94      0.94      1179



In [36]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[477  53]
 [ 23 626]]


Performing PCA

In [71]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [73]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [75]:
model.fit(xr_train_pca,yr_train1)

In [76]:
yr_predict_pca = model.predict(xr_test_pca)

In [79]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [81]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.6870229007633588
              precision    recall  f1-score   support

           0       0.68      0.57      0.62       530
           1       0.69      0.78      0.73       649

    accuracy                           0.69      1179
   macro avg       0.69      0.68      0.68      1179
weighted avg       0.69      0.69      0.68      1179



Pickling the model

In [96]:
import pickle

In [86]:
filename= 'Churn_Prediction_Model.sav'

In [88]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [90]:
load_model = pickle.load(open(filename, 'rb'))

In [92]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [94]:
model_score_r1

0.9355385920271416