## **Churn Analysis -- Model Building**

#### Import Libraries 

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN


#### Reading CSV file 

In [2]:
df = pd.read_csv('churnData_ML.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Churn,Tenure,City_Tier,CC_Contacted_LY,Service_Score,Account_user_count,CC_Agent_Score,Complain_ly,rev_growth_yoy,...,coupon_used_for_payment_6,coupon_used_for_payment_7,coupon_used_for_payment_8,coupon_used_for_payment_9,Login_device_Computer,Login_device_Mobile,Login_device_Others,Tenure_Bin_0-20,Tenure_Bin_21-40,Tenure_Bin_61-99
0,0,1,4.0,3.0,6.0,3.0,3.0,2.0,1.0,11.0,...,0,0,0,0,0,1,0,1,0,0
1,1,1,0.0,1.0,8.0,3.0,4.0,3.0,1.0,15.0,...,0,0,0,0,0,1,0,1,0,0
2,2,1,0.0,1.0,30.0,2.0,4.0,3.0,1.0,14.0,...,0,0,0,0,0,1,0,1,0,0
3,3,1,0.0,3.0,15.0,2.0,4.0,5.0,0.0,23.0,...,0,0,0,0,0,1,0,1,0,0
4,4,1,0.0,1.0,12.0,2.0,3.0,5.0,0.0,11.0,...,0,0,0,0,0,1,0,1,0,0


In [3]:
df=df.drop('Unnamed: 0',axis=1)

#### Create the Dependent (y) and Independent (X) variables 

In [5]:
X = df.drop('Churn',axis=1) 
X           ## Input Variables 

Unnamed: 0,Tenure,City_Tier,CC_Contacted_LY,Service_Score,Account_user_count,CC_Agent_Score,Complain_ly,rev_growth_yoy,Day_Since_CC_connect,cashback,...,coupon_used_for_payment_6,coupon_used_for_payment_7,coupon_used_for_payment_8,coupon_used_for_payment_9,Login_device_Computer,Login_device_Mobile,Login_device_Others,Tenure_Bin_0-20,Tenure_Bin_21-40,Tenure_Bin_61-99
0,4.0,3.0,6.0,3.0,3.0,2.0,1.0,11.0,5.0,160.0,...,0,0,0,0,0,1,0,1,0,0
1,0.0,1.0,8.0,3.0,4.0,3.0,1.0,15.0,0.0,121.0,...,0,0,0,0,0,1,0,1,0,0
2,0.0,1.0,30.0,2.0,4.0,3.0,1.0,14.0,3.0,134.0,...,0,0,0,0,0,1,0,1,0,0
3,0.0,3.0,15.0,2.0,4.0,5.0,0.0,23.0,3.0,134.0,...,0,0,0,0,0,1,0,1,0,0
4,0.0,1.0,12.0,2.0,3.0,5.0,0.0,11.0,3.0,130.0,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11255,10.0,1.0,34.0,3.0,2.0,1.0,0.0,19.0,4.0,154.0,...,0,0,0,0,1,0,0,1,0,0
11256,13.0,1.0,19.0,3.0,5.0,5.0,0.0,16.0,8.0,227.0,...,0,0,0,0,0,1,0,1,0,0
11257,1.0,1.0,14.0,3.0,2.0,4.0,1.0,22.0,4.0,191.0,...,0,0,0,0,0,1,0,1,0,0
11258,23.0,3.0,11.0,4.0,5.0,4.0,0.0,16.0,9.0,180.0,...,0,0,0,0,1,0,0,0,1,0


In [23]:
y=df['Churn']
y           ## output variable 

0        1
1        1
2        1
3        1
4        1
        ..
11255    0
11256    0
11257    0
11258    0
11259    0
Name: Churn, Length: 11260, dtype: int64

#### Perform Train Test Split 

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30, random_state=42)

In [8]:
X_train

Unnamed: 0,Tenure,City_Tier,CC_Contacted_LY,Service_Score,Account_user_count,CC_Agent_Score,Complain_ly,rev_growth_yoy,Day_Since_CC_connect,cashback,...,coupon_used_for_payment_6,coupon_used_for_payment_7,coupon_used_for_payment_8,coupon_used_for_payment_9,Login_device_Computer,Login_device_Mobile,Login_device_Others,Tenure_Bin_0-20,Tenure_Bin_21-40,Tenure_Bin_61-99
8274,13.0,1.0,14.0,2.0,3.0,1.0,0.0,19.0,3.0,169.0,...,0,0,0,0,0,1,0,1,0,0
5259,19.0,1.0,8.0,4.0,4.0,4.0,0.0,13.0,8.0,199.0,...,0,0,0,0,0,1,0,1,0,0
7756,0.0,1.0,10.0,2.0,4.0,5.0,0.0,16.0,3.0,145.0,...,0,0,0,0,0,1,0,1,0,0
2399,22.0,1.0,13.0,2.0,3.0,4.0,0.0,14.0,7.0,205.0,...,0,0,0,0,0,1,0,0,1,0
5820,16.0,1.0,18.0,2.0,3.0,2.0,0.0,21.0,7.0,125.0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.0,1.0,28.0,2.0,4.0,2.0,1.0,14.0,1.0,125.0,...,0,0,0,0,0,1,0,1,0,0
5191,10.0,3.0,16.0,3.0,4.0,1.0,0.0,12.0,4.0,226.0,...,0,0,0,0,0,1,0,1,0,0
5390,1.0,1.0,36.0,4.0,4.0,5.0,0.0,14.0,3.0,164.0,...,0,0,0,0,0,1,0,1,0,0
860,0.0,1.0,28.0,3.0,3.0,3.0,1.0,15.0,7.0,162.0,...,0,0,0,0,0,1,0,1,0,0


#### Decision Tree Classifier

In [24]:
model_dt = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)


In [25]:
model_dt.fit(X_train,y_train)

In [26]:
y_predict =model_dt.predict(X_test)
y_predict

array([0, 0, 0, ..., 0, 0, 1], shape=(3378,))

In [27]:
model_dt.score(X_test,y_test)

0.8892835997631735

In [28]:
print(classification_report(y_test, y_predict, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93      2789
           1       0.69      0.66      0.67       589

    accuracy                           0.89      3378
   macro avg       0.81      0.80      0.80      3378
weighted avg       0.89      0.89      0.89      3378



**As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.**

**Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.**

**Hence, moving ahead to call SMOTEENN (UpSampling + ENN)**


In [29]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X,y)



In [30]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.3)


In [31]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=5)


In [33]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)

print ('Classification Report')
print(metrics.classification_report(yr_test, yr_predict))

0.9063586178695563
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      2147
           1       0.93      0.91      0.92      2744

    accuracy                           0.91      4891
   macro avg       0.90      0.91      0.91      4891
weighted avg       0.91      0.91      0.91      4891



In [34]:
print(metrics.confusion_matrix(yr_test, yr_predict))


[[1949  198]
 [ 260 2484]]


**Now we can see quite better results, i.e. Accuracy: 90 %, and a very good recall, precision & f1 score for minority class.**

**Let's try with some other classifier.**

#### Random Forest Classifier


In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)


In [37]:
model_rf.fit(X_train,y_train)

In [38]:
y_pred_rf = model_rf.predict(X_test)
y_pred_rf

array([0, 0, 0, ..., 0, 0, 0], shape=(3378,))

In [39]:
model_rf.score(X_test,y_test)

0.8484310242747187

In [40]:
print(classification_report(y_test, y_pred_rf, labels=[0,1]))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92      2789
           1       0.96      0.14      0.24       589

    accuracy                           0.85      3378
   macro avg       0.90      0.57      0.58      3378
weighted avg       0.87      0.85      0.80      3378



In [41]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm._fit_resample(X,y)



In [42]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)


In [43]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)


In [44]:
model_rf_smote.fit(xr_train1,yr_train1)


In [45]:
yr_predict_RF = model_rf_smote.predict(xr_test1)
yr_predict_RF

array([1, 0, 1, ..., 0, 0, 1], shape=(3259,))

In [46]:
model_score_rf1 = model_rf_smote.score(xr_test1, yr_test1)
model_score_rf1

0.9202209266646211

In [47]:
print(metrics.classification_report(yr_test1, yr_predict_RF))


              precision    recall  f1-score   support

           0       0.90      0.93      0.91      1454
           1       0.94      0.92      0.93      1805

    accuracy                           0.92      3259
   macro avg       0.92      0.92      0.92      3259
weighted avg       0.92      0.92      0.92      3259



In [48]:
print(metrics.confusion_matrix(yr_test1, yr_predict_RF))

[[1345  109]
 [ 151 1654]]


**With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.**