### CUSTOMER CHURN PREDICTION : A PREDICTIVE INSIGHTS FOR CUSTOMER RETENTION


In [242]:
#importing necessary libraries
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

##### **Dataset Information:** The dataset after preprocessing the telco dataset

In [243]:
df=pd.read_csv("telco_churn_pred.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29,29,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56,1889,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53,108,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0


In [244]:
df=df.drop('Unnamed: 0',axis=1)

In [245]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29,29,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56,1889,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53,108,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42,1840,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70,151,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84,1990,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103,7362,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29,346,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74,306,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [246]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

#### Train Test Split

In [247]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

#### **Decision Tree Classifier**

In [248]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [249]:
model_dt.fit(x_train,y_train)

In [250]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [251]:
model_dt.score(x_test,y_test)

0.781042654028436

In [252]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      1528
           1       0.66      0.42      0.52       582

    accuracy                           0.78      2110
   macro avg       0.73      0.67      0.69      2110
weighted avg       0.77      0.78      0.76      2110



##### The accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model since in imbalanced dataset accuracy is normally low.
##### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
##### Hence, using SMOTEENN (UpSampling + ENN) to make it balanced dataset.

In [253]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [254]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.3)

In [255]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [256]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)

0.9392550143266476


In [257]:
print(metrics.classification_report(yr_test, yr_predict))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       771
           1       0.94      0.95      0.95       974

    accuracy                           0.94      1745
   macro avg       0.94      0.94      0.94      1745
weighted avg       0.94      0.94      0.94      1745



In [258]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[715  56]
 [ 50 924]]


##### Now we can see quite better results, i.e. **Accuracy: 93.5 %**, and a very good recall, precision & f1 score for minority class.

#### **Random Forest Classifier**

In [259]:
from sklearn.ensemble import RandomForestClassifier

In [260]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [261]:
model_rf_smote.fit(xr_train1,yr_train1)

In [262]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [263]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)
print(model_score_r1)

0.9382857142857143


##### **Accuracy:93.8 %**

In [264]:
print(metrics.classification_report(yr_test1, yr_predict1))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93       786
           1       0.93      0.97      0.95       964

    accuracy                           0.94      1750
   macro avg       0.94      0.94      0.94      1750
weighted avg       0.94      0.94      0.94      1750



In [265]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[711  75]
 [ 33 931]]


#### **Logistic Regression**

In [266]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [267]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(xr_train1)
x_test_scaled = scaler.transform(xr_test1)

In [268]:
model_lr = LogisticRegression(random_state=42)
model_lr.fit(x_train_scaled, yr_train1)

In [269]:
y_pred = model_lr.predict(x_test_scaled)

In [270]:
accuracy = accuracy_score(yr_test1, y_pred)
accuracy

0.9468571428571428

##### **Accuracy : 94.6%**

In [271]:
print(metrics.classification_report(yr_test1, yr_predict1))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93       786
           1       0.93      0.97      0.95       964

    accuracy                           0.94      1750
   macro avg       0.94      0.94      0.94      1750
weighted avg       0.94      0.94      0.94      1750



In [272]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[711  75]
 [ 33 931]]
