In [4]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [5]:
df = pd.read_csv('customer_churn.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [7]:
df = df.drop('Unnamed: 0',axis=1)

### Determining dependent(y) and independent(x) variables

In [8]:
x = df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [9]:
y = df.Churn
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y, train_size = 0.8,random_state = 42)

### Decision Tree Classifier

In [11]:
model_dt = DecisionTreeClassifier(criterion = 'gini',random_state=100,max_depth = 6,min_samples_leaf = 8)

In [12]:
model_dt.fit(x_train,y_train)

In [13]:
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

#### Now here we have predicted Y values (y_pred) for x_test values. Let's compare it with our original y_test values.


In [14]:
model_dt.score(x_test, y_pred)

1.0

### The minority class is 1 or churners but have less predictive power,recall and f1-score

In [49]:
print(classification_report(y_test,y_pred,labels=[0,1]))
model_dt.score(x_test,y_test)

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      1033
           1       0.60      0.50      0.54       374

    accuracy                           0.78      1407
   macro avg       0.71      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



0.7668798862828714

In [16]:
print(confusion_matrix(y_test, y_pred))

[[860 173]
 [155 219]]


#### In the above confusion matrix we have [['True Positive','False Positive']['False Negative,'True Negative']]

#### So accuracy =  (True+ve + True-ve)/Total
#### recall = True+ve/(True+ve + False-ve)

#### We will fix the model using SMOTE analysis.

In [17]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

## This xr_test,xr_train,yr_test,yr_train are used only for Models that is improvised using smote.

In [18]:
xr_train,xr_test,yr_train,yr_test = train_test_split(X_resampled,y_resampled, train_size = 0.8,random_state = 42)

In [19]:
model_dt_smote = DecisionTreeClassifier(criterion = 'gini',random_state=100,max_depth = 6,min_samples_leaf = 8)

In [20]:
model_dt_smote.fit(xr_train,yr_train)


In [21]:
y_pred_smote = model_dt_smote.predict(xr_test)

### So finally using smote we got better results for both churn(1) and non-churners(0)

In [22]:
print(classification_report(yr_test,y_pred_smote,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       547
           1       0.92      0.95      0.93       624

    accuracy                           0.93      1171
   macro avg       0.93      0.93      0.93      1171
weighted avg       0.93      0.93      0.93      1171



In [23]:
print(confusion_matrix(yr_test, y_pred_smote))

[[496  51]
 [ 32 592]]


In [24]:
model_dt_smote.score(xr_test,yr_test)

0.9291204099060631

### End of Model-1 using Decision tree

### Model 2:Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
model_rf = RandomForestClassifier(n_estimators = 100,criterion = 'gini',random_state=100,max_depth = 6,min_samples_leaf = 8)
model_rf.fit(x_train,y_train)
y_pred_rf = model_dt.predict(x_test)

In [47]:
print(classification_report(y_test,y_pred_rf,labels=[0,1]))
model_rf.score(x_test,y_test)

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1033
           1       0.56      0.59      0.57       374

    accuracy                           0.77      1407
   macro avg       0.70      0.71      0.71      1407
weighted avg       0.77      0.77      0.77      1407



0.7789623312011372

In [28]:
model_rf_smote = RandomForestClassifier(n_estimators = 100,criterion = 'gini',random_state=100,max_depth = 6,min_samples_leaf = 8)

In [29]:
model_rf_smote.fit(xr_train,yr_train)

In [30]:
y_pred_smote_rf = model_rf_smote.predict(xr_test)

In [31]:
print(classification_report(yr_test,y_pred_smote_rf,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.95      0.88      0.92       547
           1       0.91      0.96      0.93       624

    accuracy                           0.93      1171
   macro avg       0.93      0.92      0.93      1171
weighted avg       0.93      0.93      0.93      1171



In [32]:
print(confusion_matrix(yr_test, y_pred_smote_rf))

[[484  63]
 [ 23 601]]


In [33]:
model_rf_smote.score(xr_test,yr_test)

0.9265584970111016

### Model 3: XGBoost

In [34]:
from xgboost import XGBClassifier

In [35]:
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(x_train, y_train)

In [36]:
y_pred = xgb.predict(x_test)

In [37]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.85      1033
           1       0.60      0.50      0.54       374

    accuracy                           0.78      1407
   macro avg       0.71      0.69      0.70      1407
weighted avg       0.77      0.78      0.77      1407



In [38]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Confusion Matrix:
 [[906 127]
 [187 187]]


In [39]:
xgb.score(xr_test,yr_test)

0.8804440649017934

## We will use Smotenn to improvise minority data precision and accuracy


In [40]:
xgb_smt = XGBClassifier(eval_metric='logloss')
xgb_smt.fit(xr_train,yr_train)

In [43]:
y_pred_smote_xgb = xgb_smt.predict(xr_test)

In [45]:
print("Classification Report:\n", classification_report(yr_test, y_pred_smote_xgb))

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       547
           1       0.95      0.95      0.95       624

    accuracy                           0.95      1171
   macro avg       0.95      0.95      0.95      1171
weighted avg       0.95      0.95      0.95      1171



In [46]:
xgb_smt.score(xr_test,yr_test)

0.946199829205807