# Importing Libraries

In [1]:
##!pip install imblearn
##!pip install sklearn
##!pip install numpy
##!pip install pandas

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

<h2> Importing data set

In [3]:
df=pd.read_csv("tel_churn.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [5]:
df=df.drop("Unnamed: 0",axis = 1)

<h2> Extracting Independent variables

In [6]:
x = df.drop("Churn", axis = 1)
x.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [7]:
x.shape

(7032, 50)

<H2> Extracting Dependent variable

In [8]:
y=df.Churn

In [9]:
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [10]:
y.shape

(7032,)

<h3> Train Test Split

In [11]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 12)

In [12]:
x_train.shape

(5625, 50)

In [13]:
y_train.shape

(5625,)

In [14]:
x_test.shape

(1407, 50)

<h2> Decision Tree Classifier

In [15]:
model_dt = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth = 6, min_samples_leaf = 8)

In [16]:
model_dt.fit(x_train,y_train)

In [17]:
y_pred = model_dt.predict(x_test)

In [18]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
confusion_matrix(y_test,y_pred)

array([[923, 139],
       [163, 182]], dtype=int64)

In [20]:
accuracy_score(y_test,y_pred)

0.7853589196872779

In [21]:
model_dt.score(x_test,y_test)

0.7853589196872779

In [22]:
print(classification_report(y_test,y_pred,labels= [0,1]))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1062
           1       0.57      0.53      0.55       345

    accuracy                           0.79      1407
   macro avg       0.71      0.70      0.70      1407
weighted avg       0.78      0.79      0.78      1407



As we can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets

Hence, we need to check recall, precision and f1 score for the minority class and it's quite evident that the precision, recall and f1 score is too low for class 1 i.e., churned customers.

 Hence, moving ahead to call SMOTEENN(UpSampling + ENN)

<h2> Now we balance the data

In [23]:
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x,y)

In [24]:
xr_train,xr_test,yr_train,yr_test = train_test_split(x_resampled,y_resampled,test_size = 0.2,random_state = 12)

In [25]:
xr_train.shape

(4696, 50)

In [26]:
model_dt_smote = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth = 6,min_samples_leaf=8)

In [27]:
model_dt_smote.fit(xr_train,yr_train)

**Predicting the values**

In [28]:
yr_pred = model_dt_smote.predict(xr_test)

**Checking model accuracy, score**

In [29]:
model_score_r = model_dt_smote.score(xr_test,yr_test)
print(model_score_r)

0.92


In [30]:
print(metrics.classification_report(yr_test,yr_pred))

              precision    recall  f1-score   support

           0       0.94      0.87      0.90       506
           1       0.91      0.96      0.93       669

    accuracy                           0.92      1175
   macro avg       0.92      0.91      0.92      1175
weighted avg       0.92      0.92      0.92      1175



In [31]:
confusion_matrix(yr_test,yr_pred)

array([[442,  64],
       [ 30, 639]], dtype=int64)

In [32]:
accuracy_score(yr_test,yr_pred)

0.92

- Now, we can see better results with **92** accuracy and very good recall, precision and f1-score for minority class.

Also, consider other algorithms

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [34]:
model_rf = RandomForestClassifier(n_estimators = 100, criterion = "gini", random_state = 100, max_depth = 6, min_samples_leaf = 8)

In [35]:
model_rf.fit(x_train,y_train)

In [36]:
y_pred_rf = model_rf.predict(x_test)

In [37]:
model_rf.score(x_test,y_test)

0.8109452736318408

In [38]:
print(classification_report(y_test,y_pred,labels = [0,1]))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1062
           1       0.57      0.53      0.55       345

    accuracy                           0.79      1407
   macro avg       0.71      0.70      0.70      1407
weighted avg       0.78      0.79      0.78      1407



In [39]:
print(accuracy_score(y_test,y_pred))

0.7853589196872779


Here, we can see low accuracy, as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets

Hence, we need to check recall, precision and f1 score for the minority class and it's quite evident that the precision, recall and f1 score is too low for class 1 i.e., churned customers.

Hence, moving ahead to call SMOTEENN(UpSampling + ENN)

<h2>  Now balance the data

In [40]:
sm = SMOTEENN()

In [41]:
model_rf_smote = RandomForestClassifier(n_estimators = 100, criterion = 'gini',random_state = 100,max_depth = 6,min_samples_leaf = 8)

In [42]:
model_rf_smote.fit(xr_train,yr_train)

In [43]:
yr_pred_smote = model_rf_smote.predict(xr_test)

<h2> Checking model accuracy, score, conduson matrix

In [44]:
confusion_matrix(yr_test,yr_pred_smote)

array([[445,  61],
       [ 25, 644]], dtype=int64)

In [45]:
accuracy_score(yr_test,yr_pred_smote)

0.9268085106382978

In [46]:
print(metrics.classification_report(yr_test,yr_pred_smote))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91       506
           1       0.91      0.96      0.94       669

    accuracy                           0.93      1175
   macro avg       0.93      0.92      0.92      1175
weighted avg       0.93      0.93      0.93      1175



<h3> with Random Forest classifier, we get good results, infact better than Decision Tree Classifier

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
model_lr = LogisticRegression()

In [49]:
model_lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
y_pred = model_lr.predict(x_test)

In [51]:
accuracy_score(y_test,y_pred)

0.8052594171997157

In [52]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      1062
           1       0.63      0.50      0.56       345

    accuracy                           0.81      1407
   macro avg       0.74      0.70      0.72      1407
weighted avg       0.79      0.81      0.80      1407



- This is much better accuracy than previous models but we can see precision, recall and f1-score for 1(churned customers) is very less

Hence we can't use this imbalanced data set to measure the model

<h2> Now we use balanced data to fit the model

In [53]:
sm = SMOTEENN()

In [54]:
model_lr_smote = LogisticRegression().fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
y_pred_lr = model_lr_smote.predict(xr_test)

In [56]:
accuracy_score(yr_test,y_pred_lr)

0.8340425531914893

In [57]:
print(classification_report(yr_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.73      0.97      0.83       506
           1       0.97      0.73      0.83       669

    accuracy                           0.83      1175
   macro avg       0.85      0.85      0.83      1175
weighted avg       0.87      0.83      0.83      1175



<h2> Performing PCA

In [58]:
## Applying PAC

from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train)
xr_test_pca = pca.transform(xr_test)
explained_variance = pca.explained_variance_ratio_

In [59]:
model = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 100,min_samples_leaf = 8,max_depth=6)

In [60]:
model.fit(xr_train_pca,yr_train)

In [61]:
yr_pred_pca = model.predict(xr_test_pca)

In [62]:
model_score_pca = model.score(xr_test_pca,yr_test)

In [63]:
print(model_score_pca)

0.7242553191489361


In [64]:
print(classification_report(yr_test,yr_pred_pca))

              precision    recall  f1-score   support

           0       0.70      0.62      0.66       506
           1       0.74      0.80      0.77       669

    accuracy                           0.72      1175
   macro avg       0.72      0.71      0.71      1175
weighted avg       0.72      0.72      0.72      1175



with PCA we couldn't see any better results, hence let's finalise the model which was created by **Random Forest Classifier**

<h2> Pickling the model

In [65]:
import pickle

In [66]:
filename = 'model.sav'

In [67]:
pickle.dump(model_rf_smote,open(filename,'wb'))

In [68]:
load_model = pickle.load(open(filename,'rb'))

In [69]:
model_score_r1 = load_model.score(xr_test,yr_test)

In [70]:
model_score_r1

0.9268085106382978

<h3> Our final model i.e., Random Forest Classifier with SMOTEENN is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI