### Importing Libraries

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from imblearn.combine import SMOTEENN
import numpy as np

### Splitting data

In [2]:
df=pd.read_csv("customer-churn-data-preprocessed.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,gender_Female,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1-12,tenure_group_13-24,tenure_group_25-36,tenure_group_37-48,tenure_group_49-60,tenure_group_61-72
0,0,0,1,0,0,1,29.85,29.85,0,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,0,0,1,0,56.95,1889.5,0,0,...,0,0,0,1,0,0,1,0,0,0
2,2,0,0,0,1,1,53.85,108.15,1,0,...,0,0,0,1,1,0,0,0,0,0
3,3,0,0,0,0,0,42.3,1840.75,0,0,...,1,0,0,0,0,0,0,1,0,0
4,4,0,0,0,1,1,70.7,151.65,1,1,...,0,0,1,0,1,0,0,0,0,0


In [3]:
df=df.drop('Unnamed: 0',axis=1)

In [4]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,gender_Female,gender_Male,MultipleLines_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1-12,tenure_group_13-24,tenure_group_25-36,tenure_group_37-48,tenure_group_49-60,tenure_group_61-72
0,0,1,0,0,1,29.85,29.85,1,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,0,1,0,56.95,1889.50,0,1,1,...,0,0,0,1,0,0,1,0,0,0
2,0,0,0,1,1,53.85,108.15,0,1,1,...,0,0,0,1,1,0,0,0,0,0
3,0,0,0,0,0,42.30,1840.75,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,0,0,1,1,70.70,151.65,1,0,1,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,1,1,1,1,84.80,1990.50,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,1,1,1,1,103.20,7362.90,1,0,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,1,1,0,1,29.60,346.45,1,0,0,...,0,0,1,0,1,0,0,0,0,0
7030,1,1,0,1,1,74.40,306.60,0,1,0,...,0,0,0,1,1,0,0,0,0,0


In [5]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y)
#stratify because imbalanced data is there. so to ensure test sample has balanced data.

### Decision Tree Classifier

In [7]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [8]:
model_dt.fit(x_train,y_train)

In [9]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [10]:
model_dt.score(x_test,y_test)

0.7967306325515281

In [11]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1033
           1       0.64      0.54      0.58       374

    accuracy                           0.80      1407
   macro avg       0.74      0.71      0.72      1407
weighted avg       0.79      0.80      0.79      1407



In [12]:
print(metrics.confusion_matrix(y_test, y_pred))

[[920 113]
 [173 201]]


In [13]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [14]:
xr_train_dt,xr_test_dt,yr_train_dt,yr_test_dt=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [15]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [16]:
model_dt_smote.fit(xr_train_dt,yr_train_dt)
yr_predict_dt = model_dt_smote.predict(xr_test_dt)
model_score_r = model_dt_smote.score(xr_test_dt, yr_test_dt)
print(model_score_r)
print(metrics.classification_report(yr_test_dt, yr_predict_dt))

0.9245283018867925
              precision    recall  f1-score   support

           0       0.93      0.90      0.92       529
           1       0.92      0.94      0.93       637

    accuracy                           0.92      1166
   macro avg       0.93      0.92      0.92      1166
weighted avg       0.92      0.92      0.92      1166



In [17]:
print(metrics.confusion_matrix(yr_test_dt, yr_predict_dt))

[[477  52]
 [ 36 601]]


### Random Forest Classifier

In [18]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [19]:
model_rf.fit(x_train,y_train)

In [20]:
y_pred=model_rf.predict(x_test)

In [21]:
model_rf.score(x_test,y_test)

0.8137882018479033

In [22]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      1033
           1       0.74      0.46      0.57       374

    accuracy                           0.81      1407
   macro avg       0.78      0.70      0.73      1407
weighted avg       0.80      0.81      0.80      1407



In [23]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [24]:
xr_train_rf,xr_test_rf,yr_train_rf,yr_test_rf=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [25]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [26]:
model_rf_smote.fit(xr_train_rf,yr_train_rf)

In [27]:
yr_predict_rf = model_rf_smote.predict(xr_test_rf)

In [28]:
model_score_r1 = model_rf_smote.score(xr_test_rf, yr_test_rf)

In [29]:
print(model_score_r1)
print(metrics.classification_report(yr_test_rf, yr_predict_rf))

0.9319148936170213
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       521
           1       0.92      0.96      0.94       654

    accuracy                           0.93      1175
   macro avg       0.93      0.93      0.93      1175
weighted avg       0.93      0.93      0.93      1175



In [30]:
print(metrics.confusion_matrix(yr_test_rf, yr_predict_rf))

[[470  51]
 [ 29 625]]


With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.

### Logistic Regression

In [31]:
model_lg = LogisticRegression(max_iter=1000)

In [32]:
model_lg.fit(x_train,y_train)

In [33]:
y_pred = model_lg.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [34]:
model_lg.score(x_test,y_test)

0.8095238095238095

In [35]:
print(metrics.confusion_matrix(y_test, y_pred))

[[943  90]
 [178 196]]


In [36]:
print(classification_report(y_test, y_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1033
           1       0.69      0.52      0.59       374

    accuracy                           0.81      1407
   macro avg       0.76      0.72      0.73      1407
weighted avg       0.80      0.81      0.80      1407



In [37]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [38]:
xr_train_lg,xr_test_lg,yr_train_lg,yr_test_lg=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [39]:
model_lg_smote=LogisticRegression(max_iter=1000)

In [40]:
model_lg_smote.fit(xr_train_lg,yr_train_lg)
yr_predict_lg = model_lg_smote.predict(xr_test_lg)
model_score_r = model_lg_smote.score(xr_test_lg, yr_test_lg)
print(model_score_r)
print(metrics.classification_report(yr_test_lg, yr_predict_lg))

0.9366319444444444
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       521
           1       0.94      0.95      0.94       631

    accuracy                           0.94      1152
   macro avg       0.94      0.94      0.94      1152
weighted avg       0.94      0.94      0.94      1152



In [41]:
print(metrics.confusion_matrix(yr_test_lg, yr_predict_lg))

[[482  39]
 [ 34 597]]


### Naive Bayes

In [42]:
from mixed_naive_bayes import MixedNB

In [43]:
model_nb = MixedNB(categorical_features=[ 0,  1,  2,  3,  4,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45])
model_nb.fit(x_train,y_train)
y_pred=model_nb.predict(x_test)
model_nb.score(x_test,y_test)


0.7377398720682303

In [44]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.91      0.71      0.80      1033
           1       0.50      0.81      0.62       374

    accuracy                           0.74      1407
   macro avg       0.71      0.76      0.71      1407
weighted avg       0.80      0.74      0.75      1407



In [45]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [46]:
xr_train_nb,xr_test_nb,yr_train_nb,yr_test_nb=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [47]:
model_nb_smote=MixedNB(categorical_features=[ 0,  1,  2,  3,  4,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45])

In [48]:
model_nb_smote.fit(xr_train_nb,yr_train_nb)

MixedNB(alpha=0.5, var_smoothing=1e-09)

In [49]:
yr_predict_nb = model_nb_smote.predict(xr_test_nb)

In [50]:
model_score_r1 = model_nb_smote.score(xr_test_nb, yr_test_nb)

In [51]:
print(model_score_r1)
print(metrics.classification_report(yr_test_nb, yr_predict_nb))

0.9073756432246999
              precision    recall  f1-score   support

           0       0.93      0.86      0.89       528
           1       0.89      0.95      0.92       638

    accuracy                           0.91      1166
   macro avg       0.91      0.90      0.91      1166
weighted avg       0.91      0.91      0.91      1166



In [52]:
print(metrics.confusion_matrix(yr_test_nb, yr_predict_nb))

[[453  75]
 [ 33 605]]


### Saving Model

In [53]:
import pickle

In [54]:
filenamedt = 'model-dt.sav'
filenamerf = 'model-rf.sav'
filenamelg = 'model-lg.sav'
filenamenb = 'model-nb.sav'

In [55]:
pickle.dump(model_dt_smote, open(filenamedt, 'wb'))

In [56]:
load_model = pickle.load(open(filenamedt, 'rb'))

In [57]:
model_score_r1 = load_model.score(xr_test_dt, yr_test_dt)

In [58]:
model_score_r1

0.9245283018867925

In [59]:
pickle.dump(model_rf_smote, open(filenamerf, 'wb'))

In [60]:
load_model = pickle.load(open(filenamerf, 'rb'))

In [61]:
model_score_r1 = load_model.score(xr_test_rf, yr_test_rf)

In [62]:
model_score_r1

0.9319148936170213

In [63]:
pickle.dump(model_lg_smote, open(filenamelg, 'wb'))

In [64]:
load_model = pickle.load(open(filenamelg, 'rb'))

In [65]:
model_score_r1 = load_model.score(xr_test_lg, yr_test_lg)

In [66]:
model_score_r1

0.9366319444444444

In [67]:
pickle.dump(model_nb_smote, open(filenamenb, 'wb'))

In [68]:
load_model = pickle.load(open(filenamenb, 'rb'))

In [69]:
model_score_r1 = load_model.score(xr_test_nb, yr_test_nb)

In [70]:
model_score_r1

0.9073756432246999