In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the churn dataset. It can be overridden with the CHURN_CSV environment
# variable so the notebook is not tied to one machine's drive layout; the default is
# the original location.
import os

DATA_PATH = os.environ.get("CHURN_CSV", "D://data/churn.csv")
df = pd.read_csv(DATA_PATH)

In [None]:
# EDA has already been performed on this dataset, and no further data cleaning is required.

In [None]:
df.shape

In [None]:
df.info()

In [None]:
round(df.describe(),2).T

In [None]:
df.head()

In [None]:
df['Exited'].value_counts()

In [None]:
# Check the class distribution.
# The column is passed by keyword: passing the vector positionally was deprecated in
# seaborn 0.11, and from 0.12 the only positional argument is `data`.
sns.countplot(x='Exited', data=df)

In [None]:
# Baseline accuracy: the share of the majority class, i.e. the score of a model that
# always predicts the most frequent label. Computed from the data rather than from
# hand-typed counts, so it stays correct if the dataset changes.
df['Exited'].value_counts(normalize=True).max()

In [None]:
df.columns

In [None]:
dummies = pd.get_dummies(df[['Geography','Gender']], drop_first=True)

In [None]:
dummies

In [None]:
x = df.iloc[:,[2,5,6,7,8,9,10,11]]

In [None]:
x = pd.concat([x,dummies], axis=1)

In [None]:
y = df['Exited']

In [None]:
x.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [None]:
x_train.shape

In [None]:
y_train.value_counts()

In [None]:
# Majority-class share in the training partition, computed instead of typed in by hand.
y_train.value_counts(normalize=True).max()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
model_lr = lr.fit(x_train,y_train)

In [None]:
model_lr.coef_

In [None]:
pred_prob=pd.Series(model_lr.predict_proba(x_test)[:,1])

In [None]:
from sklearn import metrics

In [None]:
# Assess how well the model separates the classes before using its predictions:
# ROC (Receiver Operating Characteristic) curve and AUC (Area Under the Curve).

ypred = model_lr.predict_proba(x_test)[:, 1]
auc = metrics.roc_auc_score(y_test, ypred)
fpr, tpr, _ = metrics.roc_curve(y_test, ypred)

# Explicit figure/axes interface; the curve's legend label carries the AUC value.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Curve, Auc = " + str(auc))
ax.legend(loc=4)
plt.show()

In [None]:
pred_churn = model_lr.predict(x_test)

In [None]:
pred_churn

In [None]:
pred_churn[:80]

In [None]:
y_test[:40]

In [None]:
pd.crosstab(y_test,pred_churn)

In [None]:
pred_prob.shape

In [None]:
pred_prob

In [None]:
pd.crosstab(y_test.reset_index(drop=True),pred_prob>0.5)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
confusion_matrix(y_test,pred_churn)

In [None]:
(1584+15)/2000

In [None]:
# Precision 

In [None]:
15/(15+33)

In [None]:
from statsmodels import api as sm

In [None]:
m = sm.GLM(y_train,x_train,family=sm.families.Binomial())

In [None]:
m1 = m.fit()

In [None]:
m1.summary()

In [None]:
m1.fittedvalues()

In [None]:
pred_churn1 = m1.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_churn1>0.5)

In [None]:
accuracy_score(y_test,pred_churn1>0.5)

In [None]:
metrics.precision_score(y_test,pred_churn1>0.5)

In [None]:
recall = 55/(55+328)

In [None]:
# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier()

In [None]:
d1 = dtree.fit(x_train,y_train)

In [None]:
pred_tree = d1.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_tree)

In [None]:
(1403+175)/2000

In [None]:
175/(175+214)

In [None]:
dtree = DecisionTreeClassifier(max_leaf_nodes=10, max_depth=5, min_samples_leaf=5)

In [None]:
d2 = dtree.fit(x_train,y_train)

In [None]:
from dtreeplt import dtreeplt

In [None]:
# Build a dtreeplt plot object for the fitted tree d2, labelled with the training
# column names.
# NOTE(review): "neg"/"pos" presumably correspond to Exited = 0/1 — confirm.
dtree_class = dtreeplt(
    model = d2, 
    feature_names=x_train.columns, 
    target_names=np.array(["neg","pos"]))

In [None]:
# Draw the tree diagram.
dtree_class.view()

In [None]:
pred_tree1 = d2.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_tree1)

In [None]:
(1554+161)/2000

In [None]:
161/(161+63)

In [None]:
# Decision Tree pruning
# Grid search / Hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [None]:
from sklearn import tree

In [None]:
# Candidate settings: 5 depths x 2 leaf limits x 2 minimum leaf sizes = 20 combinations.
params = {
    'max_depth': [2, 4, 6, 8, 10],
    'max_leaf_nodes': [5, 10],
    'min_samples_leaf': [2, 5],
}
clf = tree.DecisionTreeClassifier()
# Search over the grid using GridSearchCV's default cross-validation and scoring.
gcv = GridSearchCV(clf, params)
gcv.fit(x_train, y_train)

In [None]:
d2.get_params().keys()

In [None]:
gcv.best_estimator_

In [None]:
model_gcv = gcv.best_estimator_
model_gcv.fit(x_train,y_train)
pred_gcv = model_gcv.predict(x_test)

In [None]:
accuracy_score(y_test,pred_gcv)

In [None]:
path = d2.cost_complexity_pruning_path(x_train,y_train)

In [None]:
alpha = path['ccp_alphas']

In [None]:
alpha

In [None]:
# Sweep the candidate ccp_alpha values, recording train/test accuracy and precision.
# Each metric has its own list so that entry k always belongs to alpha[k]; previously
# accuracy and precision were interleaved in acc_train/acc_test.
acc_train, acc_test = [], []
prec_train, prec_test = [], []

for ccp in alpha:
    # Named pruned_tree so the loop does not overwrite the `tree` module imported
    # from sklearn, which the grid-search cell relies on.
    pruned_tree = DecisionTreeClassifier(ccp_alpha=ccp)
    pruned_tree.fit(x_train, y_train)
    # Plain local variables instead of ad-hoc attributes set on the label Series.
    train_pred = pruned_tree.predict(x_train)
    test_pred = pruned_tree.predict(x_test)

    acc_train.append(accuracy_score(y_train, train_pred))
    acc_test.append(accuracy_score(y_test, test_pred))
    # zero_division=0: a heavily pruned tree may predict no positives at all, which
    # leaves precision undefined; record 0 without emitting a warning.
    prec_train.append(metrics.precision_score(y_train, train_pred, zero_division=0))
    prec_test.append(metrics.precision_score(y_test, test_pred, zero_division=0))

In [None]:
# Training-set scores collected by the ccp_alpha sweep.
acc_train

In [176]:
# Candidate ccp_alpha values, shown again for comparison with the scores.
alpha

array([0.        , 0.0039947 , 0.00459475, 0.00522287, 0.00701547,
       0.00874131, 0.01302253, 0.01530457, 0.03868233])

In [None]:
# Test-set scores collected by the ccp_alpha sweep.
acc_test

In [177]:
model_ccp = DecisionTreeClassifier(ccp_alpha=0.00459475)

In [178]:
model_ccp = model_ccp.fit(x_train,y_train)

In [181]:
pred_ccp=model_ccp.predict(x_test)

In [182]:
pd.crosstab(y_test,pred_ccp)

col_0,0,1
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1593,24
1,291,92


In [183]:
# Test accuracy of the cost-complexity-pruned tree.
accuracy_score(y_true=y_test, y_pred=pred_ccp)

0.8425

In [186]:
# Test accuracy of the constrained tree d2.
accuracy_score(y_true=y_test, y_pred=pred_tree1)

0.8575

In [187]:
# Test accuracy of the grid-search-selected tree.
accuracy_score(y_true=y_test, y_pred=pred_gcv)

0.8575

In [189]:
# Precision of the pruned tree, computed from the predictions rather than from
# counts copied out of the crosstab.
metrics.precision_score(y_test, pred_ccp)

0.7931034482758621

In [190]:
# True label (rows) against the constrained tree's prediction (columns).
pd.crosstab(index=y_test, columns=pred_tree1)

col_0,0,1
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1554,63
1,222,161


In [191]:
# Precision of the constrained tree, computed from the predictions rather than from
# counts copied out of the crosstab.
metrics.precision_score(y_test, pred_tree1)

0.71875

In [193]:
from sklearn.ensemble import RandomForestClassifier

In [194]:
rf = RandomForestClassifier()

In [195]:
model_rf = rf.fit(x_train,y_train)

In [196]:
pred_rf = model_rf.predict(x_test)

In [197]:
pd.crosstab(y_test,pred_rf)

col_0,0,1
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1555,62
1,223,160


In [198]:
# Random-forest accuracy, computed from the predictions rather than from hand-typed counts.
accuracy_score(y_test, pred_rf)

0.8575

In [199]:
# Random-forest precision, computed from the predictions rather than from hand-typed counts.
metrics.precision_score(y_test, pred_rf)

0.7207207207207207