In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the churn dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path - adjust it for your environment.
df = pd.read_csv("D://data/churn.csv")

In [None]:
# EDA was already performed on this dataset elsewhere, and no data cleaning is required.

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics rounded to 2 decimals, transposed so each described column is a row
round(df.describe(),2).T

In [None]:
# First rows of the raw data
df.head()

In [None]:
# Counts per value of 'Exited' (the column used as the target `y` further down)
df['Exited'].value_counts()

In [None]:
# Check the class distribution of the target.
# The column is passed by keyword: seaborn 0.11 warns about a bare positional
# vector, and from 0.12 the only positional argument is `data`, so the old
# call no longer means "count the values of this Series".
sns.countplot(x='Exited', data=df)

In [None]:
# Baseline accuracy: the share of the majority class, i.e. the accuracy of a
# model that always predicts the most frequent value of 'Exited'.
# Computed from the data so it stays correct if the dataset changes.
df['Exited'].value_counts(normalize=True).max()

In [None]:
df.columns

In [None]:
dummies = pd.get_dummies(df[['Geography','Gender']], drop_first=True)

In [None]:
dummies

In [None]:
x = df.iloc[:,[2,5,6,7,8,9,10,11]]

In [None]:
x = pd.concat([x,dummies], axis=1)

In [None]:
y = df['Exited']

In [None]:
x.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [None]:
x_train.shape

In [None]:
y_train.value_counts()

In [None]:
6346/8000

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings
lr = LogisticRegression()

In [None]:
# fit() returns the estimator itself, so model_lr and lr refer to the same fitted object
model_lr = lr.fit(x_train,y_train)

In [None]:
# Fitted coefficients, one per column of x_train
model_lr.coef_

In [None]:
# Column 1 of predict_proba for every test row (presumably P(Exited == 1) - confirm via model_lr.classes_).
# Wrapping the array in a Series gives it a fresh 0..n-1 index, NOT y_test's index.
pred_prob=pd.Series(model_lr.predict_proba(x_test)[:,1])

In [None]:
from sklearn import metrics

In [None]:
# Check how well the model separates the classes before using it for prediction: ROC / AUC
#
# ROC: Receiver Operating Characteristic plot (true positive rate vs false positive rate)
# AUC: Area Under the (ROC) Curve

ypred = model_lr.predict_proba(x_test)[:,1]
fpr,tpr, _ = metrics.roc_curve(y_test,ypred)
auc = metrics.roc_auc_score(y_test,ypred)

fig, ax = plt.subplots()
ax.plot(fpr,tpr,label = "Curve, Auc = "+str(auc))
# Diagonal = performance of random guessing, for visual reference
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curve - logistic regression")
ax.legend(loc=4)
plt.show()

In [None]:
# Hard class predictions for the test rows
pred_churn = model_lr.predict(x_test)

In [None]:
pred_churn

In [None]:
# First 80 predicted labels
pred_churn[:80]

In [None]:
# First 40 actual labels
y_test[:40]

In [None]:
# Confusion table: rows = actual labels (y_test), columns = predicted labels
pd.crosstab(y_test,pred_churn)

In [None]:
pred_prob.shape

In [None]:
pred_prob

In [None]:
# Same table from the probabilities with an explicit 0.5 cut-off.
# pred_prob carries a 0..n-1 index, so y_test's index is reset to line the two Series up row by row.
pd.crosstab(y_test.reset_index(drop=True),pred_prob>0.5)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
confusion_matrix(y_test,pred_churn)

In [None]:
(1584+15)/2000

In [None]:
# Precision 

In [None]:
15/(15+33)

In [None]:
from statsmodels import api as sm

In [None]:
m = sm.GLM(y_train,x_train,family=sm.families.Binomial())

In [None]:
m1 = m.fit()

In [None]:
m1.summary()

In [None]:
m1.fittedvalues()

In [None]:
pred_churn1 = m1.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_churn1>0.5)

In [None]:
accuracy_score(y_test,pred_churn1>0.5)

In [None]:
metrics.precision_score(y_test,pred_churn1>0.5)

In [None]:
recall = 55/(55+328)

In [None]:
# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier()

In [None]:
d1 = dtree.fit(x_train,y_train)

In [None]:
pred_tree = d1.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_tree)

In [None]:
(1403+175)/2000

In [None]:
175/(175+214)

In [None]:
# A size-constrained tree: at most 10 leaves, depth at most 5, at least 5 samples per leaf
dtree = DecisionTreeClassifier(max_leaf_nodes=10, max_depth=5, min_samples_leaf=5)

In [None]:
# dtree was rebound to a new estimator above, so d2 is a different object from d1
d2 = dtree.fit(x_train,y_train)

In [None]:
from dtreeplt import dtreeplt

In [None]:
# Wrap the fitted tree in the third-party dtreeplt helper, passing the feature names
# and display names for the two classes
dtree_class = dtreeplt(
    model = d2, 
    feature_names=x_train.columns, 
    target_names=np.array(["neg","pos"]))

In [None]:
# Render the tree
dtree_class.view()

In [None]:
pred_tree1 = d2.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_tree1)

In [None]:
(1554+161)/2000

In [None]:
161/(161+63)

In [None]:
# Decision Tree pruning
# Grid search / Hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [None]:
from sklearn import tree

In [None]:
# Candidate values for each tree hyper-parameter; the search tries every combination.
params = {
    'max_depth': [2, 4, 6, 8, 10],
    'max_leaf_nodes': [5, 10],
    'min_samples_leaf': [2, 5],
}
# Base estimator whose hyper-parameters are searched
clf = DecisionTreeClassifier()
# Cross-validated exhaustive search with GridSearchCV's default settings
gcv = GridSearchCV(clf, params)
gcv.fit(x_train,y_train)

In [None]:
# Names of the hyper-parameters the tree estimator exposes (valid keys for a search grid)
d2.get_params().keys()

In [None]:
# Tree configuration selected by the grid search
gcv.best_estimator_

In [None]:
model_gcv = gcv.best_estimator_
# NOTE(review): GridSearchCV already refits the best estimator on the data passed to fit()
# when refit=True (its default), so this extra fit looks redundant - confirm.
model_gcv.fit(x_train,y_train)
pred_gcv = model_gcv.predict(x_test)

In [None]:
accuracy_score(y_test,pred_gcv)

In [None]:
# Cost-complexity pruning path computed from the training data using d2's settings
path = d2.cost_complexity_pruning_path(x_train,y_train)

In [None]:
# Candidate pruning strengths returned by the path
alpha = path['ccp_alphas']

In [None]:
alpha

In [None]:
acc_train, acc_test = [],[]

for i in alpha:
    tree = DecisionTreeClassifier(ccp_alpha=i)
    tree.fit(x_train,y_train)
    y_train.pred = tree.predict(x_train)
    y_test.pred = tree.predict(x_test)
    
    acc_train.append(accuracy_score(y_train,y_train.pred))
    acc_test.append(accuracy_score(y_test,y_test.pred))
    acc_train.append(metrics.precision_score(y_train,y_train.pred))
    acc_test.append(metrics.precision_score(y_test,y_test.pred))

In [None]:
acc_train

In [None]:
alpha

In [None]:
acc_test

In [None]:
model_ccp = DecisionTreeClassifier(ccp_alpha=0.00459475)

In [None]:
model_ccp = model_ccp.fit(x_train,y_train)

In [None]:
pred_ccp=model_ccp.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_ccp)

In [None]:
accuracy_score(y_test,pred_ccp)

In [None]:
accuracy_score(y_test,pred_tree1)

In [None]:
accuracy_score(y_test,pred_gcv)

In [None]:
92/(92+24)

In [None]:
pd.crosstab(y_test,pred_tree1)

In [None]:
161/(161+63)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()

In [None]:
model_rf = rf.fit(x_train,y_train)

In [None]:
pred_rf = model_rf.predict(x_test)

In [None]:
model_rf.feature_importances_

In [None]:
pd.crosstab(y_test,pred_rf)

In [None]:
(1555+160)/2000

In [None]:
160/(160+62)

In [None]:
df[df['Exited']==1][:6]

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
s = SMOTE()

In [None]:
x_train_smote, y_train_smote = s.fit_sample(x_train,y_train)

In [None]:
from collections import Counter

In [None]:
print("before smote: ", Counter(y_train))
print("after smote: ", Counter(y_train_smote))

In [None]:
x_train.iloc[:10]

In [None]:
x_train_smote[:10][:]

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
glm = LogisticRegression()

In [None]:
model= glm.fit(x_train_smote,y_train_smote)

In [None]:
pred_glm = model.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_glm)

In [None]:
(1072+243)/2000

In [None]:
243/(243+545)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=5,min_samples_leaf=5)
dtree=dtree.fit(x_train_smote,y_train_smote)

In [None]:
pred_smote = dtree.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_smote)

In [None]:
199/(199+154)

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with a sigmoid kernel, trained on the original
# (not SMOTE-resampled) training split.
# NOTE(review): the features are not scaled before fitting; kernel SVMs are usually sensitive to feature scale - confirm this is intended.
sv = SVC(kernel='sigmoid')
model_svm = sv.fit(x_train,y_train)

In [None]:
pred_svm = model_svm.predict(x_test)

In [None]:
# Rows = actual, columns = predicted
pd.crosstab(y_test,pred_svm)

In [None]:
# Data Preparation Techniques

# Data Pre-processing : Sampling, Data Transformation --> Standardisation, Normalisation

# Cross-validation Techniques : K-fold, Stratified K-fold, Repeated K-fold

# Feature Selection Techniques : P-value, Step function, K-best, RFE, AUC/ROC

# Handling Class Imbalances : Under Sampling, Over Sampling, SMOTE

# Feature Extraction Techniques : Curse of Dimensionality : PCA, LDA

In [None]:
# Data Transformation

In [None]:
# Standardisation : (Center & Scale) : (xi-mean(x))/std(x)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
ss = StandardScaler().fit(x_train)

In [None]:
ss = ss.transform(x_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
model_knn = knn.fit(ss,y_train)

In [None]:
pred_knn = model_knn.predict(x_test)

In [None]:
pd.crosstab(y_test,pred_knn)

In [None]:
# Normalisation --> Range function: (xi - min(x)) / (max(x) - min(x))

In [None]:
# Min-max scaler mapping each feature of x_train onto the range [0, 1]
range_x = MinMaxScaler(feature_range=(0,1))
# The scaled array is only displayed here; it is not stored in a variable
range_x.fit_transform(x_train)

In [None]:
# Cross Validation Techniques

In [None]:
# K-fold cv - 3 fold cv

In [None]:
# 10,000

# 1 fold --> 3500
# 2 fold --> 3250
# 3 fold --> 3250

# 1 model : 1st & 2nd chunk (training) and predicting on the 3rd chunk        --> 85%

# 2nd model : 1st & 3rd chunk (training) and predicting on the 2nd chunk     --> 75%

# 3rd model : 2nd & 3rd (training) and predicting on the 1st chunk            --> 80%

# average of all the models' accuracy

In [None]:
# Feature Selection Techniques

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, RFE

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
rfe_lr = RFE(lr, 5)
result = rfe_lr.fit(x_train,y_train)

In [None]:
result.n_features_

In [None]:
result.ranking_

In [None]:
# Chi2 with Kbest

In [None]:
model = SelectKBest(score_func=chi2,k=5)
result_kbest = model.fit(x,y)

In [None]:
result_kbest.scores_

In [None]:
x_train.columns

In [None]:
for i in result_kbest.scores_:
    print(round(i,2))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Extra-trees ensemble used here only to rank features; note that `model` is
# rebound from the SelectKBest object created earlier in the notebook.
model = ExtraTreesClassifier()

In [None]:
# Fitted on the full x / y, not on the training split
model.fit(x,y)

In [None]:
# Importance of each feature, in the column order of x
model.feature_importances_

In [None]:
# Evaluation of Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [None]:
models = []

In [None]:
models.append(("LR", LogisticRegression()))
models.append(("Tree", DecisionTreeClassifier()))
models.append(("SVM", SVC()))
models.append(("RF", RandomForestClassifier()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("LDA", LinearDiscriminantAnalysis()))
models.append(("NB", GaussianNB()))
models.append(("XGB", XGBClassifier()))

In [None]:
models

In [None]:
# Collectors filled by the cross-validation loop below:
# `results` gets one array of fold scores per model, `names` the matching labels
results = []
names = []

In [None]:
from sklearn.model_selection import KFold , cross_val_score, GridSearchCV

In [None]:
import warnings
warnings.ignore=True

In [None]:
# Score every candidate model with 5-fold cross-validation on the full data set.
# The collectors are reset here so re-running this cell does not append a
# second copy of every model (which would break the box plot labels).
results, names = [], []
# Built once: the unshuffled splitter is identical for every model
kfold = KFold(n_splits=5)

for name,model in models:
    cv_results = cross_val_score(model, x,y,cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)

    # label: mean accuracy in % (standard deviation across folds in %)
    outcome = "%s: %f (%f)" % (name,cv_results.mean()*100, cv_results.std()*100)
    print(outcome)

In [None]:
# Fold scores of the last model evaluated in the loop above
cv_results

In [None]:
# NOTE(review): interactive help lookup - looks like an exploration leftover; consider removing
help(cross_val_score)

In [None]:
# Fold scores for every model, in the same order as `names`
results

In [None]:
# Box plot of the cross-validation accuracies, one box per model.
fig = plt.figure()
axis = fig.add_subplot(111)
# Draw on the explicit Axes instead of the implicit pyplot "current axes"
axis.boxplot(results)
axis.set_xticklabels(names)
# Title and axis labels so the figure can be read on its own
axis.set_title("Cross-validation accuracy by model")
axis.set_xlabel("Model")
axis.set_ylabel("Accuracy")
plt.show()

In [87]:
# Explore the pipeline method to evaluate algorithms