In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Install a global "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this also hides library diagnostics (deprecations, solver or
# chained-assignment warnings) — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('D://data/churn.csv')

In [None]:
df[:5]

In [None]:
# Data dictionary
# Explore, clean & analyse data

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df['Exited'].unique()

In [None]:
df.corr()

In [None]:
# Check the class distribution of the variable

In [None]:
df.Exited.value_counts()

In [None]:
# Baseline accuracy
7963/10000

In [None]:
df[:5]

In [None]:
x = df.iloc[:,2:12]

In [None]:
x[:2]

In [None]:
y = df['Exited']

In [None]:
df1.corr()

In [None]:
# Linear  --> Linear regression, Logistic regression
# Non-linear --> Tree, SVM, KNN, Bayes
# Ensemble --> Bagging (Random forest), Boosting (XGboost)

In [None]:
# Per-'Geography' group means of the numeric columns.
# numeric_only=True keeps the text columns (e.g. 'Gender') out of the
# aggregation; without it recent pandas releases raise a TypeError.
df.groupby('Geography').mean(numeric_only=True)

In [None]:
dummies = pd.get_dummies(df[['Geography','Gender']], drop_first=True)

In [None]:
dummies

In [None]:
x.drop(['Geography','Gender'], axis=1, inplace=True)

In [None]:
x[:2]

In [None]:
x1 = pd.concat([x,dummies], axis=1)

In [None]:
x1[:2]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
xtrain,xtest,ytrain,ytest = train_test_split(x1,y,test_size=0.2, random_state=2)

In [None]:
xtrain.shape

In [None]:
ytrain.shape

In [None]:
ytrain.value_counts()

In [None]:
6446/8000

In [None]:
ytest.value_counts()

In [None]:
1617/2000

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
model_lr = lr.fit(xtrain,ytrain)

In [None]:
model_lr.coef_

In [None]:
pred = model_lr.predict(xtest)

In [None]:
pred[:55]

In [None]:
ytest[:55]

In [None]:
ytest.value_counts()

In [None]:
pd.crosstab(ytest,pred)

In [None]:
# Accuracy
(1584+15)/2000

In [None]:
# Precision --> True positives among the positive results model has caught
15/(33+15)

In [None]:
# Check the model's goodness of fit using the area under the ROC curve (AUC) --> sklearn.metrics here (the ROCR package is the R equivalent)

In [None]:
from sklearn import metrics 

In [None]:
y_pred = model_lr.predict_proba(xtest)[:,1]
fpr,tpr, _ = metrics.roc_curve(ytest, y_pred)
auc = metrics.roc_auc_score(ytest,y_pred)
plt.plot(fpr,tpr,label = "curve, auc = "+str(auc))
plt.legend(loc=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()

In [None]:
model_lr.predict_proba(xtest)

In [None]:
# Tree --> Decision trees are prone to overfitting

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=5, max_leaf_nodes=10)

In [None]:
model_tree = dtree.fit(xtrain,ytrain)

In [None]:
pred_tree = model_tree.predict(xtest)

In [None]:
pd.crosstab(ytest,pred_tree)

In [None]:
(1554+161)/2000

In [None]:
161/(161+63)

In [None]:
from dtreeplt import dtreeplt

In [None]:
#!pip install dtreeplt

In [None]:
dtree_plt = dtreeplt(model=model_tree, feature_names=xtrain.columns, target_names=np.array(['pos','neg']))

In [None]:
dtree_plt.view()

In [None]:
help(dtreeplt)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Render the fitted tree with scikit-learn's own plotter.
# class_names must be given in ascending class order, so (assuming Exited is
# coded 0/1) class 0 is labelled 'neg' and class 1 'pos'; the original
# ['pos', 'neg'] labelled the leaves the wrong way round.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter.
plt.figure(figsize=(12, 8))
plot_tree(model_tree, filled=True, feature_names=list(xtrain.columns), class_names=['neg', 'pos'])
plt.show()

In [None]:
# Decision tree pruning

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [None]:
from sklearn import tree

In [None]:
params = {'max_depth':[2,4,6,8,10],
         'max_leaf_nodes':[5,10],
         'min_samples_leaf':[1,2]}

In [None]:
clf = tree.DecisionTreeClassifier()
gcv = GridSearchCV(estimator=clf, param_grid=params)
gcv.fit(xtrain,ytrain)

In [None]:
gcv.best_estimator_

In [None]:
final_model = gcv.best_estimator_
final_model.fit(xtrain,ytrain)
ytrain_pred = final_model.predict(xtrain)
ytest_pred = final_model.predict(xtest)

In [None]:
pd.crosstab(ytrain,ytrain_pred)

In [None]:
(6087+744)/8000

In [None]:
744/(744+259)

In [None]:
pd.crosstab(ytest,ytest_pred)

In [None]:
(161+1554)/2000

In [None]:
161/(161+63)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=150)

In [None]:
rf_model = rf.fit(xtrain,ytrain)

In [None]:
pred_rf = rf_model.predict(xtest)

In [None]:
pd.crosstab(ytest,pred_rf)

In [None]:
158/(158+62)

In [None]:
rf_model.feature_importances_

In [None]:
xtrain.columns

In [None]:
#!pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=6)

In [None]:
help(XGBClassifier)

In [None]:
model_xgb = xgb.fit(xtrain,ytrain)

In [None]:
pred_xgb = model_xgb.predict(xtest)

In [None]:
pd.crosstab(ytest, pred_xgb)

In [None]:
158/(158+49)

In [None]:
# Look at the first three rows of the raw frame again.
df.head(3)

In [None]:
# Data preparation techniques

In [None]:
# Data transformation --> Standardisation (xi-mean(x))/std(x), Normalisation (xi-min(x))/(max(x)-min(x))

In [None]:
df['EstimatedSalary'].min()

In [None]:
df['EstimatedSalary'].max()

In [None]:
from sklearn.preprocessing import StandardScaler, scale, Normalizer

In [None]:
# Center and scale

In [None]:
sc = StandardScaler()

In [None]:
est=sc.fit_transform(df['EstimatedSalary'].values.reshape(-1,1))

In [None]:
df['EstimatedSalary'][:3]

In [None]:
(101348.88-df['EstimatedSalary'].mean())/df['EstimatedSalary'].std()

In [None]:
nm = Normalizer()

In [None]:
norm=nm.fit_transform(df['EstimatedSalary'].values.reshape(-1,1))