In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os

# Path to the churn CSV. The default is the original local location; set the
# CHURN_CSV environment variable to load the file from somewhere else without
# editing the notebook.
df = pd.read_csv(os.environ.get('CHURN_CSV', 'E:/data/churn.csv'))

In [None]:
df.shape

In [None]:
# Preview the first ten rows of the raw frame.
df.head(10)

In [None]:
# Handling categorical variable

In [None]:
# Feature matrix: every column from position 2 up to, but not including, the
# last column.
# NOTE(review): this presumes the first two columns are identifiers and the
# last column is the target. If position 2 holds a high-cardinality text
# column (e.g. a name), get_dummies in the next step will expand it into one
# column per distinct value -- check df.columns and confirm the start index.
x = df.iloc[:,2:-1]

In [None]:
# Dummy-encode the categorical columns of x. drop_first=True drops the first
# level of each encoded column so the remaining dummies are not redundant.
x1=pd.get_dummies(x, drop_first=True)

In [None]:
x1.shape

In [None]:
# First two rows of the un-encoded feature frame.
x.head(2)

In [None]:
y = df['Exited']

In [None]:
# First two values of the target series.
y.head(2)

In [None]:
# Class Distribution of the target variable

In [None]:
df['Exited'].value_counts()

In [None]:
# Baseline accuracy

In [None]:
# Baseline accuracy = share of the most frequent class, i.e. the accuracy of
# a model that always predicts the majority label. Computed from the data
# instead of typing the counts in by hand, so it stays correct if the
# dataset changes.
df['Exited'].value_counts(normalize=True).max()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x1, y, test_size=0.2, random_state=4
)

In [None]:
xtrain.shape

In [None]:
ytrain.shape

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lg = LogisticRegression(max_iter=10000, solver='liblinear')

In [None]:
lg_model = lg.fit(xtrain,ytrain)

In [None]:
lgpred = lg_model.predict(xtest)

In [None]:
# Column 1 of predict_proba (the second entry of lg_model.classes_) for the
# first 100 test rows.
lg_model.predict_proba(xtest)[:100, 1]

In [None]:
# First fifteen true labels of the test split.
ytest.head(15)

In [None]:
lgpred[:155]

In [None]:
# Confusion matrix

In [None]:
pd.crosstab(ytest,lgpred)

In [None]:
# Specificity (true-negative rate) of the logistic regression on the test
# split: TN / (TN + FP). Computed from the predictions instead of counts
# copied by hand from the crosstab, so it cannot go stale.
((ytest == 0) & (lgpred == 0)).sum() / (ytest == 0).sum()

In [None]:
# Accuracy of the logistic regression on the test split: the fraction of
# test rows whose predicted label equals the true label. Computed from the
# predictions rather than from hand-copied confusion-matrix counts.
acc = (ytest == lgpred).mean()

In [None]:
acc

In [None]:
# Precision of the logistic regression for class 1 on the test split:
# TP / (TP + FP), computed from the predictions instead of hand-copied counts.
((ytest == 1) & (lgpred == 1)).sum() / (lgpred == 1).sum()

In [None]:
# Recall = tp/(tp+fn)

In [None]:
# Recall of the logistic regression for class 1 on the test split:
# TP / (TP + FN), computed from the predictions instead of hand-copied counts.
((ytest == 1) & (lgpred == 1)).sum() / (ytest == 1).sum()

In [None]:
#from sklearn.metrics import confusion_matrix

In [None]:
xtrain[:2]

In [None]:
ytrain[:2]

In [None]:
xtest.shape

In [None]:
# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn import tree

In [None]:
# Decision tree limited to depth 5, with at least 5 samples per leaf and at
# least 5 samples required to split a node. No random_state is passed.
dtree = DecisionTreeClassifier(max_depth=5,min_samples_leaf=5, min_samples_split=5)

In [None]:
dtmodel = dtree.fit(xtrain,ytrain)

In [None]:
plt.figure(figsize=(30,12))
# class_names must be given in ascending order of the fitted labels
# (dtmodel.classes_). Assuming Exited is coded 0/1 with 1 being the positive
# class (as in the precision/recall cells), label 0 is 'neg' and label 1 is
# 'pos'; the previous order labelled the leaves the wrong way round.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this argument.
tree.plot_tree(decision_tree=dtmodel, feature_names=list(x1.columns), class_names=['neg','pos'],
               filled=True, fontsize=12)
plt.show()

In [None]:
# NOTE(review): `n` is not referenced by any other cell visible in this
# notebook -- candidate for removal.
n = np.array(['Pos','Neg'])

In [None]:
dtpred = dtmodel.predict(xtest)

In [None]:
pd.crosstab(ytest,dtpred)

In [None]:
# Hard-coded ratio; presumably precision (TP / (TP + FP)) read off the
# decision-tree crosstab -- TODO confirm against the current output.
196/(196+63)

In [None]:
# Hard-coded ratio; presumably recall (TP / (TP + FN)) read off the
# decision-tree crosstab -- TODO confirm against the current output.
196/(196+200)

In [None]:
# NOTE(review): the counts here (220, 176) differ from the 196 / 200 used in
# the preceding ratio cells, so they presumably come from a different run or
# model -- confirm which model this describes, or remove.
220/(176+220)

In [None]:
# NOTE(review): hard-coded counts (220, 231) with no matching crosstab nearby;
# presumably a precision-style ratio from a different run -- confirm or remove.
220/(220+231)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [None]:
pd.crosstab(ytest,lgpred)

In [None]:
# Sum of the two hard-coded counts used as the recall denominator for the
# logistic regression earlier (25 + 371).
371+25

In [None]:
recall_score(ytest,dtpred)

In [None]:
# Pruning
# grid search --> optimum hyper parameters
# cost complexity pruning --> based on a cost parameter

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

In [None]:
# Hyper-parameter grid explored by the search (4 * 3 * 2 = 24 combinations).
params = {
    'max_depth': [6, 8, 10, 12],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2],
}

# Search over a fresh, unfitted tree. Previously `clf` was created but never
# used and the already-fitted `dtmodel` was passed as the estimator instead.
clf = tree.DecisionTreeClassifier()
gcv = GridSearchCV(estimator=clf, param_grid=params)
gcv.fit(xtrain, ytrain)

In [None]:
gcv.best_score_

In [None]:
gcv.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with library-default settings.
# NOTE(review): no random_state is passed, so results may differ from run to
# run -- consider fixing a seed if the numbers below need to be repeatable.
rf = RandomForestClassifier()

In [None]:
rfmodel = rf.fit(xtrain, ytrain)

In [None]:
rfpred = rfmodel.predict(xtest) 

In [None]:
pd.crosstab(ytest, rfpred)

In [None]:
# Precision of the random forest for class 1 on the test split:
# TP / (TP + FP). Computed from the predictions because the forest is created
# without a seed, so hand-copied counts go stale on every re-run.
((ytest == 1) & (rfpred == 1)).sum() / (rfpred == 1).sum()

In [None]:
# Data preparation

In [None]:
# Data Preparation Techniques
# Data Pre-processing : Sampling, Data Transformation --> Standardisation, Normalisation
# Cross-validation Techniques : K-fold, Stratified K-fold, Repeated K-fold
# Feature Selection Techniques : P-value, Step function, K-best, RFE, AUC/ROC
# Handling Class Imbalances : Under Sampling, Over Sampling, SMOTE
# Feature Extraction Techniques : Curse of Dimensionality : PCA, LDA

In [None]:
# Data Transformation

In [None]:
# Standardisation : (Center & Scale) : (xi-mean(x))/std(x)

In [None]:
# First five EstimatedSalary values.
df['EstimatedSalary'].head()

In [None]:
# Standardise the first EstimatedSalary value by hand: (x - mean) / std.
# The value is read from the frame instead of being typed in, and ddof=0
# (population standard deviation) is used because that is what StandardScaler
# uses -- so this reproduces the first row of the scaler output exactly.
salary = df['EstimatedSalary']
(salary.iloc[0] - salary.mean()) / salary.std(ddof=0)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
stdscaler = StandardScaler()

In [None]:
# Standardise EstimatedSalary; the scaler needs a 2-D (n_rows, 1) array.
stdscaler.fit_transform(df[['EstimatedSalary']].to_numpy())

In [None]:
# Normalisation --> (xi-min(x))/(max(x)-min(x)) --> range function

In [None]:
norm=MinMaxScaler()

In [None]:
# Min-max scale EstimatedSalary; the scaler needs a 2-D (n_rows, 1) array.
norm.fit_transform(df[['EstimatedSalary']].to_numpy())

In [None]:
# Standardise every column of x1. This refits `stdscaler`, replacing the
# EstimatedSalary-only fit from the earlier cell.
# NOTE(review): `xscaled` is not used by any later cell visible here; the
# models below are trained on the unscaled x1 / xtrain.
xscaled=stdscaler.fit_transform(x1)

In [None]:
df['EstimatedSalary'].values.reshape(-1,1)

In [None]:
# Feature Selection Techniques

In [None]:
# p-value, step function

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, RFE

In [None]:
lr = LogisticRegression(solver='liblinear')

In [None]:
# Recursive feature elimination driven by `lr`, keeping 5 features; fitted on
# the training split.
rfe_lr = RFE(lr, n_features_to_select=5)
result = rfe_lr.fit(xtrain, ytrain)

In [None]:
result.ranking_

In [None]:
xtrain.columns

In [None]:
# Chi2 with Kbest

In [None]:
# Score every column of x1 against y with the chi-squared test, keeping the
# 5 best.
# NOTE(review): this is fitted on the full data (x1, y), unlike the RFE cell
# above, which uses the training split only. The name `model` is also reused
# as the loop variable in the cross-validation loop further down.
model = SelectKBest(score_func=chi2,k=5)
result_kbest = model.fit(x1,y)

In [None]:
result_kbest.scores_

In [None]:
# One chi2 score per line, rounded to two decimals, in the column order of
# the data the selector was fitted on.
for score in result_kbest.scores_.round(2):
    print(score)

In [None]:
xtrain.columns

In [None]:
# Cross Validation Techniques

In [None]:
# K-fold cv - 3 fold cv

In [None]:
# 10,000

# 1 fold --> 3500
# 2 fold --> 3250
# 3 fold --> 3250

# 1st model : 1st & 2nd chunk (training) and predicting on the 3rd chunk       --> 85%
# 2nd model : 1st & 3rd chunk (training) and predicting on the 2nd chunk       --> 75%
# 3rd model : 2nd & 3rd chunk (training) and predicting on the 1st chunk       --> 80%
# Final score : average of all the models' accuracy

In [None]:
# Evaluation of Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [None]:
models = []

In [None]:
# Candidate classifiers to compare, as (label, estimator) pairs. Built as a
# single list literal so that re-running this cell replaces the list instead
# of appending a second copy of every model to it.
models = [
    ("LR", LogisticRegression(solver='liblinear',max_iter=1000)),
    ("Tree", DecisionTreeClassifier(max_depth=5,min_samples_leaf=1, min_samples_split=3)),
    ("SVM", SVC(kernel='sigmoid')),
    ("RF", RandomForestClassifier()),
    ("KNN", KNeighborsClassifier()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("NB", GaussianNB()),
    ("XGB", XGBClassifier()),
]

In [None]:
models

In [None]:
results = []
names = []

In [None]:
from sklearn.model_selection import KFold , cross_val_score, GridSearchCV

In [None]:
import warnings
warnings.ignore=True

In [None]:
# Start from empty lists so that re-running this cell does not append a
# second copy of every model's scores (which would duplicate every box in
# the plot built from `results` / `names`).
results = []
names = []

# The splitter is identical for every model (10 folds, no shuffling), so it
# is built once instead of once per iteration.
kfold = KFold(n_splits=10)

for name,model in models:
    # Per-fold precision of this model over the full encoded data set.
    cv_results = cross_val_score(model, x1,y,cv=kfold, scoring="precision")
    results.append(cv_results)
    names.append(name)

    # Mean and standard deviation across folds, shown as percentages.
    outcome = "%s: %f (%f)" % (name,cv_results.mean()*100, cv_results.std()*100)
    print(outcome)

In [None]:
results

In [None]:
# Box plot of the per-fold scores, one box per model, labelled by model name.
fig, axis = plt.subplots()
axis.boxplot(results)
axis.set_xticklabels(names)
plt.show()

In [None]:
# Class Imbalance

# Under Sampling : Remove the extra observations from the majority class in order to match the number of records in the minority class
# Over Sampling : Create duplicate records for the minority class in order to match the majority class

# SMOTE [Synthetic Minority Over-sampling Technique] : Over sampling by interpolating between nearby minority-class samples and creating new records out of those samples

In [None]:
# Rows whose Exited value equals 1.
df.loc[df['Exited'].eq(1)]

In [None]:
# The import statement was pasted twice on one line, which is a SyntaxError.
from imblearn.over_sampling import SMOTE

In [None]:
#!pip install django-utils-six

In [None]:
#from sklearn.externals import six

In [None]:
s = SMOTE()

In [None]:
# Oversample the minority class in the training split only.
# - fit_resample is the current imbalanced-learn API; fit_sample was removed.
# - xtrain / ytrain are the names produced by train_test_split earlier in
#   this notebook; x_train / y_train are never defined.
x_train_smote,y_train_smote = s.fit_resample(xtrain,ytrain)

In [None]:
from collections import Counter

In [None]:
# Class counts of the training labels before and after oversampling.
# Uses ytrain (the name created by train_test_split); y_train is never
# defined in this notebook.
print("before SMOTE:", Counter(ytrain))
print("After SMOTE:", Counter(y_train_smote))

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with library-default settings; rebinds `lr`, which the
# RFE section also defined.
# NOTE(review): earlier cells pass solver='liblinear' explicitly -- confirm
# the default solver is intended here.
lr = LogisticRegression()

In [None]:
lr.fit(x_train_smote,y_train_smote)

In [None]:
# Predict on the untouched test split. Uses xtest (the name created by
# train_test_split); x_test is never defined in this notebook.
pred_glm = lr.predict(xtest)

In [None]:
# Confusion matrix of the SMOTE-trained logistic regression on the test
# split. Uses ytest (the name created by train_test_split); y_test is never
# defined in this notebook.
confusion_matrix(ytest,pred_glm)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier()

In [None]:
dtree.fit(x_train_smote,y_train_smote)

In [None]:
# Predict on the untouched test split. Uses xtest (the name created by
# train_test_split); x_test is never defined in this notebook.
pred_tree = dtree.predict(xtest)

In [None]:
# Confusion matrix of the SMOTE-trained decision tree on the test split.
# Uses ytest (the name created by train_test_split); y_test is never defined
# in this notebook.
confusion_matrix(ytest,pred_tree)

In [None]:
# Summary of Supervised Learning

In [None]:
# Predictive model : Learning from the statistical relationship b/w y & all the x vars
# Train & Test samples : Divide the data randomly in such a way that both train & test samples represent the population data

In [None]:
# Linear models
# Sklearn & Statsmodels
# Linear Regression : OLS[Line of best fit], Fitted values, Actual Values, Slope co-efficient, Intercept, R2 and Adjusted R2
# Cost function
# Assumptions : all x vars should have a linear relationship with the Y var, Avoid multi-collinearity, Residuals should be
# normally distributed and independent
# Overfitted (Less bias) : Regularisation


# Non-linear models : Exponential, Reciprocal, Squared, Square root, Log, SVM, Decision Trees
# Ensemble : Random Forest. XGBoost

# Decision Tree Regressor : Split Criteria (MSE), Arguments to control the growth of the tree (max_depth, max_leaf_nodes, min_samples_leaf)
# Pruning based on cost complexity

# Handling outliers
# Dummy variables (categorical x vars)

# Logistic Regression : Maximum Likelihood, Log Loss, Probabilistic model, AUC/ROCR
 
# Cost function : Function which quantifies the error b/w actual and fitted values

# Decision Tree Classifier : Split Criteria (Gini, or entropy and information gain, or Chi2)

# Data Preparation & Evaluation