## Standard libraries for data analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("Churn_Modelling.csv")

In [None]:
df.head()

## EDA Pre-processing

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
# Bar chart of the number of missing values in each column.

missing_per_column = df.isnull().sum()
missing_per_column.plot(kind='bar')
plt.title("No Null Values")
plt.xlabel("Attributes")
plt.ylabel("Counts")

In [None]:
# check unique values of Geography column.

df["Geography"].unique()

In [None]:
# Storytelling - Visualization Ratio
# Pie chart: share of active vs. not active customers.

# value_counts() orders its result by frequency, not by value, so each slice
# label is looked up from the index instead of being assumed positionally.
label_for = {0: 'Not Active', 1: 'Active'}
values = df.IsActiveMember.value_counts()
labels = [label_for[flag] for flag in values.index]

fig, ax = plt.subplots(figsize = (4, 3), dpi = 100)
explode = (0, 0.09)  # pull the second slice slightly out of the pie

patches, texts, autotexts = ax.pie(values, labels = labels, autopct = '%1.2f%%', shadow = True,
                                   startangle = 90, explode = explode)

plt.setp(texts, color = 'grey')
plt.setp(autotexts, size = 8, color = 'white')
autotexts[1].set_color('black')
plt.title("Pi-Chart For Active & Not Active Customers")
plt.show()

In [None]:
# Pie chart: share of customers who exited vs. stayed.

# value_counts() orders its result by frequency, not by value, so each slice
# label is looked up from the index instead of being assumed positionally.
label_for = {0: 'Not Exited', 1: 'Exited'}
values = df.Exited.value_counts()
labels = [label_for[flag] for flag in values.index]

fig, ax = plt.subplots(figsize = (4, 3), dpi = 100)
explode = (0, 0.09)  # pull the second slice slightly out of the pie

patches, texts, autotexts = ax.pie(values, labels = labels, autopct = '%1.2f%%', shadow = True,
                                   startangle = 90, explode = explode, colors = ['darkblue','deeppink'])

plt.setp(texts, color = 'grey')
plt.setp(autotexts, size = 8, color = 'white')
plt.title("Pi- Chart For Exited & Non Excited Customer")
autotexts[1].set_color('black')
plt.show()

In [None]:
# Pie chart: share of customers with and without a credit card.

# value_counts() orders its result by frequency, not by value, so each slice
# label is looked up from the index instead of being assumed positionally.
label_for = {0: 'doesnt has CrCard', 1: 'HasCrCard'}
values = df.HasCrCard.value_counts()
labels = [label_for[flag] for flag in values.index]

fig, ax = plt.subplots(figsize = (4, 3), dpi = 100)
explode = (0, 0.09)  # pull the second slice slightly out of the pie

patches, texts, autotexts = ax.pie(values, labels = labels, autopct = '%1.2f%%', shadow = True,
                                   startangle = 90, explode = explode, colors = ['tab:cyan' , 'tab:olive'])

plt.setp(texts, color = 'grey')
plt.setp(autotexts, size = 8, color = 'white')
autotexts[1].set_color('black')
plt.title("Pi-Chart For Customers Having Card or Not")
plt.show()

In [None]:
# Pie chart: gender split of the customers.

# value_counts() orders its result by frequency, so the labels are taken from
# the index of the counts rather than hard-coded in an assumed order.
values = df.Gender.value_counts()
labels = values.index.tolist()

fig, ax = plt.subplots(figsize = (4, 3), dpi = 100)
explode = (0, 0.09)  # pull the second slice slightly out of the pie

patches, texts, autotexts = ax.pie(values, labels = labels, autopct = '%1.2f%%', shadow = True,
                                   startangle = 90, explode = explode, colors = ['teal','mediumvioletred'])

plt.setp(texts, color = 'grey')
plt.setp(autotexts, size = 8, color = 'white')
autotexts[1].set_color('black')
plt.title("Pi-chart for Gender")
plt.show()

In [None]:
# Customer counts per feature value, split by the Exited flag.

features = ['Geography', 'Gender', 'Tenure', 'NumOfProducts', 'HasCrCard', 'IsActiveMember']
fig, ax = plt.subplots(3, 2, figsize = (9, 8))

# The column is passed as x=...: current seaborn releases no longer accept the
# plotted variable as the first positional argument.
# ax.flat walks the 3x2 grid row by row, matching the order of `features`.
for feature, axis in zip(features, ax.flat):
    sns.countplot(x = feature, hue = 'Exited', data = df, ax = axis)

plt.tight_layout()
plt.show()

In [None]:
# Correlation between the numeric features and the target.

plt.figure(figsize=(10,6))
# Restrict to numeric columns: on recent pandas, corr() raises when the frame
# still contains text columns such as Surname, Geography or Gender.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
plt.title(' Correlation Heatmap', fontsize=14)
plt.show()

In [None]:
# Inspect outliers in the numerical column Age with a box plot and the IQR rule.
# Nothing is removed here; the whisker limits are only printed.

plt.figure()
# x= is explicit because the meaning of the first positional argument of
# boxplot differs between seaborn versions.
sns.boxplot(x=df["Age"])
plt.show()

q1, q3 = np.quantile(df["Age"], [0.25, 0.75])

print("Value of Qurtile 1:",q1)
print("Value of Qurtile 3:",q3)

# Whiskers sit 1.5 * IQR beyond the quartiles.
iqr = q3 - q1
uw = q3+(1.5*iqr)
lw = q1-(1.5*iqr)

print("Upper Whishker:",uw)
print("Lower Whishker:",lw)

df.Age.min(), df.Age.max()

In [None]:
# Distribution of Balance within each Geography, before any outlier removal.

fig, ax = plt.subplots(figsize=(5,5))
sns.boxplot(data=df, x="Balance", y="Geography", ax=ax)
ax.set_title("Box plot for Geography with outliers")
plt.show()

In [None]:
# For every Geography, drop the rows whose Balance lies above the upper
# whisker (Q3 + 1.5 * IQR) computed within that Geography.
for country in df["Geography"].unique():
    balances = df.loc[df["Geography"] == country, "Balance"]
    lower_q = np.quantile(balances, 0.25)
    upper_q = np.quantile(balances, 0.75)
    upper_whisker = upper_q + (1.5 * (upper_q - lower_q))

    outlier_rows = balances[balances > upper_whisker].index
    if len(outlier_rows) > 0:
        df.drop(outlier_rows, inplace=True)

In [None]:
#Box plot after removing outliers

plt.figure(figsize=(5,5))
sns.boxplot(data=df, x="Balance",y="Geography")
plt.title("Box plot after removing outliers")
plt.show()

In [None]:
# Separate the numeric (int64/float64) columns from the text (object) columns.

df_num = df.select_dtypes(include=["int64", "float64"])
df_cat = df.select_dtypes(include=["object"])

In [None]:
df_num.head()

In [None]:
df_cat.head()

In [None]:
# One horizontal count plot per categorical column.

for cat_col in df_cat.columns:
    plt.figure()
    sns.countplot(data=df_cat, y=cat_col)
    plt.show()

In [None]:
# Handling Categorical Data
# 1. One-hot encoding: get_dummies expands Geography and Gender into indicator
#    columns. The result is only displayed; it is not assigned back to df.

pd.get_dummies(data=df, columns=['Geography', 'Gender'])

In [None]:
# One-hot encoding of the Gender column only (first rows shown).

pd.get_dummies(df_cat["Gender"]).head()

In [None]:
# 2. Label Encoding

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(df_cat["Gender"])

In [None]:
# Label-encode the text columns of df in place.
# Only object-dtype columns are encoded: LabelEncoder replaces every value with
# an integer category code, which would throw away the magnitudes of numeric
# features such as Balance or EstimatedSalary.

for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
# Categorical data

df_cat.head()

In [None]:
# Numerical data

df_num.head()

In [None]:
# Handling skewness: plot the distribution of every numeric column and print its skewness.

from scipy.stats import skew
for col in df_num:
    plt.figure()
    # histplot with a KDE overlay replaces the deprecated sns.distplot.
    sns.histplot(df_num[col], kde=True, stat="density")
    plt.show()

    print("skewness: ",skew(df_num[col]))

In [None]:
# Square-root transform of Balance, EstimatedSalary and Age (in place in df_num).

for skewed_col in ("Balance", "EstimatedSalary", "Age"):
    df_num[skewed_col] = np.sqrt(df_num[skewed_col])

In [None]:
# Skewness of the three transformed columns, one value per line.
for transformed_col in ("Balance", "EstimatedSalary", "Age"):
    print(skew(df_num[transformed_col]))

In [None]:
# scaling

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# a) Standard Scaler: every column of df_num is standardised on its own.

for num_col in df_num.columns:
    df_num[num_col] = StandardScaler().fit_transform(df_num[[num_col]])

df_num.head()

In [None]:
# b) MinMax Scaler: every column of df_num is rescaled on its own.

for num_col in df_num.columns:
    df_num[num_col] = MinMaxScaler().fit_transform(df_num[[num_col]])

df_num.head()

In [None]:
# Put the categorical and numerical frames side by side, joined column-wise.

df_new = pd.concat((df_cat, df_num), axis="columns")
df_new.head()

In [None]:
# Drop the identifier columns (RowNumber, CustomerId, Surname); the reduced
# frame is stored in df1 and df itself is left unchanged.

df1 = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
df1.head()

## MODEL SELECTION

##  Classification Algorithm

In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix and the target, then hold out 30% of the rows for testing.

# Identifier columns the notebook already treats as being of no use; they are
# kept out of the feature matrix.
id_columns = ["RowNumber", "CustomerId", "Surname"]

# Encode only the remaining text columns. Label-encoding numeric columns would
# replace their values with category codes and lose the magnitudes.
text_columns = df.select_dtypes(include="object").columns.difference(id_columns)
for col in text_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# errors="ignore" keeps the cell runnable if an id column was dropped earlier.
X = df.drop(columns=id_columns + ["Exited"], errors="ignore")
y = df["Exited"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Fit Gaussian Naive Bayes on the training set.

model1 = GaussianNB()
model1.fit(X_train, y_train)


# Training score. Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print("Training score:", model1.score(X_train, y_train))


# Predict on the test set and report per-class metrics.
y_pred = model1.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix on the test set, printed as
#   TP FP
#   FN TN

tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
print(f"{tp} {fp}\n{fn} {tn}")

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression


# Fit logistic regression on the training set.
model2 = LogisticRegression(random_state=1)
model2.fit(X_train,y_train)


# Training score. Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print("Training score:", model2.score(X_train, y_train))


# Predict on the test set and report per-class metrics.
y_pred = model2.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix of the logistic regression on the training set.
# plot_confusion_matrix has been removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement.

from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model2, X_train, y_train, cmap = 'PiYG')
plt.title("Confusion Matrix for trainning")
plt.show()

In [None]:
# Confusion matrix on the test set, printed as
#   TP FP
#   FN TN

tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
print(f"{tp} {fp}\n{fn} {tn}")

## ROC

In [None]:
## ROC inputs

# Predicted probability of the last class reported by model2, one value per test row.
prob = model2.predict_proba(X_test)[:,-1]

# Decision thresholds to evaluate, and the lists that collect the rate obtained at each one.
thresholds = [0.5,0.4,0.3,0.2,0.1]
tprs = []
fprs = []

In [None]:
# True/false positive rate at each threshold.
# The lists are reset here so that re-running this cell does not append
# duplicate points to the curve.
tprs = []
fprs = []

for th in thresholds:
    y_pred = np.where(prob >= th, 1, 0)
    tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()

    tprs.append(tp/(tp + fn))  # true-positive rate (recall)
    fprs.append(fp/(fp + tn))  # false-positive rate

In [None]:
# ROC graph: TPR against FPR at the evaluated thresholds.

fig, ax = plt.subplots()
ax.plot(fprs, tprs, "x--")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR/Recall")
ax.scatter(fprs, tprs)
ax.set_title("ROC Graph")
plt.show()

In [None]:
# Final model: predict 1 when the predicted probability is at least 0.1.

y_pred = (prob >= 0.1).astype(int)
print(classification_report(y_test,y_pred))

In [None]:
# Area under the ROC curve.
# Scored on the predicted probabilities: hard 0/1 predictions describe a single
# threshold only, not the whole curve.

from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, prob)

## KNN Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Standardise every feature before KNN. The scaler is fitted on the training
# column and then applied to both the training and the test column.

for feature in X.columns:
    scaler = StandardScaler().fit(X_train[[feature]])
    X_train[feature] = scaler.transform(X_train[[feature]])
    X_test[feature] = scaler.transform(X_test[[feature]])

In [None]:
neighbors = [3,7,9,11,13,15,17]

# One (train score, test score) pair per candidate number of neighbours.
score_pairs = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    score_pairs.append((knn.score(X_train, y_train), knn.score(X_test, y_test)))

train_scores = [pair[0] for pair in score_pairs]
test_scores = [pair[1] for pair in score_pairs]

In [None]:
# KNN graph: train and test score for each candidate number of neighbours.

plt.figure()
# Plot against the actual k values (not the list position) and label both curves.
plt.plot(neighbors, train_scores, color="red", label="train")
plt.plot(neighbors, test_scores, color="blue", label="test")
plt.xlabel("n_neighbors")
plt.ylabel("score")
plt.legend()
plt.title("KNN Graph")
plt.show()

In [None]:
# Final KNN classifier with 5 neighbours, fitted on the training set.

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Per-class metrics on the test set.
y_pred = knn.predict(X_test)
print(classification_report(y_test,y_pred))

## DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [None]:
# Decision tree classifier with the default split criterion (gini).

model3 = DecisionTreeClassifier()
model3.fit(X_train,y_train)


# Training score. Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print("Training score:", model3.score(X_train,y_train))


# Predict on the test set and report per-class metrics.
y_pred = model3.predict(X_test)
print(classification_report(y_test,y_pred))


In [None]:
model3.get_depth()

In [None]:
# Decision tree classifier using the entropy split criterion.

model4 = DecisionTreeClassifier(criterion="entropy")
model4.fit(X_train,y_train)


# Training score. Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print("Training score:", model4.score(X_train,y_train))


# Predict on the test set and report per-class metrics.
y_pred = model4.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
# 1. Constrain the tree with max_depth=18.

model5 = DecisionTreeClassifier(max_depth=18).fit(X_train, y_train)

# Per-class metrics on the test set.
y_pred = model5.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
# 2. Constrain the tree with min_samples_leaf=5 (rebinds model5).

model5 = DecisionTreeClassifier(min_samples_leaf=5).fit(X_train, y_train)

# Per-class metrics on the test set.
y_pred = model5.predict(X_test)
print(classification_report(y_test,y_pred))


## Decision Tree Regressor

In [None]:
from sklearn.model_selection import cross_val_score 

# 4-fold cross-validation of a decision tree regressor on the full X / y,
# scored with negative mean squared error.
cv = cross_val_score(DecisionTreeRegressor(), X, y, cv=4, scoring='neg_mean_squared_error')
print(cv)

# Fit a fresh regressor on the training split and predict the test split.
model6 = DecisionTreeRegressor()
model6.fit(X_train, y_train)
y_pred = model6.predict(X_test)

# Test-set error: MSE and its square root.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("MSE:-", mse)
print("RMSE:-", rmse)

## Random over sampling

In [None]:
# Randomly over-sample the training split, then fit a decision tree on it.
rs = RandomOverSampler(random_state=1)
X_train_rs, y_train_rs = rs.fit_resample(X_train,y_train)

# Class counts after resampling. Printed explicitly: a bare expression in the
# middle of a cell is evaluated but never displayed.
print(y_train_rs.value_counts())

# Training
model7 = DecisionTreeClassifier()
model7.fit(X_train_rs, y_train_rs)


# Prediction on the untouched test split
y_pred = model7.predict(X_test)
print(classification_report(y_test,y_pred))


In [None]:
# get max depth of model

model7.get_depth()

In [None]:
# Decision tree with max_depth=20, trained on the over-sampled training data.
model8 = DecisionTreeClassifier(max_depth=20).fit(X_train_rs, y_train_rs)

# Per-class metrics on the test set.
y_pred = model8.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
# Decision tree with min_samples_leaf=9, trained on the over-sampled training data.
model9 = DecisionTreeClassifier(min_samples_leaf=9).fit(X_train_rs, y_train_rs)

# Per-class metrics on the test set.
y_pred = model9.predict(X_test)
print(classification_report(y_test,y_pred))

## Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the over-sampled training data.
# random_state is fixed, like the other ensembles in this notebook, so the
# reported metrics are reproducible between runs.
rf=RandomForestClassifier(n_estimators=70, max_depth=10, max_features=5, random_state=1)
rf.fit(X_train_rs, y_train_rs)

# Per-class metrics on the test set.
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))

## BOOSTING

## Adaptive Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 estimators, fitted on the training set.
model10 = AdaBoostClassifier(n_estimators=100, random_state=1)
model10.fit(X_train,y_train)

# Training score. Printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
print("Training score:", model10.score(X_train, y_train))

# Per-class metrics on the test set.
y_pred = model10.predict(X_test)
print(classification_report(y_test,y_pred))

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 200 estimators, learning rate 0.5, fitted on the training set.
model11 = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.5,
    random_state=1,
).fit(X_train, y_train)

# Per-class metrics on the test set.
y_pred = model11.predict(X_test)
print(classification_report(y_test,y_pred))

## XG Boost

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with 100 estimators, fitted on the training set.
xgb = XGBClassifier(random_state=1, n_estimators=100)
xgb.fit(X_train, y_train)

# Per-class metrics on the test set.
test_predictions = xgb.predict(X_test)
y_pred = test_predictions
print(classification_report(y_test,y_pred))

## Hyper parameter tuning using GridSearchCV

In [None]:
# Candidate hyper-parameter values for the grid search:
# 4 x 3 x 2 x 3 = 72 combinations.
param_grid = {"n_estimators":[75,100,125,150],
              "learning_rate":[0.1,0.2,0.3],
              "min_samples_leaf":[10,20],
              "max_depth":[2,3,4]}

In [None]:
# model
model13 = GradientBoostingClassifier(random_state=1)

In [None]:
# Grid Search CV
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid with 2-fold cross-validation on the training set.
gs = GridSearchCV(estimator=model13, param_grid=param_grid, cv=2)
gs.fit(X_train, y_train)

# Best parameter combination found (displayed as the cell result).
gs.best_params_

In [None]:
# Final model: gradient boosting refitted with the hyper-parameters selected
# by the grid search. They are read from gs.best_params_ rather than copied by
# hand, so every searched parameter (including max_depth) is applied and the
# model stays in sync when the data or the grid changes.
final_model = GradientBoostingClassifier(random_state=1, **gs.best_params_)
final_model.fit(X_train,y_train)


# Per-class metrics on the test set.
y_pred = final_model.predict(X_test)
print(classification_report(y_test,y_pred))