
## Bank Customer Churn Prediction

In this kernel I perform an Exploratory Data Analysis (EDA) on this dataset, then build several predictive models and identify the one with the highest prediction accuracy.


**Kernel Outlines:**

*     Importing Necessary Packages
*     Statistical Summary of the Dataset
*     Dropping Irrelevant Features
*     One Hot Encoding
*     Data Visualization
*     Detecting Outliers using Tukey Boxplot
*     Hand written function for detecting and removing outliers
*     Checking Correlation with Heatmap
*     Different ML predictive models
*         Gaussian Naive Bayes
*         Logistic Regression
*         Decision Tree
*         Random Forest
*         Extreme Gradient Boosting (XGBoost)
*     Improve the Predictive Model
*         Feature Scaling
*         Over Sampling

#### Importing Necessary Packages



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(palette="Set2")

from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, f1_score,average_precision_score, confusion_matrix,
                             average_precision_score, precision_score, recall_score, roc_auc_score, )
from mlxtend.plotting import plot_confusion_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler


from xgboost import XGBClassifier, plot_importance
from imblearn.over_sampling import SMOTE
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the churn-modelling CSV from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv("/kaggle/input/deep-learning-az-ann/Churn_Modelling.csv")

In [None]:
# Preview the first rows of the raw data.
dataset.head()

In [None]:
# (rows, columns) of the loaded DataFrame.
dataset.shape

In [None]:
# Count plot of the number of rows per Gender value.
sns.countplot(x="Gender", data=dataset)

In [None]:
# Descriptive statistics of the dataset's columns.
dataset.describe()

In [None]:
# Check each column's datatype and non-null count.
dataset.info()


#### Dropping Irrelevant Features

`RowNumber`, `CustomerId` and `Surname` are irrelevant for prediction, so we drop those features.


In [None]:
# Remove the three identifier-like columns from `dataset` in place.
dataset.drop(columns=["RowNumber", "CustomerId", "Surname"], inplace=True)

In [None]:
# Three side-by-side count plots, each splitting its bars by the Exited flag.
_, ax = plt.subplots(1, 3, figsize=(18, 6))
plt.subplots_adjust(wspace=0.3)
count_kwargs = dict(hue="Exited", data=dataset)
sns.countplot(x="NumOfProducts", ax=ax[0], **count_kwargs)
sns.countplot(x="HasCrCard", ax=ax[1], **count_kwargs)
sns.countplot(x="IsActiveMember", ax=ax[2], **count_kwargs)



**Customers with 3 or 4 products have a higher chance of churning.**

In [None]:
# Age distribution per category value, coloured by the Exited flag, for three features.
_, ax = plt.subplots(1, 3, figsize=(18, 6))
plt.subplots_adjust(wspace=0.3)
swarm_kwargs = dict(y="Age", hue="Exited", data=dataset)
sns.swarmplot(x="NumOfProducts", ax=ax[0], **swarm_kwargs)
sns.swarmplot(x="HasCrCard", ax=ax[1], **swarm_kwargs)
sns.swarmplot(x="IsActiveMember", ax=ax[2], **swarm_kwargs)

In [None]:
# Replace the Geography and Gender columns with integer codes.
# The same encoder is refit for each column, so afterwards it holds the
# mapping of the last column processed (Gender).
encoder = LabelEncoder()
for column in ("Geography", "Gender"):
    dataset[column] = encoder.fit_transform(dataset[column])

In [None]:
# Bar chart of the number of rows per distinct Age value.
dataset["Age"].value_counts().plot.bar(figsize=(20,6))

In [None]:
# Age density curves, one per Exited value, drawn on a shared wide axis.
age_max = dataset["Age"].max()
facet = sns.FacetGrid(dataset, hue="Exited", aspect=3)
facet.map(sns.kdeplot, "Age", shade=True)
facet.set(xlim=(0, age_max))
facet.add_legend()

plt.show()

In [None]:
# Age against Balance and against CreditScore, points coloured by the Exited flag.
_, ax = plt.subplots(1, 2, figsize=(15, 7))
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
scatter_kwargs = dict(x="Age", hue="Exited", cmap=cmap, sizes=(10, 200), data=dataset)
sns.scatterplot(y="Balance", ax=ax[0], **scatter_kwargs)
sns.scatterplot(y="CreditScore", ax=ax[1], **scatter_kwargs)




*         **Customers aged 40 to 70 have a higher chance of churning**
*         **Customers with a CreditScore less than 400 have a higher chance of churning**



In [None]:
# Larger single-panel view of Age per HasCrCard value, coloured by the Exited flag.
plt.figure(figsize=(8,8))
sns.swarmplot(x="HasCrCard",y = "Age", data=dataset, hue="Exited")

In [None]:
# Balance density curves, one per Exited value, drawn on a shared wide axis.
balance_max = dataset["Balance"].max()
facet = sns.FacetGrid(dataset, hue="Exited", aspect=3)
facet.map(sns.kdeplot, "Balance", shade=True)
facet.set(xlim=(0, balance_max))
facet.add_legend()

plt.show()

In [None]:
# Balance against Age and against CreditScore, points coloured by the Exited flag.
_, ax = plt.subplots(1, 2, figsize=(15, 6))
balance_kwargs = dict(x="Balance", data=dataset, hue="Exited")
sns.scatterplot(y="Age", ax=ax[0], **balance_kwargs)
sns.scatterplot(y="CreditScore", ax=ax[1], **balance_kwargs)



In [None]:
# CreditScore density curves, one per Exited value, drawn on a shared wide axis.
score_max = dataset["CreditScore"].max()
facet = sns.FacetGrid(dataset, hue="Exited", aspect=3)
facet.map(sns.kdeplot, "CreditScore", shade=True)
facet.set(xlim=(0, score_max))
facet.add_legend()

plt.show()

#### Detecting Outliers using Tukey Boxplot

In [None]:
# Box plot of the DataFrame's columns on one figure, used to eyeball outliers.
plt.figure(figsize=(12,6))
bplot = dataset.boxplot(patch_artist=True)
# Rotate the column labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

### Checking Correlation

In [None]:
# Annotated heatmap of the pairwise correlations between the dataset's columns.
plt.subplots(figsize=(11,8))
sns.heatmap(dataset.corr(), annot=True, cmap="RdYlBu")
plt.show()

### Prediction with ML models:

In [None]:
# Separate the target column (Exited) from the feature matrix.
y = dataset["Exited"]
X = dataset.drop(columns="Exited")


In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Gaussian Naive Bayes: fit on the training split and score on the held-out split.
clf = GaussianNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
accuracy_score(y_test, pred)

In [None]:
# Logistic regression with default settings: fit on the training split, score on the held-out split.
clf = LogisticRegression()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
accuracy_score(y_test, pred)

In [None]:
# Pass x and y by keyword: recent seaborn releases reject them as positional arguments.
sns.regplot(x=pred, y=y_test)

In [None]:
# Decision tree with default settings: fit on the training split, score on the held-out split.
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
accuracy_score(y_test, pred)

In [None]:
# Pass x and y by keyword: recent seaborn releases reject them as positional arguments.
sns.regplot(x=pred, y=y_test)

In [None]:
# Random forest of 200 trees with a fixed seed: fit on the training split, score on the held-out split.
clf = RandomForestClassifier(n_estimators=200, random_state=200)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
accuracy_score(y_test, pred)


In [None]:
# Pass x and y by keyword: recent seaborn releases reject them as positional arguments.
sns.regplot(x=pred, y=y_test, color="red")

In [None]:
# XGBoost classifier with hand-picked hyper-parameters and a fixed seed.
clf = XGBClassifier(
    max_depth=10,
    random_state=10,
    n_estimators=220,
    eval_metric='auc',
    min_child_weight=3,
    colsample_bytree=0.75,
    subsample=0.9,
)

clf.fit(X_train, y_train)
pred = clf.predict(X_test)
# accuracy_score's signature is (y_true, y_pred); pass the arguments in that order.
accuracy_score(y_test, pred)

In [None]:
# Pass x and y by keyword: recent seaborn releases reject them as positional arguments.
sns.regplot(x=pred, y=y_test, color="red")

In [None]:
# Rescale the wide-ranging numeric columns with a MinMaxScaler (default settings).
# NOTE(review): the scaler is fitted on all rows, i.e. before any train/test
# split is made from the scaled data — confirm this is acceptable.
scaler = MinMaxScaler()

bumpy_features = ["CreditScore", "Age", "Balance", "EstimatedSalary"]

# Work on an explicit copy: pd.DataFrame(data=X) may share X's underlying data
# (depending on the pandas version), in which case assigning the scaled
# columns would silently rewrite X as well.
df_scaled = X.copy()
df_scaled[bumpy_features] = scaler.fit_transform(X[bumpy_features])

In [None]:
# Preview the first rows after scaling.
df_scaled.head()

In [None]:
X = df_scaled

# Split first, then over-sample only the training portion. Running SMOTE on the
# full data before splitting puts synthetic points interpolated from test rows
# into the training set (and synthetic rows into the test set), which inflates
# every metric measured afterwards.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

sm = SMOTE(random_state=42)
# fit_resample is the current imbalanced-learn API; fit_sample has been removed.
X_res, y_res = sm.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res


In [None]:
# Train XGBoost on the current training split and report held-out metrics.
clf = XGBClassifier(max_depth = 12,random_state=7, n_estimators=100, eval_metric = 'auc', min_child_weight = 3,
                    colsample_bytree = 0.75, subsample= 0.8)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
# Average precision summarises the precision-recall curve, so it needs a
# continuous score (probability of the positive class), not the hard 0/1 labels.
y_score = clf.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("Area under precision (AUC) Recall:", average_precision_score(y_test, y_score))

In [None]:
 # Confusion matrix of the held-out labels versus the model's predictions.
confusion_matrix(y_test, y_pred)

In [None]:
# Split the data into train, dev and test parts.
from sklearn.model_selection import train_test_split
y=dataset.Exited # target column, taken out before it is dropped from the features
X=dataset.drop(['Exited'],axis='columns')
# First split: 30% of the rows go to the dev set. shuffle=False keeps the rows in
# their existing order.
# NOTE(review): random_state presumably has no effect while shuffle=False — confirm.
X_train, X_Dev, y_train, y_Dev = train_test_split(X,y,test_size=0.3,random_state=0,shuffle=False)
# Second split: 20% of the remaining training rows become the test set.
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.2,random_state=0,shuffle=False)


In [None]:
# [TRAIN] Slice the training features into binary, numerical and categorical groups.
binary_columns = ["HasCrCard", "IsActiveMember"]
numerical_columns = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]
category_columns = ['Geography', 'Gender']

# Each group becomes its own DataFrame, independent of X_train.
binary_df = X_train[binary_columns].copy()
numerical_df = X_train[numerical_columns].copy()
category_df = X_train[category_columns].copy()

In [None]:
# [TRAIN] Turn the categorical columns into indicator (dummy) columns.
for column in ('Geography', 'Gender'):
    # Cast to the pandas category dtype so get_dummies expands the column.
    category_df[column] = category_df[column].astype('category')
category_df_Final = pd.get_dummies(category_df)


In [None]:
# [TRAIN] Standardise the numerical columns.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Training mean and standard deviation, kept for reference.
numerical_df_train_mean = numerical_df.mean()
numerical_df_train_std = numerical_df.std(axis=0)
# Keep the original row index on the scaled frame. Without it the result gets a
# fresh 0..n-1 index, and a later column-wise concat with the other feature
# frames would only line up when the training rows happen to be numbered 0..n-1.
numerical_df_scale = pd.DataFrame(
    scaler.fit_transform(numerical_df),
    columns=numerical_columns,
    index=numerical_df.index,
)


In [None]:
# [TRAIN] Join the scaled numerical, dummy-encoded categorical and binary columns
# side by side (axis=1) into the final training matrix.
X_train = pd.concat([numerical_df_scale, category_df_Final,binary_df], axis=1)

In [None]:
# Is there any null value anywhere in the dataset? Shown together with the dataset's shape.
dataset.isnull().any().any(), dataset.shape

In [None]:
 # Call copy() so that df is an independent DataFrame; without the parentheses
 # df would be bound to the `copy` method itself rather than to a copy of the data.
 df = dataset.copy()

In [None]:
# Split the customers by the Exited flag: 1 = churned, 0 = retained.
churn = dataset[dataset["Exited"] == 1]
retention = dataset[dataset["Exited"] == 0]

# Total number of customers, used as the denominator for both rates.
total = len(dataset)
churn_rate = round(len(churn) / total * 100, 2)
retention_rate = len(retention) / total * 100

print ("Total Churn          :", len(churn))
print ("Total Retention      :", len(retention))
print ("Churn Rate           :", churn_rate, "%")
print ("Retention Rate       :", retention_rate, "%")


In [None]:
## Churn By Gender
# Gender has been label-encoded to integers earlier in the notebook, so a
# comparison with the strings 'Female'/'Male' matches nothing and both shares
# come out as 0%. Compare against whichever representation the column holds.
gender = churn['Gender']
if pd.api.types.is_numeric_dtype(gender):
    # LabelEncoder numbers the classes in sorted order: Female -> 0, Male -> 1
    # (assumes these are the only two Gender values — verify against the data).
    female = churn[gender == 0]
    male = churn[gender == 1]
else:
    female = churn[gender == 'Female']
    male = churn[gender == 'Male']

print ("Female Churn      :", round((float(len(female)) / float(len(churn)))*100, 2), "%")
print ("Male Churn        :", round((float(len(male)) / float(len(churn)))*100, 2), "%")



In [None]:
from imblearn.over_sampling import SMOTE

# Over-sample the minority class with SMOTE so both Exited classes are balanced.
X = dataset.drop("Exited", axis=1)
y = dataset['Exited']
sm = SMOTE(random_state=42)
# fit_resample is the current imbalanced-learn API; fit_sample has been removed.
X_res, y_res = sm.fit_resample(X, y)

In [None]:

# Split the ORIGINAL data first, then over-sample only the training portion.
# Splitting data that was already SMOTE-resampled leaks information: synthetic
# rows interpolated from test customers end up in the training set, and the
# test set itself contains synthetic rows, so the reported metrics are inflated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# The data is over-sampled (SMOTE), not under-sampled; say so in the message.
print("The split of the data (training set over-sampled with SMOTE) is as follows")
print("X_train: ", len(X_train))
print("X_test: ", len(X_test))
print("y_train: ", len(y_train))
print("y_test: ", len(y_test))

In [None]:
import xgboost as xgb

# Hyper-parameters for the final XGBoost classifier, listed one per line.
xgb_params = dict(
    max_depth=12,
    random_state=7,
    n_estimators=100,
    eval_metric='auc',
    min_child_weight=3,
    colsample_bytree=0.75,
    subsample=0.8,
)
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

In [None]:
# Report held-out metrics for the fitted model.
y_pred = model.predict(X_test)
# Average precision summarises the precision-recall curve, so it needs a
# continuous score (probability of the positive class), not the hard 0/1 labels.
y_score = model.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("Area under precision Recall:", average_precision_score(y_test, y_score))


In [None]:
# Confusion matrix of the held-out labels versus the predictions, kept in `cm`.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Draw the confusion matrix with axis labels.
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title("The Confusion Matrix")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

# Unpack the 2x2 matrix row by row: [0,0], [0,1], [1,0], [1,1].
tn, fp, fn, tp = (float(count) for count in cm.ravel())
accuracy_pct = (tp + tn) / (tn + fp + fn + tp) * 100
recall_pct = tp / (fn + tp) * 100
print("The Accuracy is : " + str(accuracy_pct) + "%")
print("The Recall   is : " + str(recall_pct) + "%")