# Importing the libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input mount, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Dataset

In [None]:
bank_data=pd.read_csv("/kaggle/input/predicting-churn-for-bank-customers/Churn_Modelling.csv")

In [None]:
bank_data.head()

# Entire EDA at a single go by using Pandas-Profiling 

In [None]:
pip install pandas-profiling

In [None]:
bank_data.columns

In [None]:
from pandas_profiling import ProfileReport
report=ProfileReport(bank_data,title="profile report")

In [None]:
report.to_widgets()

In [None]:
report.to_file("report.html")

In [None]:
pip install sweetviz

In [None]:
import sweetviz

In [None]:
my_report = sweetviz.analyze([bank_data, "data"],target_feat='Exited')

In [None]:
my_report.show_html('my_report.html')

We note the following.
* No Missing values. 
* Number of variables is 14.
* Number of observations is 10000
* Numerical variables are 7, categorical variables are 4, and boolean variables are 3.
* Number of missing values is zero for all variables.


We also note the following.
* Row number is not required for our analysis.
* Customer Id is not required
* Surname has high cardinality, so I will remove it from the analysis.
* Credit score is slightly left-skewed and almost normally distributed.
* Geography has 3 categories. Germany,Spain, France. so we need to encode this 
* Gender is categorical and has two categories and we need to encode this.
* Age is slightly right skewed.
* Tenure has zeros, but no operation is needed there.
* Balance has zeros but no need of any operation there.
* Number of products is categorical.
* Has credit card is boolean which will explain whether the customer has credit card or not.
* Is Active member is boolean and it says that the customer is an active member or not.
* Estimated Salary is having its importance in the analysis.
* Exited is boolean which is our target variable and we have whether the customer is exited or not.

* No Strong correlation can be seen on either sides.

In [None]:
genderwise=bank_data.groupby('Gender')['Exited'].sum()
genderwise

In [None]:
# Slice sizes and slice labels both come from the same groupby result, so a label
# can never end up on the wrong slice if the category order changes.
y=genderwise.tolist()
label=genderwise.index.tolist()

In [None]:
plt.pie(y, labels=label,shadow=True,
   counterclock=False, startangle=90,autopct='%1.f%%',
       colors=['blue','orange'])
plt.pie([1],colors=['white'],radius=.4)
plt.show()

* Female is churning more.

In [None]:
geographywise=bank_data.groupby('Geography')['Exited'].sum()
geographywise

In [None]:
# Slice sizes and slice labels both come from the same groupby result, so a label
# can never end up on the wrong slice if the category order changes.
y=geographywise.tolist()
label=geographywise.index.tolist()

In [None]:
plt.pie(y, labels=label,shadow=True,
   counterclock=False, startangle=90,autopct='%1.f%%',
       colors=['blue','orange','green'])
plt.pie([1],colors=['white'],radius=.4)
plt.show()

* Germany the most churned.

In [None]:
nofproductswise=bank_data.groupby('NumOfProducts')['Exited'].sum()
nofproductswise

In [None]:
# Slice sizes and slice labels both come from the same groupby result; the product
# counts are converted to strings so they can be used as pie labels.
y=nofproductswise.tolist()
label=[str(n) for n in nofproductswise.index]

In [None]:
plt.pie(y, labels=label,shadow=True,
   counterclock=False, startangle=90,autopct='%1.f%%',
       colors=['orange','indigo','blue','green'])
plt.pie([1],colors=['white'],radius=.4)
plt.show()

* customer using only one product are the most churned

In [None]:
hascreditwise=bank_data.groupby('HasCrCard')['Exited'].sum()
hascreditwise

In [None]:
plt.pie(hascreditwise.tolist(), labels=['0','1'],shadow=True,
   counterclock=False, startangle=90,autopct='%1.f%%',
       colors=['orange','yellow'])
plt.pie([1],colors=['white'],radius=.4)
plt.show()

* The people who are having credit card churned more.

# We first review for categorical variables

In [None]:
sns.set(style="darkgrid")
sns.countplot(x='Geography', hue = 'Exited',data = bank_data, palette="Set3")

* Majority of the data is from persons from France. 
* The proportion of churned customers is inversely related to the number of customers in each country, which suggests the bank may have a problem in the areas where it has fewer clients.

In [None]:
sns.set(style="darkgrid")
sns.countplot(x='Gender', hue = 'Exited',data = bank_data, palette="Set1")

* The proportion of female customers churning is also greater than that of male customers

In [None]:
sns.set(style="darkgrid")
sns.countplot(x='HasCrCard', hue = 'Exited',data = bank_data, palette="Set2")

* Majority of the customers that churned are those with credit cards.

In [None]:
sns.set(style="darkgrid")
sns.countplot(x='IsActiveMember', hue = 'Exited',data = bank_data, palette="Set3")

* Unsurprisingly, inactive members churn more. Worryingly, the overall proportion of inactive members is quite high, suggesting that the bank may need a program to turn this group into active customers, as this would definitely have a positive impact on customer churn.

# Relations based on the continuous data attributes

In [None]:
sns.set(style="darkgrid")
sns.boxplot(y='CreditScore',x = 'Exited', hue = 'Exited',data = bank_data,palette="Set3",dodge=False)

* There is not much difference in the credit score distribution between retained and churned customers.

In [None]:
sns.set(style="darkgrid")
sns.boxplot(y='Age',x = 'Exited', hue = 'Exited',data = bank_data,dodge=False)

* Older customers are churning more than younger ones, hinting at a difference in service preference between age categories. The bank may need to review its target market or its retention strategy for the different age groups.

In [None]:
# One panel per Exited value, each showing the distribution of Age with 25 bins,
# so retained and churned customers can be compared side by side.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the installed seaborn version is confirmed.
g = sns.FacetGrid(bank_data, col = "Exited")
g.map(sns.distplot, "Age", bins = 25)
plt.show()

* It seems younger customers tend to stick with the company more compared to older customers.

In [None]:
sns.set(style="darkgrid")
sns.boxplot(y='Tenure',x = 'Exited', hue = 'Exited',data = bank_data,palette="Set2",dodge=False)

* With regard to the tenure, the clients on either extreme end (spent little time with the bank or a lot of time with the bank) are more likely to churn compared to those that are of average tenure.

In [None]:
sns.set(style="darkgrid")
sns.boxplot(y='Balance',x = 'Exited', hue = 'Exited',data = bank_data,palette="Set1",dodge=False)

* The bank is losing customers with significant bank balances which is likely to hit their available capital for lending.

In [None]:
sns.set(style="darkgrid")
sns.boxplot(y='NumOfProducts',x = 'Exited', hue = 'Exited',data = bank_data,palette="Set3",dodge=False)

* The number of product has no significant effect on the likelihood to churn.

In [None]:
sns.set(style="darkgrid")
sns.boxplot(y='EstimatedSalary',x = 'Exited', hue = 'Exited',data = bank_data,palette="Set1")

* The salary has no significant effect on the likelihood to churn.

# Data Preprocessing

* Target variable count

In [None]:
target_count = bank_data.Exited.value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])

* Lets see the proportion of the imbalance of the classes.

In [None]:
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

target_count.plot(kind='bar', title='Count (target)');

In [None]:
# Class count
count_class_0, count_class_1 = bank_data.Exited.value_counts()

# Divide by class
df_class_0 = bank_data[bank_data['Exited'] == 0]
df_class_1 = bank_data[bank_data['Exited'] == 1]

In [None]:
# Random over-sampling: draw churned rows with replacement until both classes
# have the same count. The fixed seed makes the resampled frame reproducible.
# NOTE(review): the resampling happens before train_test_split, so copies of the
# same churned customer can land in both the train and the test part and inflate
# test scores; consider splitting first and over-sampling the training part only.
df_class_1_over = df_class_1.sample(n=count_class_0, replace=True, random_state=42)
bank_data_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(bank_data_over.Exited.value_counts())

# trailing ';' hides the Axes repr under the chart
bank_data_over.Exited.value_counts().plot(kind='bar', title='Count (target)');

* Row number, Customer ID, Surname are not part of our analysis, So, I dont include them .

In [None]:
# Positional slice of the over-sampled frame: columns 3..12 become the features
# and column 13 becomes the target; the first three columns are skipped.
# NOTE(review): presumably columns 0-2 are RowNumber, CustomerId, Surname and
# column 13 is Exited — this relies on the CSV column order; confirm.
X=bank_data_over.iloc[:,3:13]
y=bank_data_over.iloc[:,13]

# Geography and gender features are categorical, so we need to create dummies for them.

In [None]:
geography=pd.get_dummies(X['Geography'], drop_first= True)
gender=pd.get_dummies(X['Gender'], drop_first= True)

# Lets concatenate the data frames

In [None]:
X=pd.concat([X,geography,gender],axis=1)

* Dropping the unwanted columns now which are Geography and Gender

In [None]:
X=X.drop(['Geography','Gender'],axis=1)


# Lets split the data to train and test

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# Lets do the feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [None]:
X_train

* Lets dive into Machine learning models

* Training my models.

We need to find the algorithm that gives us the best predictive performance. So, we try out various ML algorithms and select the ones that perform best. We then take those as baseline models and proceed with hyperparameter tuning on them to get better accuracy.
#The algorithms we are implementing is as follows
1. Logistic Regression
2. Naive Bayes Model.
3. Decision tree
4. Random forest
5. XG Boost
6. Ada Boost
7. Gradient Boosting.

#So, we will be comparing the accuracies and ROC AUC scores and finalise the baseline models.


# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit the logistic baseline (fit() returns the estimator itself) and predict the hold-out set
lr=LogisticRegression(C=100,random_state=0).fit(X_train,y_train)
y_pred_logistic=lr.predict(X_test)

# Count hits and misses from a single element-wise comparison
matches = (y_test == y_pred_logistic)
correct = matches.sum()
incorrect = (~matches).sum()
accuracy = correct / (correct + incorrect) * 100

print('\nPercent Accuracy: %0.1f' %accuracy)

In [None]:
# Per-row comparison of the true label and the logistic prediction
prediction = y_test.to_frame(name='actual')
prediction['predicted'] = y_pred_logistic
prediction['correct'] = prediction['actual'].eq(prediction['predicted'])

print ('\nDetailed results for first 20 tests:')
print (prediction.head(20))

In [None]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
c_logistic=confusion_matrix(y_test,y_pred_logistic)
print(c_logistic)

#Accuracy of our model.
Accuracy_logistic=sum(np.diag(c_logistic))/(np.sum(c_logistic))
Accuracy_logistic

In [None]:
#Evaluation 
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_logistic))

In [None]:
# ROC_AUC
# Score with the predicted probability of class 1 rather than the hard 0/1 labels:
# thresholded labels collapse the ROC curve to a single operating point.
from sklearn.metrics import roc_auc_score, roc_curve
logistic_roc_auc = roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])
logistic_roc_auc

# Naive Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier_naive=GaussianNB()

# Fitting the model with training data
classifier_naive.fit(X_train, y_train)

# Predicting the Test set results
y_predict_naive=classifier_naive.predict(X_test)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
c_naive=confusion_matrix(y_test,y_predict_naive)
print(c_naive)

#Accuracy of our model.
Accuracy_naive=sum(np.diag(c_naive))/(np.sum(c_naive))
Accuracy_naive

In [None]:
#Evaluation 
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict_naive))

In [None]:
# ROC AUC
# Score with the predicted probability of class 1 rather than the hard 0/1 labels:
# thresholded labels collapse the ROC curve to a single operating point.
naive_roc_auc = roc_auc_score(y_test, classifier_naive.predict_proba(X_test)[:, 1])
naive_roc_auc

# Decision Tree

In [None]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier_tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)

# Fitting the model with training data
classifier_tree=classifier_tree.fit(X_train, y_train)

# Predicting the Test set results
y_predict_tree = classifier_tree.predict(X_test)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
c_tree=confusion_matrix(y_test,y_predict_tree)
print(c_tree)

#Accuracy of our model.
Accuracy_tree=sum(np.diag(c_tree))/(np.sum(c_tree))
Accuracy_tree

In [None]:
#Evaluation 

from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict_tree))

In [None]:
# ROC AUC
# Score with the predicted probability of class 1 rather than the hard 0/1 labels:
# thresholded labels collapse the ROC curve to a single operating point.
tree_roc_auc = roc_auc_score(y_test, classifier_tree.predict_proba(X_test)[:, 1])
tree_roc_auc

In [None]:
# Rank the decision tree's features from most to least important.
features_label = X.columns
importances = classifier_tree.feature_importances_
indices = np.argsort(importances)[::-1]
for i in range(X.shape[1]):
    # look the name up through the sorted index so each name stays paired with its own score
    print ("%2d) %-*s %f" % (i + 1, 30, features_label[indices[i]], importances[indices[i]]))

In [None]:
# Visualization of the Feature importances
plt.title('Feature Importances')
plt.bar(range(X.shape[1]), importances[indices], color = "magenta", align = "center")
# tick labels must follow the same sorted order as the bars
plt.xticks(range(X.shape[1]), features_label[indices], rotation = 90)
plt.show()

# Draw the fitted decision tree itself on a large canvas
from sklearn import tree
from sklearn.tree import export_text
plt.figure(figsize=(60,50))
# trailing ';' hides the long list of annotation objects returned by plot_tree
tree.plot_tree(classifier_tree,filled=True);

# Random forest

In [None]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier_ensemble = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)

# Fitting the model with training data
classifier_ensemble.fit(X_train, y_train)

# Predicting the Test set results
y_predict_ensemble = classifier_ensemble.predict(X_test)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
c_ensemble=confusion_matrix(y_test,y_predict_ensemble)
print(c_ensemble)

#Accuracy of our model.
Accuracy_ensemble=sum(np.diag(c_ensemble))/(np.sum(c_ensemble))
Accuracy_ensemble

In [None]:
#Evaluation 

from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict_ensemble))

In [None]:
# ROC AUC
# Score with the predicted probability of class 1 rather than the hard 0/1 labels:
# thresholded labels collapse the ROC curve to a single operating point.
ensemble_roc_auc = roc_auc_score(y_test, classifier_ensemble.predict_proba(X_test)[:, 1])
ensemble_roc_auc

In [None]:
# Rank the random forest's features from most to least important.
features_label = X.columns
importances = classifier_ensemble.feature_importances_
indices = np.argsort(importances)[::-1]
for i in range(X.shape[1]):
    # look the name up through the sorted index so each name stays paired with its own score
    print ("%2d) %-*s %f" % (i + 1, 30, features_label[indices[i]], importances[indices[i]]))

In [None]:
# Visualization of the Feature importances
plt.title('Feature Importances')
plt.bar(range(X.shape[1]), importances[indices], color = "red", align = "center")
# tick labels must follow the same sorted order as the bars
plt.xticks(range(X.shape[1]), features_label[indices], rotation = 90)
plt.show()

# XG Boost

In [None]:
# %pip installs into the environment of the running kernel, unlike a shell-level !pip
%pip install xgboost

In [None]:
# Fitting the XGBoost to the training set
from xgboost import XGBClassifier
classifier_xg=XGBClassifier()

# Fitting the model with training data
classifier_xg.fit(X_train,y_train)

# Predicting the test results
y_predict_xg= classifier_xg.predict(X_test)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
c_xg=confusion_matrix(y_test,y_predict_xg)
print(c_xg)

#Accuracy of our model.
Accuracy_xg=sum(np.diag(c_xg))/(np.sum(c_xg))
Accuracy_xg

In [None]:
#Evaluation 
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict_xg))

In [None]:
#ROC AUC
# Score with the predicted probability of class 1 rather than the hard 0/1 labels:
# thresholded labels collapse the ROC curve to a single operating point.
xg_roc_auc = roc_auc_score(y_test, classifier_xg.predict_proba(X_test)[:, 1])
xg_roc_auc

In [None]:
# Rank the XGBoost model's features from most to least important.
features_label = X.columns
importances = classifier_xg.feature_importances_
indices = np.argsort(importances)[::-1]
for i in range(X.shape[1]):
    # look the name up through the sorted index so each name stays paired with its own score
    print ("%2d) %-*s %f" % (i + 1, 30, features_label[indices[i]], importances[indices[i]]))

In [None]:
# Visualization of the Feature importances
plt.title('Feature Importances')
plt.bar(range(X.shape[1]), importances[indices], color = "yellow", align = "center")
# tick labels must follow the same sorted order as the bars
plt.xticks(range(X.shape[1]), features_label[indices], rotation = 90)
plt.show()

# Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# The base learner is left at its default: the original passed base_estimator=None
# explicitly, and newer scikit-learn releases no longer accept that keyword.
# random_state pins the run, like the other seeded baseline models.
classifier_ada = AdaBoostClassifier(n_estimators = 200, learning_rate = 1.0, random_state = 0)

# Fitting the model with training data 
classifier_ada.fit(X_train, y_train)


# Predicting the test results
y_predict_ada= classifier_ada.predict(X_test)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
c_ada=confusion_matrix(y_test,y_predict_ada)
print(c_ada)

#Accuracy of our model.
Accuracy_ada=sum(np.diag(c_ada))/(np.sum(c_ada))
Accuracy_ada

In [None]:
#Evaluation 
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict_ada))

In [None]:
#ROC AUC
# Score with the predicted probability of class 1 rather than the hard 0/1 labels:
# thresholded labels collapse the ROC curve to a single operating point.
ada_roc_auc = roc_auc_score(y_test, classifier_ada.predict_proba(X_test)[:, 1])
ada_roc_auc

# GradientBoosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# The loss is left at its default: 'deviance' was the default binomial loss and was
# later renamed 'log_loss', so omitting it works on both old and new scikit-learn.
# random_state pins the run, like the other seeded baseline models.
classifier_GB = GradientBoostingClassifier(n_estimators = 200, random_state = 0)

# Fitting the model with training data 
classifier_GB.fit(X_train, y_train)

# Predicting the test results
y_predict_GB= classifier_GB.predict(X_test)

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
c_GB=confusion_matrix(y_test,y_predict_GB)
print(c_GB)

#Accuracy of our model
Accuracy_GB=sum(np.diag(c_GB))/(np.sum(c_GB))
Accuracy_GB

In [None]:
#Evaluation 
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict_GB))

In [None]:
# ROC AUC
# Score with the predicted probability of class 1 rather than the hard 0/1 labels:
# thresholded labels collapse the ROC curve to a single operating point.
GB_roc_auc = roc_auc_score(y_test, classifier_GB.predict_proba(X_test)[:, 1])
GB_roc_auc

In [None]:
# Hold-out accuracy and ROC AUC of the seven baseline models, one row per model.
baseline_results = [
    ('Logistic Regression', Accuracy_logistic, logistic_roc_auc),
    ('Naive Bayes', Accuracy_naive, naive_roc_auc),
    ('Decision Tree', Accuracy_tree, tree_roc_auc),
    ('Random Forest', Accuracy_ensemble, ensemble_roc_auc),
    ('XGBoost', Accuracy_xg, xg_roc_auc),
    ('AdaBoost', Accuracy_ada, ada_roc_auc),
    ('GradientBoosting', Accuracy_GB, GB_roc_auc),
]
models = [row[0] for row in baseline_results]
accuracy = [row[1] for row in baseline_results]
roc_auc = [row[2] for row in baseline_results]

metrics = dict(accuracy=accuracy, roc_auc=roc_auc)
table_metrics = pd.DataFrame(metrics, index = models)
table_metrics

Let us do cross-validation and then move on to hyperparameter tuning.

Now we will go for the best ones to go for hyper parameter tuning.

Implementing a cross-validation based approach

In [None]:
from sklearn.model_selection import cross_val_score

# Function that will track the mean value and the standard deviation of the metric

def cvDictGen(functions, score, X_train = X, y_train = y, cv = 5):
    """Cross-validate each estimator and summarise its fold scores.

    functions: iterable of estimators handed one by one to cross_val_score.
    score: value passed as the `scoring` argument of cross_val_score.
    X_train, y_train: data to cross-validate on; the defaults are the
        module-level X and y as they exist when this function is defined.
    cv: passed through as the `cv` argument of cross_val_score.

    Returns a dict keyed by the text before the first '(' in str(estimator),
    each value being [mean of fold scores, standard deviation of fold scores].
    """
    summary = {}
    for estimator in functions:
        fold_scores = cross_val_score(estimator, X_train, y_train, cv = cv, scoring = score)
        label = str(estimator).split('(')[0]
        summary[label] = [fold_scores.mean(), fold_scores.std()]
    return summary

In [None]:
model = [lr, classifier_naive, classifier_tree, classifier_ensemble, classifier_xg, classifier_ada, classifier_GB]
cvD = cvDictGen(model, score = 'roc_auc')
cvD


1. Decision Tree
2. Random forest
3. xgboost

Are performing well.

# Hyper parameter tuning

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

# Random forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the random-forest randomized search.
n_estimators = np.linspace(200, 2000, num = 10).astype(int).tolist()
max_features = ['log2', 'sqrt']
# evenly spaced depths plus None as an extra candidate
max_depth = np.linspace(10, 50, num = 5).astype(int).tolist() + [None]
min_samples_split = [5, 10]
min_samples_leaf = [4, 6, 8, 10]
ccp_alpha= [0.001,0.005,0.010,0.015,0.020,0.025,0.030]


random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    criterion=["entropy", "gini"],
    ccp_alpha=ccp_alpha,
)

print(random_grid)

* Setting refit='AUC', refits an estimator on the whole dataset with the
* parameter setting that has the best cross-validated AUC score.

In [None]:
# Multi-metric search: the `scoring` dict defined earlier tracks both AUC and accuracy
# for every candidate, and refit='AUC' selects and refits the candidate with the best
# cross-validated AUC. With a plain string scorer the 'AUC' name referred to nothing.
rf_random = RandomizedSearchCV(estimator = classifier_ensemble, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2,scoring=scoring,refit='AUC', random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_,rf_random.best_score_

In [None]:
rf_random = RandomizedSearchCV(estimator = classifier_ensemble, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2,scoring='accuracy', random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_,rf_random.best_score_

# xgboost

In [None]:
# Candidate values for the XGBoost randomized search.
# NOTE(review): XGBoost presumably supports 'gradient_based' sampling only on GPU, and
# None is passed through as a max_depth candidate — confirm both behave as intended.
min_child_weight = [1,3, 5, 7, 10]
gamma = [0.1,0.2,0.3,0.4, 0.5, 1, 1.5, 2, 5]
subsample = [0.6, 0.8, 1.0]
colsample_bytree = [0.3,0.4,0.5,0.6, 0.8, 1.0]
# evenly spaced depths (truncated to int) plus None as an extra candidate
max_depth = np.linspace(3, 50, num = 5).astype(int).tolist() + [None]
sampling_method = ['uniform','gradient_based']
learning_rate = [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ]

xg_grid = dict(
    min_child_weight=min_child_weight,
    gamma=gamma,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    max_depth=max_depth,
    sampling_method=sampling_method,
    learning_rate=learning_rate,
)

print(xg_grid)

In [None]:
xg_random = RandomizedSearchCV(estimator = classifier_xg, param_distributions = xg_grid, n_iter = 100, cv = 3, verbose=2, random_state=42,scoring='roc_auc', n_jobs = -1)
xg_random.fit(X_train, y_train)
xg_random.best_params_,xg_random.best_score_

In [None]:
xg_random = RandomizedSearchCV(estimator = classifier_xg, param_distributions = xg_grid, n_iter = 100, cv = 3, verbose=2, random_state=42,scoring='accuracy', n_jobs = -1)
xg_random.fit(X_train, y_train)
xg_random.best_params_,xg_random.best_score_

# Decision tree

In [None]:
# Candidate values for the decision-tree randomized search.
max_features = ['log2', 'sqrt']
# evenly spaced depths (truncated to int) plus None as an extra candidate
max_depth = np.linspace(3, 110, num = 11).astype(int).tolist() + [None]
min_samples_split = [5, 10]
min_samples_leaf = [4,6,8,10]
ccp_alpha = [0.001,0.005,0.010,0.015,0.020,0.025,0.030]

decision_grid = dict(
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    criterion=["entropy", "gini"],
    ccp_alpha=ccp_alpha,
)

print(decision_grid)

In [None]:
decision_random = RandomizedSearchCV(estimator = classifier_tree, param_distributions = decision_grid, n_iter = 100, cv = 3, verbose=2,scoring='roc_auc', random_state=42, n_jobs = -1)
decision_random.fit(X_train, y_train)
decision_random.best_params_,decision_random.best_score_

In [None]:
decision_random = RandomizedSearchCV(estimator = classifier_tree, param_distributions = decision_grid, n_iter = 100, cv = 3, verbose=2,scoring='accuracy', random_state=42, n_jobs = -1)
decision_random.fit(X_train, y_train)
decision_random.best_params_,decision_random.best_score_

# So, now after this XGBoost,RANDOM FOREST has taken the complete edge over the Decision Tree.

* After looking at the scores, we can clearly see that the random forest accuracy and ROC AUC score are good to go with.
* Accuracy is 91% and the ROC AUC score is 97% on the training data; let's check the test data as well.

# Final XGBoost Model

In [None]:
final_model_xg=XGBClassifier(subsample= 1.0, min_child_weight= 1, max_depth= 38, gamma= 0.3, colsample_bytree= 0.3, sampling_method= 'uniform',learning_rate= 0.15)
final_model_xg.fit(X_train,y_train)

In [None]:
y_pred = final_model_xg.predict(X_test)
print(final_model_xg.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
results = confusion_matrix(y_test, y_pred)

In [None]:
print ('Confusion Matrix :')
print(results) 
print ('Accuracy Score :',accuracy_score(y_test, y_pred))
print ('Report : ')
print (classification_report(y_test, y_pred))

This is performing well for test dataset. The accuracy turns out to be 95%

# Final Random Forest Model.

In [None]:
final_model_rf=RandomForestClassifier(n_estimators= 2000, min_samples_split= 5,criterion="entropy", min_samples_leaf= 6, max_features= 'log2', max_depth= 20,ccp_alpha = 0.001)
final_model_rf.fit(X_train,y_train)

In [None]:
y_pred = final_model_rf.predict(X_test)
print(final_model_rf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
results = confusion_matrix(y_test, y_pred)

In [None]:
print ('Confusion Matrix :')
print(results) 
print ('Accuracy Score :',accuracy_score(y_test, y_pred))
print ('Report : ')
print (classification_report(y_test, y_pred))

The accuracy is 82.29% for the test set.

In [None]:
final_model_rf.estimators_

In [None]:
len(final_model_rf.estimators_)

In [None]:
from sklearn import tree

plt.figure(figsize=(50,20))
tree.plot_tree(final_model_rf.estimators_[2],filled= True)

In [None]:
# Look at the explainability of all 2000 decision trees (disabled: prints every tree).
#for i in range(len(final_model_rf.estimators_)):
    #print(tree.export_text(final_model_rf.estimators_[i]))

    

## Inspect any single tree by changing the index below to a number between 0 and 1999.
print(tree.export_text(final_model_rf.estimators_[1]))

# lets concentrate on ANN

In [None]:
# Importing the libraries

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU,PReLU,ELU
from keras.layers import Dropout

Initialising the ANN

In [None]:
classifier=Sequential()

Adding the Input layer and the first hidden layer

In [None]:
# First hidden layer; the input width is read from the training matrix instead of a
# hard-coded 11, so the network keeps working if the feature set changes.
classifier.add(Dense(units = 6, kernel_initializer = 'he_uniform',activation='relu',input_dim=X_train.shape[1]))

Adding the second hidden layer

In [None]:
classifier.add(Dense(units = 6, kernel_initializer = 'he_uniform',activation='relu'))

Adding the third hidden layer

In [None]:
classifier.add(Dense(units = 6, kernel_initializer = 'he_uniform',activation='relu'))

Adding the output layer

In [None]:
classifier.add(Dense(units = 1, kernel_initializer = 'glorot_uniform',activation='sigmoid'))

In [None]:
classifier.summary()

# Compiling the ANN

In [None]:
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to training set

In [None]:
# `epochs` is the Keras 2 name of the old `nb_epoch` argument
model_history=classifier.fit(X_train,y_train,validation_split=0.33,batch_size=10,epochs=100)

Accuracy and validation accuracy are almost similar.

# Predicting the test set results

In [None]:
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# confusion matrix and the metrics

In [None]:
from sklearn.metrics import confusion_matrix
c=confusion_matrix(y_test,y_pred)

from sklearn.metrics import accuracy_score
score=accuracy_score(y_pred,y_test)

In [None]:
print(c,score)

Only 78% accuracy is obtained.

# Lets do the hyper parameter tuning to get the best hyper parameters

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense,Activation,Embedding,Flatten,LeakyReLU,BatchNormalization
from keras.activations import relu, sigmoid
from keras.layers import Dropout

In [None]:
def create_model(layers,activation):
    """Build and compile a dense binary classifier for the grid search.

    layers: sequence of hidden-layer widths; one Dense layer is added per entry.
    activation: activation name applied after every hidden Dense layer.

    Returns the compiled Sequential model (Adam optimizer, binary
    cross-entropy loss, accuracy metric) ending in a single sigmoid unit.
    """
    model=Sequential()
    for position,width in enumerate(layers):
        # only the first Dense layer declares the input width, read from X_train
        hidden=Dense(width,input_dim=X_train.shape[1]) if position==0 else Dense(width)
        model.add(hidden)
        model.add(Activation(activation))
        # dropout rate 0.3 after every hidden layer
        model.add(Dropout(0.3))
    model.add(Dense(units=1,kernel_initializer='uniform',activation='sigmoid'))
    model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model
model=KerasClassifier(build_fn=create_model, verbose=0)

In [None]:
layers=[(20,),(40,20),(20,10),(30,20),(25,20)]
activations=['sigmoid','relu']
param_grid = dict(layers= layers,activation= activations,batch_size= [128,256],epochs= [30])

grid=GridSearchCV(estimator=model,param_grid=param_grid,cv=5)

In [None]:
grid_result=grid.fit(X_train,y_train)

In [None]:
print(grid_result.best_score_,grid_result.best_params_)

These are the best parameters

In [None]:
pred_y=grid.predict(X_test)
y_pred=(pred_y>0.5)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
# scikit-learn expects (y_true, y_pred); with the arguments swapped the confusion
# matrix comes out transposed (rows would be predictions instead of true labels).
c1=confusion_matrix(y_test,y_pred)
score=accuracy_score(y_test,y_pred)
print(c1,score)

There is a slight increase in the accuracy even after some hyper parameter tuning.

# Interpretability of the  ML models.

In [None]:
import shap

In [None]:
# Initialize JS For Plot
shap.initjs()

In [None]:
rf_explainer = shap.TreeExplainer(final_model_rf)
rf_shap_values = rf_explainer.shap_values(X_test)

* Variable importance plot.— Global Interpretability

* lists the most significant variables in descending order. The top variables contribute more to the model than the bottom ones and thus have high predictive power.

In [None]:
# The SHAP values were computed on X_test, so they are plotted against X_test;
# X_test is a bare array after scaling, hence the explicit column names.
shap.summary_plot(rf_shap_values, X_test, feature_names=X.columns.tolist())

In [None]:
# summarize the effects of all the features
# Per-feature effect plot for the churn class, with feature values taken from the
# same X_test rows the SHAP values were computed on (not the unscaled full frame X).
# NOTE(review): assumes shap_values() returned one array per class, the same list
# indexing the force-plot cell relies on — confirm for the installed shap version.
shap.summary_plot(rf_shap_values[1], X_test, feature_names=X.columns.tolist())

In [None]:
bank_data.columns

 * The collective force plot

In [None]:
# Pair every SHAP row with the feature row it was computed from (X_test) instead of
# the first rows of the unscaled full frame X; the DataFrame wrapper restores names.
shap.force_plot(rf_explainer.expected_value[0], rf_shap_values[0], pd.DataFrame(X_test, columns=X.columns))

# SHAP Summary Plot

The SHAP values for RF explain the margin output of the model.

This summary plot replaces the typical bar chart of feature importance. It tells us which features are most important, and also their range of effects over the dataset. The color allows us to match how changes in the value of a feature affect the change in risk.