In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the churn dataset from the CSV in the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
df.head(n=5)

In [None]:
# Data exploration: dimensions, per-column dtype / non-null report, and
# summary statistics of the numeric columns.
# A notebook cell only renders its LAST expression, so the shape is printed
# explicitly; df.info() writes its report to stdout on its own.
print("Shape (rows, columns):", df.shape)
df.info()
df.describe()

In [None]:
# Count the missing values in every column (this only reports them; nothing
# is imputed or dropped here).
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Remove the RowNumber / CustomerId / Surname columns before modelling.
# errors='ignore' keeps the cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError for columns
# that are already gone.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')
df.head()

In [None]:
# Mean of every numeric column per Geography and per Gender.
# numeric_only=True is required because the other categorical column is still
# a string at this point (pandas >= 2.0 raises TypeError instead of silently
# skipping it).  Both tables are stacked into one frame so that both are
# rendered: a cell only displays its last expression.
group_means = pd.concat(
    [df.groupby(column).mean(numeric_only=True) for column in ('Geography', 'Gender')],
    keys=['Geography', 'Gender'],
    names=['grouped_by', 'group'],
)
group_means

In [None]:
# One-hot encode the two categorical columns.  drop_first=True omits the
# first level of each column so the dummies are not perfectly collinear.
categorical_columns = ['Geography', 'Gender']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)
df.head()

In [None]:
# Distribution of the target column.
# plt.show() renders the figure; the previous plt.plot() call (no arguments)
# drew nothing and only added an empty `[]` to the cell output.
ax = sns.histplot(df['Exited'])
ax.set_title('Distribution of Exited')
plt.show()

In [None]:
# Class balance of the target.  Both counts are printed explicitly because a
# cell only renders its last bare expression (the "staying" count used to be
# computed and then thrown away).
exited_counts = df['Exited'].value_counts().sort_index()
print("Staying with the bank:", exited_counts.get(0, 0))
print("Left the bank:", exited_counts.get(1, 0))

In [None]:
# Feature-only view of the data: every column except the target `Exited`.
df_2 = df.drop(labels='Exited', axis=1)
df_2.head()

In [None]:
# Correlation of each independent variable with the dependent variable
# (`Exited`), drawn as a bar chart.
exit_correlations = df_2.corrwith(df['Exited'])
exit_correlations.plot.bar(
    title='Correlation with exiting the bank',
    figsize=(20, 10),
    grid=True,
    rot=45,
)

In [None]:
# Pairwise Pearson correlation between all columns (target included).
corr = df.corr(method='pearson')
corr

In [None]:
# Annotated heatmap of the correlation matrix on a wide canvas so the cell
# labels stay legible.
plt.figure(figsize=(20, 10))
sns.heatmap(data=corr, annot=True)

In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so
# the split is the same on every run.
from sklearn.model_selection import train_test_split

features = df.drop(columns='Exited')
target = df['Exited']
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

In [None]:
# Report the shape of every split.  Printed in a loop because bare
# expressions other than the last one in a cell are never displayed.
for split_name, split in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('Y_train', Y_train),
    ('Y_test', Y_test),
):
    print(f"{split_name}: {split.shape}")

In [None]:
# Standardize the features.  The scaler is fitted on the training split only
# and then applied to both splits, so test data never influences the fit.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline model: logistic regression with default settings, trained on the
# standardized training split.
from sklearn.linear_model import LogisticRegression

clf_1 = LogisticRegression()
clf_1.fit(X=X_train_scaled, y=Y_train)

In [None]:
# Class predictions of the logistic regression on the held-out test split.
y_pred1 = clf_1.predict(X=X_test_scaled)

In [None]:
# Test-set accuracy, F1, precision and recall of the logistic regression.
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,confusion_matrix

acc, f1, precision, recall = (
    metric(Y_test, y_pred1)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# One-row summary table for the logistic regression, built from the metric
# variables computed in the previous cell.
result_1 = pd.DataFrame({
    'Model': ['Logistic Regression'],
    'Accuracy': [acc],
    'F1': [f1],
    'Precision': [precision],
    'Recall': [recall],
})
result_1

In [None]:
# Confusion matrix of the logistic regression on the test split.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred1)
cm

In [None]:
# 10-fold cross-validation scores of the logistic regression on the
# training split.
from sklearn.model_selection import cross_val_score

cv_1 = cross_val_score(estimator=clf_1, X=X_train_scaled, y=Y_train, cv=10)
cv_1

In [None]:
# Mean and spread of the cross-validation scores, as percentages.
cv_1_mean_pct = cv_1.mean() * 100
cv_1_std_pct = cv_1.std() * 100
print("Accuracy is ", cv_1_mean_pct, "%")
print("Standard Deviation is ", cv_1_std_pct, "%")

Accuracy is  80.8375 %
Standard Deviation is  0.8786530885395 %


In [None]:
# Random forest with default hyperparameters.
# random_state is pinned (same seed value as the train/test split) because
# the forest's bootstrap sampling and feature sub-sampling are random;
# without a seed every run reports different metrics.
from sklearn.ensemble import RandomForestClassifier

clf_2 = RandomForestClassifier(random_state=0)
clf_2.fit(X_train_scaled, Y_train)
y_pred2 = clf_2.predict(X_test_scaled)

In [None]:
# Test-set accuracy, F1, precision and recall of the random forest.
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,precision_score,recall_score

acc, f1, precision, recall = (
    metric(Y_test, y_pred2)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# One-row summary table for the random forest, built from the metric
# variables computed in the previous cell.
result_2 = pd.DataFrame({
    'Model': ['Random Forest'],
    'Accuracy': [acc],
    'F1': [f1],
    'Precision': [precision],
    'Recall': [recall],
})
result_2

In [None]:
# Add the random-forest row to the comparison table.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Because the result is assigned back to `result_1`, re-running the cell
# would stack the same row again; keeping only the latest row per model
# makes the cell idempotent.
result_1 = (
    pd.concat([result_1, result_2], ignore_index=True)
    .drop_duplicates(subset='Model', keep='last')
    .reset_index(drop=True)
)
result_1

In [None]:
# Confusion matrix of the random forest on the test split.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred2)
cm

In [None]:
# 10-fold cross-validation scores of the random forest on the training split.
from sklearn.model_selection import cross_val_score

cv_2 = cross_val_score(estimator=clf_2, X=X_train_scaled, y=Y_train, cv=10)
cv_2

In [None]:
# Mean and spread of the cross-validation scores, as percentages.
cv_2_mean_pct = cv_2.mean() * 100
cv_2_std_pct = cv_2.std() * 100
print("Accuracy is ", cv_2_mean_pct, "%")
print("Standard Deviation is ", cv_2_std_pct, "%")

Accuracy is  86.075 %
Standard Deviation is  0.9457140159688864 %


In [None]:
# Gradient-boosted trees (XGBoost) with default hyperparameters, trained on
# the standardized training split and evaluated on the test split.
from xgboost import XGBClassifier

clf_3 = XGBClassifier()
clf_3.fit(X=X_train_scaled, y=Y_train)
y_pred3 = clf_3.predict(X=X_test_scaled)

In [None]:
# Test-set accuracy, F1, precision and recall of the XGBoost model.
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,precision_score,recall_score

acc, f1, precision, recall = (
    metric(Y_test, y_pred3)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

In [None]:
# One-row summary table for the XGBoost model, built from the metric
# variables computed in the previous cell.
result_3 = pd.DataFrame({
    'Model': ['XGBoost'],
    'Accuracy': [acc],
    'F1': [f1],
    'Precision': [precision],
    'Recall': [recall],
})
result_3

In [None]:
# Confusion matrix of the XGBoost model on the test split.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred3)
cm

In [None]:
# 10-fold cross-validation scores of the XGBoost model on the training split.
from sklearn.model_selection import cross_val_score

cv_3 = cross_val_score(estimator=clf_3, X=X_train_scaled, y=Y_train, cv=10)
cv_3

In [None]:
# Mean and spread of the cross-validation scores, as percentages.
cv_3_mean_pct = cv_3.mean() * 100
cv_3_std_pct = cv_3.std() * 100
print("Accuracy is ", cv_3_mean_pct, "%")
print("Standard Deviation is ", cv_3_std_pct, "%")

Accuracy is  85.125 %
Standard Deviation is  0.9953014618697171 %


In [None]:
# Hyperparameter Tuning using Randomized Search
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for each XGBoost hyperparameter; the randomized search
# samples combinations from these lists.
parameters = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 7, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)
parameters

In [None]:
# Randomized search over the `parameters` grid: 5 sampled combinations,
# each scored by ROC AUC with 5-fold cross-validation on the training split.
# random_state pins which combinations are sampled; with only 5 draws an
# unseeded search can pick a different "best" model on every run.
random_search = RandomizedSearchCV(
    estimator=clf_3,
    param_distributions=parameters,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=0,
)
random_search.fit(X_train_scaled, Y_train)

# Surface the winning combination and its CV score so they can be compared
# with whatever hyperparameters are used afterwards.
print("Best parameters:", random_search.best_params_)
print("Best CV ROC AUC:", random_search.best_score_)
random_search.best_estimator_

In [None]:
# Building the final model

In [None]:
# Final XGBoost model with fixed hyperparameters (each value is one of the
# candidates listed in the search grid), trained on the standardized
# training split and evaluated on the test split.
from xgboost import XGBClassifier

final_params = {
    'colsample_bytree': 0.7,
    'gamma': 0.3,
    'learning_rate': 0.05,
    'max_depth': 8,
    'min_child_weight': 1,
}
clf_final = XGBClassifier(**final_params)
clf_final.fit(X=X_train_scaled, y=Y_train)
y_pred_final = clf_final.predict(X=X_test_scaled)

In [None]:
# 10-fold cross-validation scores of the final model on the training split.
from sklearn.model_selection import cross_val_score

cv_final = cross_val_score(estimator=clf_final, X=X_train_scaled, y=Y_train, cv=10)
cv_final

In [None]:
# Mean and spread of the cross-validation scores, as percentages.
cv_final_mean_pct = cv_final.mean() * 100
cv_final_std_pct = cv_final.std() * 100
print("Accuracy is ", cv_final_mean_pct, "%")
print("Standard Deviation is ", cv_final_std_pct, "%")

In [None]:
# Predicting a single observation using the finalised model.
# The model was trained on standardized features, so a raw observation has to
# pass through the SAME fitted scaler before prediction; feeding raw values
# (e.g. a balance of 12500.01) to a model that only ever saw z-scores gives a
# meaningless result.
# The values are listed in X_train column order; building a DataFrame with
# those column names lets the scaler check the feature names and makes a
# wrong feature count fail immediately.
single_obs = pd.DataFrame(
    [[625, 45, 5, 12500.01, 1, 0, 1, 101348.88, 0, 0, 1]],
    columns=X_train.columns,
)
single_obs_scaled = scaler.transform(single_obs)
result = clf_final.predict(single_obs_scaled)
result

array([0])