In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw customer table; the path is relative to the notebook's working directory.
df = pd.read_csv('bank.csv')

In [3]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Count missing values per column.
df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [5]:
# Show column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [6]:
# RowNumber, CustomerId and Surname are removed before any feature engineering.
identifier_columns = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(columns=identifier_columns)

In [7]:
# Preview the table after the identifier columns were dropped.
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
# Select the string-typed (object dtype) columns and display them.
cat_features = df.select_dtypes(include='object')
cat_features

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female
...,...,...
9995,France,Male
9996,France,Male
9997,France,Female
9998,Germany,Male


In [9]:
# Ratio feature: credit score divided by age (element-wise).
df['CreditScoreByAge'] = df['CreditScore'].div(df['Age'])

In [10]:
# One-hot encode Gender, dropping the first category so a single indicator
# column remains. `dtype=int` pins the indicator to 0/1 integers: newer pandas
# versions default to booleans, which would display as False/True.
gender_dummies = pd.get_dummies(df['Gender'], drop_first=True, dtype=int)
# Exactly one column is expected here; assigning a one-element list raises
# if Gender ever has more than two categories.
gender_dummies.columns = ['gender_male']
# Replace the original text column with the indicator (appended last).
df = pd.concat([df.drop('Gender', axis=1), gender_dummies], axis=1)

In [11]:
# Binary-encode Geography: 'Spain' -> 0, every other value -> 1.
# NOTE(review): the preview rows show France, Spain and Germany, so France and
# Germany end up sharing code 1 -- confirm that merging them is intended.
df['Geography'] = np.where(df['Geography']=='Spain',0,1)

In [12]:
### detecting outlier

def detect_outlier(dataframe, feature, thresh=3, two_sided=False):
    """Return the values of ``dataframe[feature]`` flagged as z-score outliers.

    A value is flagged when its z-score, computed with the population
    standard deviation (ddof=0, the same default ``np.std`` uses), is
    strictly greater than ``thresh``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table containing the column to scan.
    feature : str
        Name of the numeric column to scan.
    thresh : float, default 3
        Z-score cut-off.
    two_sided : bool, default False
        When False only unusually high values are flagged (z > thresh).
        When True unusually low values are flagged as well (|z| > thresh).

    Returns
    -------
    list
        Flagged values in their original row order. A fresh list is built on
        every call, so repeated calls never accumulate earlier results.
    """
    values = dataframe[feature]
    std = values.std(ddof=0)
    # A constant column has std 0 and an empty column has NaN std; z-scores
    # are undefined in both cases, so nothing can be flagged.
    if not std > 0:
        return []

    z_scores = (values - values.mean()) / std
    if two_sided:
        z_scores = z_scores.abs()
    return values[z_scores > thresh].tolist()

In [13]:
# List the Age values that detect_outlier flags as z-score outliers.
detect_outlier(df,'Age')

[75,
 73,
 72,
 79,
 80,
 75,
 72,
 82,
 74,
 71,
 72,
 74,
 76,
 71,
 73,
 77,
 74,
 74,
 74,
 74,
 74,
 72,
 77,
 74,
 88,
 71,
 72,
 71,
 75,
 73,
 76,
 85,
 74,
 76,
 72,
 71,
 74,
 72,
 72,
 84,
 71,
 74,
 84,
 77,
 79,
 76,
 73,
 73,
 76,
 72,
 71,
 80,
 74,
 76,
 75,
 77,
 74,
 71,
 75,
 78,
 74,
 71,
 77,
 79,
 81,
 79,
 71,
 72,
 71,
 72,
 72,
 78,
 75,
 71,
 73,
 71,
 71,
 76,
 73,
 75,
 73,
 71,
 72,
 73,
 92,
 75,
 71,
 77,
 92,
 72,
 71,
 76,
 72,
 77,
 74,
 72,
 73,
 77,
 71,
 72,
 81,
 76,
 74,
 71,
 76,
 72,
 81,
 73,
 71,
 75,
 71,
 71,
 71,
 73,
 72,
 71,
 81,
 73,
 74,
 83,
 71,
 78,
 72,
 74,
 80,
 72,
 76,
 71,
 71,
 78,
 78,
 77,
 77]

In [14]:
# Replace outliers with the border value: every Age of 71 or more becomes 71.
df['Age'] = df['Age'].clip(upper=71)

In [15]:
# Preview the table after encoding and capping Age.
df.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreByAge,gender_male
0,619,1,42,2,0.0,1,1,1,101348.88,1,14.738095,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,14.829268,0
2,502,1,42,8,159660.8,3,1,0,113931.57,1,11.952381,0
3,699,1,39,1,0.0,2,0,0,93826.63,0,17.923077,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,19.767442,0


In [16]:
# Ratio feature: estimated salary divided by age (element-wise).
df['Salary/Age'] = df['EstimatedSalary'].div(df['Age'])

In [17]:
# Flag customers holding multiple products: 1 when NumOfProducts > 1, else 0.
has_multiple_products = df['NumOfProducts'] > 1
df['NumOfProducts'] = has_multiple_products.astype(int)

In [18]:
# Names of the binary-coded columns.
cat_features = [
    'Geography',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'gender_male',
]

In [19]:
# Names of the continuous columns.
cont_features = [
    'Age',
    'Balance',
    'EstimatedSalary',
    'CreditScoreByAge',
    'Salary/Age',
]

In [20]:
# Recode every binary column from {0, 1} to {-1, 1}: zeros become -1, all
# other values are kept as they are.
# NOTE(review): the original comment gives "decorrelating" the features as the
# motivation; a linear recoding does not change correlations -- confirm this
# step is still wanted.
for feature in cat_features: 
    df[feature] = np.where(df[feature]==0,-1,df[feature])

In [21]:
# Preview the table after the binary columns were recoded to -1 / 1.
df.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreByAge,gender_male,Salary/Age
0,619,1,42,2,0.0,-1,1,1,101348.88,1,14.738095,-1,2413.068571
1,608,-1,41,1,83807.86,-1,-1,1,112542.58,0,14.829268,-1,2744.940976
2,502,1,42,8,159660.8,1,1,-1,113931.57,1,11.952381,-1,2712.656429
3,699,1,39,1,0.0,1,-1,-1,93826.63,0,17.923077,-1,2405.811026
4,850,-1,43,2,125510.82,-1,1,1,79084.1,0,19.767442,-1,1839.165116


In [22]:
# Feature matrix: every column except the target and the raw CreditScore.
X = df.drop(columns=['Exited', 'CreditScore']).to_numpy()
# Target vector: the Exited column.
y = df['Exited'].to_numpy()

In [23]:
# Sanity check: (rows, feature columns) of the feature matrix.
X.shape

(10000, 11)

In [24]:
# Sanity check: the target must have one entry per row of X.
y.shape

(10000,)

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

In [28]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [101]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD

In [102]:
# Encoder: three ReLU layers narrowing 11 -> 6 -> 3 units.
encoder = Sequential([
    Dense(11, activation="relu"),
    Dense(6, activation="relu"),
    Dense(3, activation="relu"),
])

In [103]:
# Decoder: widens the representation again, 6 -> 11 units.
decoder = Sequential()
decoder.add(Dense(6,activation="relu"))
# The output layer is linear: standardized features take negative values,
# which a ReLU output (always >= 0) could never reproduce.
decoder.add(Dense(11,activation="linear"))

In [104]:
# Autoencoder = encoder followed by decoder.
autoencoder = Sequential([encoder, decoder])
# An autoencoder reproduces real-valued (standardized) features, so mean
# squared error is the appropriate loss. Binary cross-entropy and the
# accuracy metric both assume 0/1 targets and are meaningless here.
autoencoder.compile(loss="mse", optimizer="SGD")

In [None]:
# Train the autoencoder to reconstruct its own input: the target is the
# feature matrix itself, not the churn label (whose shape does not even match
# the 11-unit output).
autoencoder.fit(
    scaled_X_train,
    scaled_X_train,
    epochs=40,
    validation_data=(scaled_X_test, scaled_X_test),
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40

In [None]:
# Classification head: a single sigmoid unit.
lower_layer = Sequential([Dense(1, activation="sigmoid")])

In [None]:
# Classifier: the encoder stacked under the sigmoid head.
final_model = Sequential()
final_model.add(encoder)
final_model.add(lower_layer)
final_model.compile(
    optimizer="SGD",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the classifier on the training split, monitoring the held-out split.
final_model.fit(
    x=scaled_X_train,
    y=y_train,
    epochs=10,
    validation_data=(scaled_X_test, y_test),
)

In [None]:
# Per-epoch metrics recorded during the last fit of final_model.
losses= final_model.history.history

In [None]:
# Convert the history into a DataFrame (one row per epoch) for plotting.
losses = pd.DataFrame(losses)

In [None]:
# Compare training and validation loss across epochs.
losses[["loss","val_loss"]].plot()

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. Thresholding the
# sigmoid probabilities at 0.5 yields the same hard 0/1 labels.
predictions = (final_model.predict(scaled_X_test) > 0.5).astype("int32")

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_auc_score,roc_curve,balanced_accuracy_score

In [None]:
# ROC AUC measures ranking quality, so score it on the predicted
# probabilities; thresholded 0/1 labels collapse the curve to a single point.
print(roc_auc_score(y_test, final_model.predict(scaled_X_test).ravel()))

In [None]:
# Fraction of held-out customers classified correctly.
print(accuracy_score(y_true=y_test, y_pred=predictions))

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# per-class recall is computed against the predictions instead of the truth.
print(balanced_accuracy_score(y_test, predictions))

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# precision and recall trade places in the report.
print(classification_report(y_test, predictions))

In [None]:
# roc_curve expects (y_true, y_score). Pass the true labels first and the
# predicted probabilities as scores so the curve has more than one threshold.
fpr, tpr, threshold = roc_curve(y_test, final_model.predict(scaled_X_test).ravel())

In [None]:
# Class balance of the target. Name the column explicitly: recent seaborn
# versions treat the first positional argument as `data`, not as `x`.
sns.countplot(x="Exited", data=df);