In [41]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [93]:
data = pd.read_csv("bank.csv")

In [94]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [95]:
#Surname is also dropped as it is test data provides no impact on assessing customer retention probability
data.drop(columns = ['RowNumber', 'CustomerId', 'Surname'], axis = 1, inplace = True)

In [96]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


Target set would be the 'isActive' column since it checks whether bank customer is active, which would be useful for
validating the possibility of a bank customer staying or leaving

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             10000 non-null int64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(2), int64(6), object(2)
memory usage: 781.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 1 columns):
IsActiveMember    10000 non-null int64
dtypes: int64(1)
memory usage: 78.2 KB
None


In [71]:
X[X.isnull().any(axis=1)]

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,EstimatedSalary,Exited


In [72]:
Y[Y.isnull().any(axis=1)]

Unnamed: 0,IsActiveMember


Observation is '0' null values in both training and test set

In [73]:
X.nunique()

CreditScore         460
Geography             3
Gender                2
Age                  70
Tenure               11
Balance            6382
NumOfProducts         4
HasCrCard             2
EstimatedSalary    9999
Exited                2
dtype: int64

In [74]:
Y.nunique()

IsActiveMember    2
dtype: int64

In [75]:
#Checking Categorical Values in X
print(X['Geography'].unique())
print(X['Gender'].unique())

['France' 'Spain' 'Germany']
['Female' 'Male']


In [76]:
#Converting Categorical Values to Numeric Format, dropping redundant first dummy variable for decorrelation
X = pd.get_dummies(X, columns = ['Geography', 'Gender'], dtype = np.int64, drop_first = True)
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,79084.1,0,0,1,0


In [85]:
#Capping Outliers in the given dataset. We shall only consider features which shall qualify for outlier analysis i.e.
#features with continuous values and not features that are categorical, order or count variables as they are ineligible for outlier
#analysis.

capDf = pd.DataFrame()
toCapDf = X.loc[:, ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']]
for col in X.columns:
    if(X[col].dtype == object):
        capDf[col] = X[col]
        continue
    if(col in toCapDf.columns):
        percentiles = toCapDf[col].quantile([0.25,0.75]).values
        Q1 = percentiles[0]
        Q3 = percentiles[1]
        IQR = Q3 - Q1
        minCap = Q1 - (IQR) * 1.5
        maxCap = Q3 + (IQR) * 1.5
        capDf[col] = toCapDf[col][(toCapDf[col] >= minCap) & (toCapDf[col] <= maxCap)]
    else:
        capDf[col] = X[col]
        capDf[''] = Y[col]
    
capDf.dropna(inplace=True);

In [87]:
#Printing the outliers which were removed (In all, 374 eligible outlier rows were removed)
outlierDf = pd.concat([X, capDf])
outlierDf = outlierDf.reset_index(drop=True)
df_gpby = outlierDf.groupby(list(outlierDf.columns))
idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
outlierDf.reindex(idx)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
1838,350,39.0,0,109733.20,2,0,123602.11,1,1,0,1
9624,350,40.0,0,111098.85,1,1,172321.21,1,0,0,0
8723,350,51.0,10,0.00,1,1,125823.79,1,0,0,1
1631,350,54.0,1,152677.48,1,1,191973.49,1,0,1,1
8762,350,60.0,3,0.00,1,0,113796.15,1,0,0,0
2473,351,57.0,4,163146.46,1,1,169621.69,1,1,0,0
1962,358,52.0,8,143542.36,3,1,141959.11,1,0,1,0
1405,359,44.0,6,128747.69,1,1,146955.71,1,0,0,0
1193,363,28.0,6,146098.43,3,1,100615.14,1,0,1,0
2579,365,30.0,0,127760.07,1,1,81537.85,1,1,0,1


In [90]:
capDf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9626 entries, 0 to 9999
Data columns (total 11 columns):
CreditScore          9626 non-null int64
Age                  9626 non-null float64
Tenure               9626 non-null int64
Balance              9626 non-null float64
NumOfProducts        9626 non-null int64
HasCrCard            9626 non-null int64
EstimatedSalary      9626 non-null float64
Exited               9626 non-null int64
Geography_Germany    9626 non-null int64
Geography_Spain      9626 non-null int64
Gender_Male          9626 non-null int64
dtypes: float64(3), int64(8)
memory usage: 902.4 KB


In [92]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(capDf, Y, test_size=0.3, random_state=1)

ValueError: Found input variables with inconsistent numbers of samples: [9626, 10000]

In [None]:
from sklearn.preprocessing import Imputer