In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')
import time
pd.options.display.max_columns = None
pd.options.display.max_rows = 80
%matplotlib inline
import tensorflow as tf
from tensorflow import keras as keras

In [2]:
tf.__version__

'2.14.0'

In [3]:
df = pd.read_csv('Churn_Modelling.csv')

In [4]:
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [5]:
df.drop(columns=['RowNumber','CustomerId','Surname'], inplace=True)

In [6]:
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
X = df.drop(columns=['Exited'])
y = df['Exited']

In [9]:
X.head(3), y.head(3)

(   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
 0          619    France  Female   42       2       0.00              1   
 1          608     Spain  Female   41       1   83807.86              1   
 2          502    France  Female   42       8  159660.80              3   
 
    HasCrCard  IsActiveMember  EstimatedSalary  
 0          1               1        101348.88  
 1          0               1        112542.58  
 2          1               0        113931.57  ,
 0    1
 1    0
 2    1
 Name: Exited, dtype: int64)

In [10]:
for col in df.columns:
    if df[col].value_counts().count() < 10:
        print(col)
        print("-------"*10)
        print(df[col].value_counts())
        print("-------"*10)

Geography
----------------------------------------------------------------------
France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64
----------------------------------------------------------------------
Gender
----------------------------------------------------------------------
Male      5457
Female    4543
Name: Gender, dtype: int64
----------------------------------------------------------------------
NumOfProducts
----------------------------------------------------------------------
1    5084
2    4590
3     266
4      60
Name: NumOfProducts, dtype: int64
----------------------------------------------------------------------
HasCrCard
----------------------------------------------------------------------
1    7055
0    2945
Name: HasCrCard, dtype: int64
----------------------------------------------------------------------
IsActiveMember
----------------------------------------------------------------------
1    5151
0    4849
Name: IsActiveMember, dtyp

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])

In [12]:
X.head(4)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,0,42,2,0.0,1,1,1,101348.88
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58
2,502,France,0,42,8,159660.8,3,1,0,113931.57
3,699,France,0,39,1,0.0,2,0,0,93826.63


In [13]:
X = pd.get_dummies(columns=['Geography'], data=X)

In [14]:
X.head(4)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,1,0,0


In [15]:
X[X['Balance'] == 0.00]

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,1,0,0
6,822,1,50,7,0.0,2,1,1,10062.80,1,0,0
11,497,1,24,3,0.0,2,1,0,76390.01,0,0,1
12,476,0,34,10,0.0,2,1,0,26260.98,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9989,841,1,28,4,0.0,2,1,1,179436.60,0,0,1
9992,726,1,36,2,0.0,1,1,0,195192.40,0,0,1
9994,800,0,29,2,0.0,2,0,0,167773.55,1,0,0
9995,771,1,39,5,0.0,2,1,0,96270.64,1,0,0


## Splitting The Data

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_trainS = sc.fit_transform(X_train)
X_testS = sc.transform(X_test)

In [18]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=9)
rfc.fit(X_trainS, y_train)

In [19]:
y_pred = rfc.predict(X_testS)

In [20]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test)

In [21]:
print("Accuracy with Random Forest: {}% ".format((accuracy*100)))

Accuracy with Random Forest: 87.03999999999999% 


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [23]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_trainS = sc.fit_transform(X_train)
X_testS = sc.transform(X_test)

## Using deep learning/ ANN

In [24]:
ann = keras.models.Sequential()

In [25]:
ann.add(keras.layers.Dense(units=6, activation='relu'))
ann.add(keras.layers.Dense(units=6, activation='relu'))
ann.add(keras.layers.Dense(units=1, activation='sigmoid'))

In [26]:
ann.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [34]:
ann.fit(X_trainS, y_train, batch_size=1, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50

KeyboardInterrupt: 

In [31]:
y_pred = ann.predict(X_testS)
y_pred = (y_pred > 0.5)



In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[1937   66]
 [ 268  229]]


0.8664