In [1]:
from google.colab import files
upload = files.upload()

Saving Churn_Modelling.csv to Churn_Modelling.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pickle

In [3]:
data = pd.read_csv("Churn_Modelling.csv")

In [4]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Preprocess the Data
# Drop Irrelevant columns
data = data.drop(['RowNumber','CustomerId','Surname'],axis = 1)

In [6]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
## Encode categorical variables
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [8]:
# One Hot Encode Geography column
from sklearn.preprocessing import OneHotEncoder
OHE_geo = OneHotEncoder()
geo_encoder = OHE_geo.fit_transform(data[['Geography']]).toarray()

In [9]:
geo_encoder

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [10]:
OHE_geo.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [11]:
geo_encoded_df = pd.DataFrame(geo_encoder,columns= OHE_geo.get_feature_names_out(['Geography']))

In [12]:
geo_encoded_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [13]:
# combine one hot encoded columns with the original data
data = pd.concat([geo_encoded_df,data.drop('Geography',axis = 1)],axis = 1)
data

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1.0,0.0,0.0,619,0,42,2,0.00,1,1,1,101348.88,1
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58,0
2,1.0,0.0,0.0,502,0,42,8,159660.80,3,1,0,113931.57,1
3,1.0,0.0,0.0,699,0,39,1,0.00,2,0,0,93826.63,0
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,0.0,0.0,771,1,39,5,0.00,2,1,0,96270.64,0
9996,1.0,0.0,0.0,516,1,35,10,57369.61,1,1,1,101699.77,0
9997,1.0,0.0,0.0,709,0,36,7,0.00,1,0,1,42085.58,1
9998,0.0,1.0,0.0,772,1,42,3,75075.31,2,1,0,92888.52,1


In [14]:
## Save the encoders and scaler
with open('label_encoder_gender.pkl','wb') as file:
  pickle.dump(label_encoder_gender,file)

with open('OHE_geo.pkl','wb')  as file:
  pickle.dump(OHE_geo,file)

In [15]:
data.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1.0,0.0,0.0,619,0,42,2,0.0,1,1,1,101348.88,1
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58,0
2,1.0,0.0,0.0,502,0,42,8,159660.8,3,1,0,113931.57,1
3,1.0,0.0,0.0,699,0,39,1,0.0,2,0,0,93826.63,0
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.1,0


In [16]:
# Divide the dataset into Independent and Dependent Features
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

# X = data.drop('Exited',axis =1 )
# y = data['Exited']

# Split the data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Scaling the features of train and test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # prevent the test data from data leakage

In [17]:
X_train_scaled

array([[ 1.00150113, -0.57946723, -0.57638802, ...,  0.64920267,
         0.97481699,  1.36766974],
       [-0.99850112,  1.72572313, -0.57638802, ...,  0.64920267,
         0.97481699,  1.6612541 ],
       [-0.99850112, -0.57946723,  1.73494238, ...,  0.64920267,
        -1.02583358, -0.25280688],
       ...,
       [ 1.00150113, -0.57946723, -0.57638802, ..., -1.54035103,
        -1.02583358, -0.1427649 ],
       [ 1.00150113, -0.57946723, -0.57638802, ...,  0.64920267,
        -1.02583358, -0.05082558],
       [-0.99850112,  1.72572313, -0.57638802, ...,  0.64920267,
         0.97481699, -0.81456811]])

In [18]:
X_test_scaled

array([[-0.99850112,  1.72572313, -0.57638802, ..., -1.54035103,
        -1.02583358, -1.01960511],
       [ 1.00150113, -0.57946723, -0.57638802, ...,  0.64920267,
         0.97481699,  0.79888291],
       [-0.99850112, -0.57946723,  1.73494238, ...,  0.64920267,
        -1.02583358, -0.72797953],
       ...,
       [ 1.00150113, -0.57946723, -0.57638802, ...,  0.64920267,
        -1.02583358, -1.16591585],
       [ 1.00150113, -0.57946723, -0.57638802, ...,  0.64920267,
        -1.02583358, -0.41163463],
       [-0.99850112,  1.72572313, -0.57638802, ...,  0.64920267,
         0.97481699,  0.12593183]])

In [19]:
with open('scaler.pkl','wb') as file:
  pickle.dump(scaler,file)

In [20]:
data

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1.0,0.0,0.0,619,0,42,2,0.00,1,1,1,101348.88,1
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58,0
2,1.0,0.0,0.0,502,0,42,8,159660.80,3,1,0,113931.57,1
3,1.0,0.0,0.0,699,0,39,1,0.00,2,0,0,93826.63,0
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,0.0,0.0,771,1,39,5,0.00,2,1,0,96270.64,0
9996,1.0,0.0,0.0,516,1,35,10,57369.61,1,1,1,101699.77,0
9997,1.0,0.0,0.0,709,0,36,7,0.00,1,0,1,42085.58,1
9998,0.0,1.0,0.0,772,1,42,3,75075.31,2,1,0,92888.52,1


In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

In [22]:
(X_train.shape[1],)  # Single dimension having 12 inputs

(12,)

In [23]:
# Build ANN Model
model = Sequential([
    Dense(64,activation = 'relu',input_shape = (X_train.shape[1],)), # HL1 connected with input layer
    Dense(32,activation='relu'),   # All the layers will be connected sequentially so no need to give input_shape as a parameter    ## HL2
    Dense(1,activation = 'sigmoid')  # Binary classificatin problem single neuron in output layer
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
model.summary()

In [25]:
import tensorflow
opt = tensorflow.keras.optimizers.Adam(learning_rate =0.01)
loss = tensorflow.keras.losses.BinaryCrossentropy()

In [26]:
# loss = "binary_crossentropy"  for binary classification problem
# loss = "sparse_crossentropy"  for multi-class classification problem
# Compile teh model
model.compile(optimizer =opt,loss = 'binary_crossentropy' ,metrics = ['accuracy'])

In [28]:
''' Early Stopping to monitor the loss value if loss value is not decreasing further after certain epochs no need to run the model for stated epochs
 We can apply Early Stopping where we can check loss value and if it is not decreasing for 5-10(stated) epochs we can order to stop the training model
 at that stage'''

 # Set up Early sstopping
early_stopping_callback = EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True) # validation loss/ patience for 5 epochs and performance is not good stop training the model
#/ while going through forward and backward propogation at which epoch you find best weights you can consider that and reload while doing early stopping

In [29]:
# Train model
history = model.fit(
    X_train,y_train,validation_data = (X_test,y_test),epochs = 100,
    callbacks = [tensorflow_callback,early_stopping_callback]
)


Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6916 - loss: 1122.0646 - val_accuracy: 0.6900 - val_loss: 64.7442
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6873 - loss: 44.6977 - val_accuracy: 0.8035 - val_loss: 24.5410
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6736 - loss: 12.8985 - val_accuracy: 0.7650 - val_loss: 0.9807
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6967 - loss: 1.7952 - val_accuracy: 0.8035 - val_loss: 0.5781
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7488 - loss: 0.6268 - val_accuracy: 0.8035 - val_loss: 0.5238
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7864 - loss: 0.5394 - val_accuracy: 0.8035 - val_loss: 0.5007
Epoch 7/100
[1

In [33]:
model.save('model.h5')   # h5 file will be compatible with keras



In [34]:
import tensorflow
from tensorflow.keras.models import load_model
import pandas as pd
import numpy as np
import pickle

In [36]:
# Load the trained model, scaler pickle, One Hot pickle
model = load_model('model.h5')

with open('OHE_geo.pkl','rb') as file:   #pickling is deserialising/seralising format
  OHE_geo = pickle.load(file)


with open('label_encoder_gender.pkl','rb') as file:
  label_encoder_gender = pickle.load(file)


with open('scaler.pkl','rb') as file:
  scaler = pickle.load(file)



In [37]:
data.columns

Index(['Geography_France', 'Geography_Germany', 'Geography_Spain',
       'CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [55]:
input_data = {
    'CreditScore':600,
    'Geography':'France',
    'Gender':'Male',
    'Age': 40,
    'Tenure': 3,
    'Balance': 60000,
    'NumOfProducts':2,
    'HasCrCard':1,
    'IsActiveMember': 1,
    'EstimatedSalary':50000
}

In [56]:
input_data_df = pd.DataFrame([input_data])
input_data_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,Male,40,3,60000,2,1,1,50000


In [57]:
geo_encoded = OHE_geo.transform([[input_data['Geography']]]).toarray()
geo_encoded_df = pd.DataFrame(geo_encoded,columns = OHE_geo.get_feature_names_out(['Geography']))
geo_encoded_df



Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0


In [58]:
# Encode categorical variables
input_data_df['Gender'] = label_encoder_gender.transform(input_data_df['Gender'])
input_data_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,1,40,3,60000,2,1,1,50000


In [59]:
## concat
input_data_df = pd.concat([geo_encoded_df,input_data_df.drop('Geography',axis = 1)],axis =1 )

In [60]:
input_data_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.0,0.0,0.0,600,1,40,3,60000,2,1,1,50000


In [61]:
# Scaling the input data
input_scaled = scaler.transform(input_data_df)
input_scaled

array([[ 1.00150113, -0.57946723, -0.57638802, -0.53598516,  0.91324755,
         0.10479359, -0.69539349, -0.25781119,  0.80843615,  0.64920267,
         0.97481699, -0.87683221]])

In [64]:
# Predict Churn
prediction = model.predict(input_scaled)
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step


array([[0.22386585]], dtype=float32)

In [65]:
prediction_proba = prediction[0][0]
prediction_proba

0.22386585

In [67]:
if prediction_proba > 0.5:
  print("The customer is likely to churn")
else:
  print("The customer is not likely to churn")

The customer is not likely to churn
