# `Project Overview`

# `Importing Libraries`

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

from xgboost import XGBClassifier
import pickle

# `Exporing the Data`

In [3]:
data = pd.read_csv("Data/Churn_Modelling_Dataset.csv")
data.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [4]:
data.shape

(10000, 14)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [6]:
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


# `Cleaning The Data / Feature Engineering`

In [8]:
# We can Remove RowNumber, CustomerId, Surname

data = data.drop(["RowNumber", "CustomerId", "Surname"], axis = 1)
data.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [9]:
cat_cols = ["Geography", "Gender"]

encoder = OneHotEncoder(drop = "first" , sparse_output = False)
encoded_cols = encoder.fit_transform(data[cat_cols])

In [10]:
encoded_cols_df = pd.DataFrame(data = encoded_cols, columns = encoder.get_feature_names_out())

In [11]:
data = pd.concat([data.drop(columns=cat_cols), encoded_cols_df], axis=1)
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0.0,0.0,0.0
1,608,41,1,83807.86,1,0,1,112542.58,0,0.0,1.0,0.0
2,502,42,8,159660.8,3,1,0,113931.57,1,0.0,0.0,0.0
3,699,39,1,0.0,2,0,0,93826.63,0,0.0,0.0,0.0
4,850,43,2,125510.82,1,1,1,79084.1,0,0.0,1.0,0.0


In [12]:
with open("encoder.pkl", "wb") as file:
    pickle.dump(encoder, file)

In [13]:
# Or You Can Simply Do This : 

# data = pd.get_dummies(data = data, columns = ["Geography", "Gender"], drop_first = True)
# data.head(3)

In [14]:
# Splitting into Dependent and Independent Features
X = data.drop(["Exited"], axis = 1)
y = data["Exited"]

In [15]:
# Splitting into Trianing and Testing Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [16]:
# Scaling the Data 
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
with open('scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

In [18]:
X_train[:2]

array([[ 0.35649971, -0.6557859 ,  0.34567966, -1.21847056,  0.80843615,
         0.64920267,  0.97481699,  1.36766974, -0.57946723, -0.57638802,
         0.91324755],
       [-0.20389777,  0.29493847, -0.3483691 ,  0.69683765,  0.80843615,
         0.64920267,  0.97481699,  1.6612541 ,  1.72572313, -0.57638802,
         0.91324755]])

In [19]:
INPUT_SHAPE = X_train.shape[1]
INPUT_SHAPE

11

In [20]:
y_train.value_counts()


Exited
0    6356
1    1644
Name: count, dtype: int64

In [21]:
# There is an Imbance in the Data
classes = np.array([0, 1])

class_weights = compute_class_weight(
    class_weight = 'balanced',
    classes = classes,
    y = y_train
)

In [22]:
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
print(class_weights_dict)

# I have Trained my Model But the Training Has Become worse
# If we use Strong Weights Then This Can happen

{0: 0.6293266205160478, 1: 2.4330900243309004}


# `Training a Model`

## `Using Neural Network`

In [23]:
NN_model = Sequential([
    Dense(64, activation="relu", input_shape=(INPUT_SHAPE,)),
    Dropout(0.20),

    Dense(32, activation="relu"),
    Dropout(0.20),

    Dense(16, activation="relu"),
    Dropout(0.20),

    Dense(1, activation="sigmoid")
])


In [24]:
NN_model.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)

In [25]:
# Callbacks
early_stoppper = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)

log_dir = "logs/NN_model"
tensorboard_callback = TensorBoard(log_dir = log_dir, histogram_freq = 1)

In [26]:
NN_model.fit(
    x = X_train,
    y = y_train,
    epochs = 100,
    validation_data = (X_test, y_test),
    callbacks = [early_stoppper, tensorboard_callback],
    verbose = 1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


<keras.callbacks.History at 0x2c193ea6140>

In [27]:
NN_model.save('NN_model.h5')

In [None]:
# load the TensorBoard notebook extension (only works in Jupyter / Colab)
%load_ext tensorboard

In [None]:
# start TensorBoard and point it to your logs folder
# this will open the TensorBoard UI so you can see graphs
%tensorboard --logdir logs

## `Using XgBoost`

In [None]:
xgboost_model = XGBClassifier()

xgboost_model.fit(X_train, y_train)

In [None]:
y_preds = xgboost_model.predict(X_test)

acc = accuracy_score(y_test, y_preds)
print(f"The Accuracy For XgBoost Model is : {acc}")