In [1]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import pickle

In [2]:
data = pd.read_csv("../Data/Churn_Modelling.csv")
data.head

<bound method NDFrame.head of       RowNumber  CustomerId    Surname  CreditScore Geography  Gender  Age  \
0             1    15634602   Hargrave          619    France  Female   42   
1             2    15647311       Hill          608     Spain  Female   41   
2             3    15619304       Onio          502    France  Female   42   
3             4    15701354       Boni          699    France  Female   39   
4             5    15737888   Mitchell          850     Spain  Female   43   
...         ...         ...        ...          ...       ...     ...  ...   
9995       9996    15606229   Obijiaku          771    France    Male   39   
9996       9997    15569892  Johnstone          516    France    Male   35   
9997       9998    15584532        Liu          709    France  Female   36   
9998       9999    15682355  Sabbatini          772   Germany    Male   42   
9999      10000    15628319     Walker          792    France  Female   28   

      Tenure    Balance  NumOfPro

In [3]:
## Preprocess the data - drop irrelevant features
data.drop(['RowNumber' , 'CustomerId' , 'Surname'] , axis = 1, inplace= True)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Label encode gender
label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])

In [5]:
# One hot encode geography
ohe_encoder = OneHotEncoder()
geo_encoder = ohe_encoder.fit_transform(data[['Geography']])
geo_encoded_df = pd.DataFrame(geo_encoder.toarray() , columns = ohe_encoder.get_feature_names_out(['Geography']))
geo_encoded_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [6]:
df = pd.concat([data, geo_encoded_df] , axis = 1).drop('Geography', axis=1)
df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [7]:
# Split the data into independent and dependent features
X = df.drop('EstimatedSalary' , axis = 1)
y = df['EstimatedSalary']

In [8]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 33)

# standard scaling
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [9]:
# Saving all the encoders to pickle file
with open('regression_label_encoder_gender.pkl' , 'wb') as file:
    pickle.dump(label_encoder, file)
    
with open('regression_ohe.pkl' , 'wb') as file:
    pickle.dump(ohe_encoder, file)

with open('regression_scalar.pkl' , 'wb') as file:
    pickle.dump(scalar, file)

In [10]:
## ANN with Regression problem statement
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 

In [11]:
# Build the model
model = Sequential([
    Dense(64, activation = 'relu' , input_shape = (X_train_scaled.shape[1],)), # input layer
    Dense(32, activation = 'relu'), # hidden layer
    Dense(1) # output layer
])

# compile the model
model.compile(optimizer = 'adam' , loss = 'mean_absolute_error' , metrics = ['mae'])

# summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

# log dir
log_dir = "reglogs/fit"
tensorboard_callback = TensorBoard(log_dir = log_dir, histogram_freq = 1)

In [13]:
# set up early stopping
early_stopping_callback = EarlyStopping(monitor = 'val_loss' , patience = 10, restore_best_weights= True)

In [14]:
# train the model
history = model.fit(X_train_scaled, y_train, 
                    validation_data=(X_test_scaled , y_test), 
                    epochs=100, 
                    callbacks=[tensorboard_callback, early_stopping_callback])

Epoch 1/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 869us/step - loss: 101789.8516 - mae: 101789.8516 - val_loss: 99699.9453 - val_mae: 99699.9453
Epoch 2/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 555us/step - loss: 99812.0078 - mae: 99812.0078 - val_loss: 98655.3438 - val_mae: 98655.3438
Epoch 3/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 554us/step - loss: 98269.8203 - mae: 98269.8203 - val_loss: 95843.9766 - val_mae: 95843.9766
Epoch 4/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 536us/step - loss: 94362.7734 - mae: 94362.7734 - val_loss: 90943.3672 - val_mae: 90943.3672
Epoch 5/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 574us/step - loss: 89910.5000 - mae: 89910.5000 - val_loss: 84242.0078 - val_mae: 84242.0078
Epoch 6/100
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 574us/step - loss: 81753.5391 - mae: 81753.5391 - val_loss: 76401

In [15]:
%load_ext tensorboard

In [16]:
%tensorboard --logdir reglogs/fit

Reusing TensorBoard on port 6009 (pid 22339), started 1 day, 22:37:06 ago. (Use '!kill 22339' to kill it.)

In [17]:
# Evaluate the model
test_loss , test_mae = model.evaluate(X_test_scaled, y_test)
print(f"Test MAE: {test_mae}")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488us/step - loss: 50705.6641 - mae: 50705.6641
Test MAE: 49346.953125


In [18]:
model.save('regression_model.h5')

