## Importing Libraries and Loading the Dataset

In [81]:
import pandas as pd

In [82]:
# Read the raw table from a path relative to the notebook's working directory,
# then display it as the cell output.
df = pd.read_csv('Data/Data.csv')
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [83]:
# List the column labels of the loaded frame.
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [84]:
# (rows, columns) of the loaded frame.
df.shape

(10000, 14)

## Preprocessing and Feature Engineering Using Data Wrangler

In [85]:
# Distinct values of the Surname column, in order of first appearance.
df['Surname'].unique()

array(['Hargrave', 'Hill', 'Onio', ..., 'Kashiwagi', 'Aldridge',
       'Burbidge'], shape=(2932,), dtype=object)

In [95]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler



def clean_df(df, target='EstimatedSalary'):
    """Encode the categorical columns and build a standardised feature matrix.

    Processing steps, in order:

    1. Drop the ``RowNumber`` column.
    2. Label-encode ``Gender`` into integer codes.
    3. One-hot encode ``Geography`` into ``Geography_<value>`` columns.
    4. Drop the raw ``Geography`` and ``Surname`` text columns.
    5. Separate the ``target`` column from the remaining feature columns.
    6. Standardise the feature columns with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain ``RowNumber``, ``Gender``, ``Geography``,
        ``Surname`` and the column named by ``target``. All columns left
        after step 4 must be numeric for the scaler to accept them.
    target : str, default 'EstimatedSalary'
        Name of the column returned as ``y`` and excluded from ``X``.

    Returns
    -------
    df : pandas.DataFrame
        The encoded, unscaled frame (features plus ``target``).
    X : array
        Output of ``StandardScaler.fit_transform`` on the feature columns.
    y : pandas.Series
        The ``target`` column, unscaled.

    Notes
    -----
    * The scaler is fitted on every row passed in. If the result is split
      into train/test afterwards, the test rows have already contributed to
      the scaling statistics.
    * NOTE(review): ``CustomerId`` is not dropped and therefore ends up in
      ``X``; it looks like an identifier rather than a feature - confirm.
    """
    gen_le = LabelEncoder()
    geo_one = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    scaler = StandardScaler()

    # `drop` returns a new frame, so the assignments below act on that frame
    # rather than on the object the caller passed in.
    df = df.drop(['RowNumber'], axis=1)

    # Encode Gender as integer class codes.
    df['Gender'] = gen_le.fit_transform(df['Gender'])

    # One-hot encode Geography; reuse the original index so the concat below
    # aligns row-for-row.
    geo_vals = geo_one.fit_transform(df[['Geography']])
    geo_cols = geo_one.get_feature_names_out(['Geography'])
    geo_df = pd.DataFrame(geo_vals, columns=geo_cols, index=df.index)

    # Append the one-hot columns and remove the raw text columns. Surname is
    # dropped without being encoded: it is never used as a feature.
    df = pd.concat([df, geo_df], axis=1)
    df = df.drop(['Geography', 'Surname'], axis=1)

    # Prepare features and target.
    X = df.drop([target], axis=1)
    y = df[target]

    # Scale features to zero mean and unit variance.
    X = scaler.fit_transform(X)

    return df, X, y


In [96]:
# Preprocess a copy of the loaded frame so `df` itself is left as loaded.
df1, X, y = clean_df(df.copy(deep=True))

# Correlation matrix of every column in the encoded frame.
corr = df1.corr()

# Rank the columns by their correlation with EstimatedSalary, highest first.
corr_sorted = corr.loc[:, 'EstimatedSalary'].sort_values(ascending=False)

print(corr_sorted)


EstimatedSalary      1.000000
CustomerId           0.015271
NumOfProducts        0.014204
Balance              0.012797
Exited               0.012097
Geography_Germany    0.010297
Tenure               0.007784
CreditScore         -0.001384
Geography_France    -0.003332
Geography_Spain     -0.006482
Age                 -0.007201
Gender              -0.008112
HasCrCard           -0.009933
IsActiveMember      -0.011421
Name: EstimatedSalary, dtype: float64


In [97]:
# Correlation matrix of every column in the encoded frame.
corr = df1.corr()

# Rank the columns by their correlation with Exited, highest first.
corr_sorted = corr.loc[:, 'Exited'].sort_values(ascending=False)

print(corr_sorted)


Exited               1.000000
Age                  0.285323
Geography_Germany    0.173488
Balance              0.118533
EstimatedSalary      0.012097
CustomerId          -0.006248
HasCrCard           -0.007138
Tenure              -0.014001
CreditScore         -0.027094
NumOfProducts       -0.047820
Geography_Spain     -0.052667
Geography_France    -0.104955
Gender              -0.106512
IsActiveMember      -0.156128
Name: Exited, dtype: float64


In [98]:
# Display the scaled feature matrix returned by clean_df.
X

array([[-0.78321342, -0.32622142, -1.09598752, ...,  0.99720391,
        -0.57873591, -0.57380915],
       [-0.60653412, -0.44003595, -1.09598752, ..., -1.00280393,
        -0.57873591,  1.74273971],
       [-0.99588476, -1.53679418, -1.09598752, ...,  0.99720391,
        -0.57873591, -0.57380915],
       ...,
       [-1.47928179,  0.60498839, -1.09598752, ...,  0.99720391,
        -0.57873591, -0.57380915],
       [-0.11935577,  1.25683526,  0.91241915, ..., -1.00280393,
         1.72790383, -0.57380915],
       [-0.87055909,  1.46377078, -1.09598752, ...,  0.99720391,
        -0.57873591, -0.57380915]], shape=(10000, 13))

In [99]:
# Display the target column returned by clean_df.
y

0       101348.88
1       112542.58
2       113931.57
3        93826.63
4        79084.10
          ...    
9995     96270.64
9996    101699.77
9997     42085.58
9998     92888.52
9999     38190.78
Name: EstimatedSalary, Length: 10000, dtype: float64

In [100]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

## ANN Implementation

In [101]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [102]:
# Fully connected regression network: four ReLU hidden layers narrowing from
# 256 to 32 units, followed by a single linear output unit.
# The input width is declared with an explicit Input object rather than by
# passing `input_shape` to the first Dense layer, which Keras warns against
# for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Adam optimiser with an explicit learning rate.
opt = Adam(learning_rate=0.0001)

# Stop on the validation loss, not the training MAE: the training metric can
# keep improving while the model overfits, so monitoring it would neither
# stop training at a useful point nor restore the weights that generalise
# best. 'val_loss' is available because fit() is given validation data.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [104]:
# Optimise mean absolute error; MAE and MSE are also reported as metrics.
model.compile(
    loss='mae',
    optimizer=opt,
    metrics=['mae', 'mse'],
)

In [105]:
# Train for at most 200 epochs in mini-batches of 16 with the early-stopping
# callback attached; the returned History object is kept in `history`.
# NOTE(review): the test split is also passed as the validation data here -
# confirm that is intended.
history = model.fit(
    X_train,
    y_train,
    epochs=200,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/200


[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 101104.3750 - mae: 101104.3750 - mse: 13551680512.0000 - val_loss: 97943.5078 - val_mae: 97943.5078 - val_mse: 12861529088.0000
Epoch 2/200
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 97531.8047 - mae: 97531.8047 - mse: 12824076288.0000 - val_loss: 88590.9531 - val_mae: 88590.9531 - val_mse: 11017103360.0000
Epoch 3/200
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 77999.3281 - mae: 77999.3281 - mse: 8892622848.0000 - val_loss: 61344.2969 - val_mae: 61344.2969 - val_mse: 5631465984.0000
Epoch 4/200
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 55088.3984 - mae: 55088.3984 - mse: 4293749504.0000 - val_loss: 50189.8594 - val_mae: 50189.8594 - val_mse: 3434599424.0000
Epoch 5/200
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 50419.5430 - mae: 50419.5430 - mse:

KeyboardInterrupt: 

In [44]:
from sklearn.metrics import r2_score
# Predict on the held-out rows and score the predictions with R^2.
y_pred = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


-0.8383569974168563