# Spending habits
Analysing data handling with both K-NN and a basic neural network

In [1]:
import numpy as np 
import pandas as pd 
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

from sklearn.model_selection import train_test_split
import sklearn
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.metrics import mean_squared_error 


In [2]:
data = pd.read_csv("./Mall_Customers.csv")

## Data handling

In [3]:
data_target = data['Spending Score (1-100)']

In [4]:
data = data.drop(['CustomerID', 'Spending Score (1-100)'], axis=1)

In [5]:
data.head()

Unnamed: 0,Gender,Age,Annual Income (k$)
0,Male,19,15
1,Male,21,15
2,Female,20,16
3,Female,23,16
4,Female,31,17


In [6]:
# Binary encode gender
data['Gender'] = data['Gender'].str.strip().str.lower().map({'male': 1, 'female': 0})

In [7]:
data_features = data.copy()
data_features['Income_per_Age'] = data_features['Annual Income (k$)'] / data_features['Age']

### For K-NN

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, data_target, test_size=0.3, random_state=42)

### Normalising data

In [9]:
train_numeric = X_train.drop(columns=["Gender"])
test_numeric = X_test.drop(columns=["Gender"])
test_numeric.tail()

Unnamed: 0,Age,Annual Income (k$)
73,60,50
140,57,75
98,48,61
172,36,87
96,47,60


In [10]:
norm = tf.keras.layers.Normalization(axis=-1)
norm.adapt(train_numeric.to_numpy())

In [11]:
X_train_norm = norm(train_numeric.to_numpy()).numpy()
X_test_norm = norm(test_numeric.to_numpy()).numpy()

In [12]:
X_train_final = np.concatenate([X_train_norm, X_train[['Gender']].to_numpy()], axis=1)
X_test_final = np.concatenate([X_test_norm, X_test[['Gender']].to_numpy()], axis=1)

In [13]:
print(X_train_final)

[[-0.47625229  0.98322105  1.        ]
 [-0.83128601 -0.01051569  0.        ]
 [-1.25732648 -1.11466765  0.        ]
 [ 1.369923   -1.48271823  0.        ]
 [-1.25732648 -1.00425243  0.        ]
 [ 0.80186903  0.24711975  0.        ]
 [ 0.65985554 -0.78342205  1.        ]
 [-0.19222532  1.57210207  0.        ]
 [ 2.07999039 -0.04732076  0.        ]
 [ 0.37582859  0.46795014  0.        ]
 [-0.40524554 -0.67300683  1.        ]
 [ 0.09180164  0.39434001  1.        ]
 [-1.39933991  0.76239067  1.        ]
 [-1.04430616 -0.82022715  1.        ]
 [ 0.87287581  0.24711975  0.        ]
 [-1.39933991  0.13670456  1.        ]
 [ 0.44683534 -1.18827772  0.        ]
 [-0.26323205  0.5047552   0.        ]
 [-1.11531293  0.06309444  0.        ]
 [-1.39933991  0.0998995   0.        ]
 [-1.32833314 -1.62993848  0.        ]
 [ 0.09180164 -0.23134609  1.        ]
 [ 0.80186903 -0.74661696  0.        ]
 [-0.76027924  1.498492    1.        ]
 [-0.47625229  1.57210207  0.        ]
 [ 0.65985554 -0.23134609

## Using K-NN

In [14]:
knn_regressor = KNeighborsRegressor(n_neighbors=20) 
knn_regressor.fit(X_train_final, y_train)

0,1,2
,n_neighbors,20
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [15]:
y_pred = knn_regressor.predict(X_test_final)
mse = mean_squared_error(y_test, y_pred)

In [16]:
print(f"Mean Squared Error at 20: {mse}")
print(f"Root Mean Squared Error at 20: {np.sqrt(mse)}")

Mean Squared Error at 20: 523.364125
Root Mean Squared Error at 20: 22.877152904153085


In [17]:
for i in range(1,10):
    knn_regressor = KNeighborsRegressor(n_neighbors=i) 
    knn_regressor.fit(X_train_final, y_train)

    y_pred = knn_regressor.predict(X_test_final)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Mean Squared Error at {i}: {mse}")
    print(f"Root Mean Squared Error at {i}: {np.sqrt(mse)}")

Mean Squared Error at 1: 825.8666666666667
Root Mean Squared Error at 1: 28.737896002781184
Mean Squared Error at 2: 510.84166666666664
Root Mean Squared Error at 2: 22.601806712443736
Mean Squared Error at 3: 488.76111111111106
Root Mean Squared Error at 3: 22.107942263157625
Mean Squared Error at 4: 513.3625
Root Mean Squared Error at 4: 22.65750427562577
Mean Squared Error at 5: 467.25
Root Mean Squared Error at 5: 21.615966321217286
Mean Squared Error at 6: 480.1592592592593
Root Mean Squared Error at 6: 21.912536577476814
Mean Squared Error at 7: 500.17823129251695
Root Mean Squared Error at 7: 22.36466479275996
Mean Squared Error at 8: 516.5572916666666
Root Mean Squared Error at 8: 22.727896771735537
Mean Squared Error at 9: 531.5578189300411
Root Mean Squared Error at 9: 23.05553770637417


As seen, K=5 is the best so far with a value of 21.6

## Using Neural Network

In [65]:
linear_model = tf.keras.Sequential([
    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.0003)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.0006)),
    layers.Dropout(0.1),
    # layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.0003)),
    layers.Dense(1, activation='linear')
])

In [66]:
linear_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='mean_squared_error', 
    metrics=['mse']
)

In [77]:
history = linear_model.fit(
    X_train_final, 
    y_train,     
    epochs=2000,    
    validation_split=0.2,
    batch_size=32,
    verbose=0,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)] 
)

In [78]:
print("Evaluating model on the test set:")
results = linear_model.evaluate(X_test_final, y_test, verbose=0)
loss = results[0]
mse = results[1]

rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Loss is: {loss}")

Evaluating model on the test set:
Mean Squared Error (MSE): 399.09
Root Mean Squared Error (RMSE): 19.98
Loss is: 399.3721618652344


Lowest score I've gotten was 19.83

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming 'data_features' includes your engineered features
# and 'data_target' is your 'Spending Score (1-100)'
X = data_features[['Gender', 'Age', 'Annual Income (k$)', 'Income_per_Age']]
y = data_target

X_train_rfr, X_test_rfr, y_train_rfr, y_test_rfr = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
rf_model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=10)
rf_model.fit(X_train_rfr, y_train_rfr)

# Evaluate the model
y_pred_rfr = rf_model.predict(X_test_rfr)
mse = mean_squared_error(y_test_rfr, y_pred_rfr)
rmse = np.sqrt(mse)

print(f"Random Forest RMSE: {rmse:.2f}")

Score of 19.63, a still better

## Conclusion

RMSE of:
- K-NN = 21.6
- Nueral network = 19.83
- Random Forest = 19.63