<a href="https://colab.research.google.com/github/ShuHuiK/WIE3007_Group_Assignment/blob/ShuHui/Step_2_Neural_Network_Predictive%20Model_Development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Load Dataset**

In [7]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Read the cleaned Sterling Financial CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='2025_Sterling_Financial_Dataset_clean.csv')

**Feature Selection & Categorical Encoding**

In [8]:
# Target (y) and predictors (X).
# Identifier and date-based columns are deliberately left out of the predictors,
# since they are not expected to carry predictive power for income.
target = 'income'
features = [
    'business_sector',
    'age',
    'credit_score',
    'savings_ratio',
    'debt_to_income',
    'credit_utilization',
    'risk_category',
    'customer_segment',
]

y = df[target]
X = df[features]

# Group the predictors by type so each group gets the appropriate transform
numerical_features = [
    'age',
    'credit_score',
    'savings_ratio',
    'debt_to_income',
    'credit_utilization',
]
categorical_features = ['business_sector', 'risk_category', 'customer_segment']

# Preprocessing: standardize the numerical columns and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

**Train-Test Split**

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training rows only, then reuse the fitted
# transform on the test rows so no test-set statistics influence the fit
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

**Model Architecture & Training**

In [10]:
# Configure the MLP regressor: two hidden layers (64 then 32 units), ReLU
# activations, Adam optimizer, up to 1000 iterations with early stopping
# enabled, and a fixed seed for repeatable training.
mlp = MLPRegressor(
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(64, 32),
    early_stopping=True,
    max_iter=1000,
    random_state=42,
)

# NOTE(review): y_train is passed as-is (unscaled); standardizing the target
# (e.g. via sklearn.compose.TransformedTargetRegressor) may help convergence — confirm.
# Fit on the preprocessed training features
mlp.fit(X_train_transformed, y_train)

**Model Evaluation**

In [11]:
# Predict on the held-out, preprocessed test features
y_pred = mlp.predict(X_test_transformed)

# Regression metrics on the test set; RMSE is the square root of the MSE
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the scores (the header has no placeholders, so it is a plain string)
print("--- MLP Model Performance ---")
print(f"R-squared Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")

--- MLP Model Performance ---
R-squared Score: 0.2671
Mean Absolute Error (MAE): $21261.07
Root Mean Squared Error (RMSE): $27650.29
