In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingRandomSearchCV


In [None]:
df = pd.read_csv('../Models/Real_Estate_Model.csv')

In [3]:
df.dtypes

status             object
price             float64
bed                 int64
bath                int64
city               object
state              object
zip_code          float64
house_size        float64
Price_per_sqft    float64
dtype: object

In [7]:
#change data type of city, state, and zip_code to category
df['city'] = df['city'].astype('category')
df['state'] = df['state'].astype('category')
df['zip_code'] = df['zip_code'].astype('category')
df['bed'] = df['bed'].astype(int)
df['bath'] = df['bath'].astype(int)

In [8]:
df[['price', 'house_size', 'bed', 'bath']].corr()

Unnamed: 0,price,house_size,bed,bath
price,1.0,0.460242,0.275841,0.461083
house_size,0.460242,1.0,0.659245,0.724302
bed,0.275841,0.659245,1.0,0.573037
bath,0.461083,0.724302,0.573037,1.0


In [9]:
# Select features (X) and target (y)
X = df[['bath', 'bed', 'house_size', 'state', 'city']].copy()  # Explicitly create a copy
y = df['price']  # Target variable

# Encode categorical variables
label_encoder_state = LabelEncoder()
label_encoder_city = LabelEncoder()

X['state'] = label_encoder_state.fit_transform(X['state'])
X['city'] = label_encoder_city.fit_transform(X['city'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Create the MLP model
mlp = MLPRegressor(
    hidden_layer_sizes=(64, 32),  # Two hidden layers with 64 and 32 neurons
    activation='relu',           # Activation function
    solver='adam',               # Optimizer
    max_iter=500,     
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,           # Maximum number of iterations
    random_state=42
)

# Train the model
mlp.fit(X_train, y_train)

In [11]:
y_pred = mlp.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mape)

Mean Squared Error: 25916166748.622704
R^2 Score: 0.4487242577066005
Root Mean Squared Error: 160984.9892027909
Mean Absolute Error: 120943.42982411437
Mean Absolute Percentage Error: 0.42822387893158276


Hyper-parameter tuning

In [12]:
param_dist = {
    'hidden_layer_sizes': [(64, 32), (128, 64)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': uniform(0.01),
    'learning_rate_init': uniform(0.01)
}

halving_search = HalvingRandomSearchCV(
    estimator=mlp,
    param_distributions=param_dist,
    n_candidates=20,
    scoring='neg_mean_absolute_percentage_error',
    cv=2,
    factor=3,  # default: reduce candidates by a factor of 3 in each iteration
    resource='n_samples',
    max_resources=100000,
    min_resources='smallest',
    aggressive_elimination=True,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

halving_search.fit(X_train, y_train)

print("Best Parameters:", halving_search.best_params_)
print("Best MAPE Score:", abs(halving_search.best_score_))

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 10
min_resources_: 4
max_resources_: 100000
aggressive_elimination: True
factor: 3
----------
iter: 0
n_candidates: 20
n_resources: 8748
Fitting 2 folds for each of 20 candidates, totalling 40 fits
----------
iter: 1
n_candidates: 7
n_resources: 26244
Fitting 2 folds for each of 7 candidates, totalling 14 fits
----------
iter: 2
n_candidates: 3
n_resources: 78732
Fitting 2 folds for each of 3 candidates, totalling 6 fits
Best Parameters: {'activation': 'relu', 'alpha': np.float64(0.4419450186421158), 'hidden_layer_sizes': (64, 32), 'learning_rate_init': np.float64(0.5347746602583892), 'solver': 'adam'}
Best MAPE Score: 0.44507388963043965


Hyperparameter tuning confirmed that the parameters used in the base model were optimal, as they were ultimately selected as the best. Since LGBM outperformed the other models when it comes to performance and efficiency, I will proceed with it for the app deployment.