# Load necessary packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, make_scorer, pairwise_distances
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from utils.data_utils import *

# Loading the cleaned dataset

In [16]:
# Read the pre-cleaned training table and the pre-cleaned test features from CSV files
# located in the current working directory.
train_df = pd.read_csv(filepath_or_buffer='train_clean.csv')
X_test = pd.read_csv(filepath_or_buffer='test_clean.csv')

In [4]:
# Separate the target column (monthly_rent) from the predictor columns
y = train_df['monthly_rent']
X = train_df.drop(columns=['monthly_rent'])

In [5]:
# Hold out 20% of the labelled rows as a validation set; the fixed seed keeps
# the partition identical across re-runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# First look at X_train and y_train

In [6]:
# Preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,rent_approval_date,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall,...,town_pasir ris,town_punggol,town_queenstown,town_sembawang,town_sengkang,town_serangoon,town_tampines,town_toa payoh,town_woodlands,town_yishun
49578,0.866081,0.5,0.39779,0.603774,1.308722,103.796751,0.147207,0.3264,0.507243,0.097611,...,False,False,True,False,False,False,False,False,False,False
50763,0.767289,0.75,0.563536,0.339623,1.346522,103.734843,0.201964,0.068745,0.346423,0.682251,...,False,False,False,False,False,False,False,False,False,False
24147,0.933041,0.75,0.491713,0.490566,1.367566,103.951903,0.260956,0.060712,0.028869,0.249039,...,True,False,False,False,False,False,False,False,False,False
13290,0.599341,0.25,0.21547,0.396226,1.363576,103.745977,0.336514,0.071959,0.14546,0.647535,...,False,False,False,False,False,False,False,False,False,False
17890,0.198683,0.75,0.558011,0.54717,1.346176,103.757834,0.385762,0.162398,0.06242,0.461309,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Preview the first five target values (monthly_rent) of the training split
y_train.head(n=5)

49578    4000
50763    4000
24147    3550
13290    2750
17890    2000
Name: monthly_rent, dtype: int64

# Using GridSearchCV to find the best model

In [11]:
# Hyperparameter search space: 2 * 3 * 3 * 3 * 1 = 54 candidate combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt'],
)

# Base extra-trees regressor; the grid above supplies n_estimators for each
# candidate, and random_state pins the randomness of the ensemble.
model = ExtraTreesRegressor(random_state=0, n_estimators=100)

# MSE wrapped as a scorer; greater_is_better=False makes the reported scores
# negative MSE so that "higher is better" holds during model selection.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validated exhaustive search with per-fit progress logging
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=5,
    verbose=3,
)
grid_search.fit(X_train, y_train)

# Winning configuration from the search
best_extra_tree_regressor = grid_search.best_estimator_

# Evaluate on the validation split, which the search never saw
y_pred = best_extra_tree_regressor.predict(X_val)
rms = mean_squared_error(y_val, y_pred) ** 0.5  # RMSE = sqrt(MSE)

# Report the selected hyperparameters and the validation RMSE
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Root Mean Squared Error: {rms}")


Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-259917.662 total time=   1.5s
[CV 2/5] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-262612.920 total time=   1.5s
[CV 3/5] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-258862.724 total time=   1.5s
[CV 4/5] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-255877.956 total time=   1.5s
[CV 5/5] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=-271580.472 total time=   1.7s
[CV 1/5] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=-259130.500 total time=   3.7s
[CV 2/5] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_sp

# Results obtained

Best Hyperparameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}

Root Mean Squared Error: 486.9619342942999

# Making predictions on X_test using the best extra-trees regressor

In [14]:
# Predict the target for the test features with the tuned extra-trees model.
# NOTE: this rebinds y_pred, which previously held the validation predictions.
y_pred = best_extra_tree_regressor.predict(X_test)

# Helper comes from `utils.data_utils` (star import); presumably it writes the
# predictions as a Kaggle submission labelled "Extra-Tree-Regression".
# NOTE(review): the meaning of the third positional argument (True) is not
# visible here — confirm against the helper's definition.
save_test_predictions_in_kaggle_format(y_pred, "Extra-Tree-Regression", True)

Unnamed: 0,Id,Predicted
0,0,3184.528553
1,1,2756.905670
2,2,3541.841460
3,3,1987.271219
4,4,2567.384394
...,...,...
29995,29995,2950.536718
29996,29996,2891.700439
29997,29997,2656.544013
29998,29998,3238.637614
