Load necessary packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from utils.data_utils import *

Loading the cleaned dataset

In [2]:
train_df = pd.read_csv('train_clean.csv')
X_test = pd.read_csv('test_clean.csv')

In [3]:
# split the train df into train and val
X = train_df.drop('monthly_rent', axis=1)
y = train_df['monthly_rent']

In [4]:
# split it into train and val datasets from the train_df
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

First look at X_train and y_train

In [5]:
X_train.head()

Unnamed: 0,rent_approval_date,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall,...,town_pasir ris,town_punggol,town_queenstown,town_sembawang,town_sengkang,town_serangoon,town_tampines,town_toa payoh,town_woodlands,town_yishun
49578,0.866081,0.5,0.39779,0.603774,1.308722,103.796751,0.147207,0.3264,0.507243,0.097611,...,False,False,True,False,False,False,False,False,False,False
50763,0.767289,0.75,0.563536,0.339623,1.346522,103.734843,0.201964,0.068745,0.346423,0.682251,...,False,False,False,False,False,False,False,False,False,False
24147,0.933041,0.75,0.491713,0.490566,1.367566,103.951903,0.260956,0.060712,0.028869,0.249039,...,True,False,False,False,False,False,False,False,False,False
13290,0.599341,0.25,0.21547,0.396226,1.363576,103.745977,0.336514,0.071959,0.14546,0.647535,...,False,False,False,False,False,False,False,False,False,False
17890,0.198683,0.75,0.558011,0.54717,1.346176,103.757834,0.385762,0.162398,0.06242,0.461309,...,False,False,False,False,False,False,False,False,False,False


In [6]:
y_train.head()

49578    4000
50763    4000
24147    3550
13290    2750
17890    2000
Name: monthly_rent, dtype: int64

Using GridSearchCV to get best model - The below code makes use of the GridSearchCV in order to find the optimal hyper parameters. A pipeline is used here, which consists of Standard Scaler for feature scaling and an SVR(Support Vector Regressor) model. Param_grid specifies the hyper parameters of the models to tune. We specify the pipeline, the hyper parameter grid, the number of cross-validation folds(cv), the scoring metric, which is root mean squared error here, and verbsity.

In [5]:
# Create a pipeline for data preprocessing and model
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('model', SVR())  # Support Vector Machine Regressor
])

param_grid = {
    'model__C': [0.01, 0.1, 1],  # Regularization parameter
    'model__kernel': ['rbf'],  # Kernel type (linear or radial basis function)
    'model__gamma': [0.1, 1, 'auto']  # Kernel coefficient for 'rbf' kernel
}

# Create GridSearchCV with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3)

# Fit the model to the training data while searching for the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best estimator (model)
best_model = grid_search.best_estimator_

# Make predictions on the test data
y_pred = best_model.predict(X_val)

# Evaluate the model's performance
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
print("Best Hyperparameters:", grid_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END model__C=0.01, model__gamma=0.1, model__kernel=rbf;, score=-544046.431 total time= 2.7min
[CV 2/5] END model__C=0.01, model__gamma=0.1, model__kernel=rbf;, score=-551290.569 total time= 2.6min
[CV 3/5] END model__C=0.01, model__gamma=0.1, model__kernel=rbf;, score=-555532.456 total time= 2.6min
[CV 4/5] END model__C=0.01, model__gamma=0.1, model__kernel=rbf;, score=-508283.656 total time= 2.6min
[CV 5/5] END model__C=0.01, model__gamma=0.1, model__kernel=rbf;, score=-564347.296 total time= 2.8min
[CV 1/5] END model__C=0.01, model__gamma=1, model__kernel=rbf;, score=-546617.837 total time= 3.9min
[CV 2/5] END model__C=0.01, model__gamma=1, model__kernel=rbf;, score=-553654.115 total time= 3.9min
[CV 3/5] END model__C=0.01, model__gamma=1, model__kernel=rbf;, score=-557604.794 total time= 3.9min
[CV 4/5] END model__C=0.01, model__gamma=1, model__kernel=rbf;, score=-509002.404 total time= 3.8min
[CV 5/5] END model__C

Making predictions on X_test using the best SVM model

In [9]:
y_pred = best_model.predict(X_test)

save_test_predictions_in_kaggle_format(y_pred, "SVM", True)

Unnamed: 0,Id,Predicted
0,0,2822.229715
1,1,2532.028558
2,2,2807.762254
3,3,2216.400079
4,4,2630.054019
...,...,...
29995,29995,2678.942455
29996,29996,2603.261625
29997,29997,2604.903743
29998,29998,2849.078478
