Importing required packages

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer, pairwise_distances
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from utils.data_utils import *

Loading the cleaned dataset

In [2]:
train_df = pd.read_csv('train_clean.csv')
X_test = pd.read_csv('test_clean.csv')

In [3]:
# split the train df into train and val
X = train_df.drop('monthly_rent', axis=1)
y = train_df['monthly_rent']

In [4]:
# split it into train and val datasets from the train_df
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

First look at X_train and y_train

In [5]:
X_train.head()

Unnamed: 0,rent_approval_date,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall,...,town_pasir ris,town_punggol,town_queenstown,town_sembawang,town_sengkang,town_serangoon,town_tampines,town_toa payoh,town_woodlands,town_yishun
49578,0.866081,0.5,0.39779,0.603774,1.308722,103.796751,0.147207,0.3264,0.507243,0.097611,...,False,False,True,False,False,False,False,False,False,False
50763,0.767289,0.75,0.563536,0.339623,1.346522,103.734843,0.201964,0.068745,0.346423,0.682251,...,False,False,False,False,False,False,False,False,False,False
24147,0.933041,0.75,0.491713,0.490566,1.367566,103.951903,0.260956,0.060712,0.028869,0.249039,...,True,False,False,False,False,False,False,False,False,False
13290,0.599341,0.25,0.21547,0.396226,1.363576,103.745977,0.336514,0.071959,0.14546,0.647535,...,False,False,False,False,False,False,False,False,False,False
17890,0.198683,0.75,0.558011,0.54717,1.346176,103.757834,0.385762,0.162398,0.06242,0.461309,...,False,False,False,False,False,False,False,False,False,False


In [6]:
y_train.head()

49578    4000
50763    4000
24147    3550
13290    2750
17890    2000
Name: monthly_rent, dtype: int64

In [7]:
def save_test_predictions_in_kaggle_format(array, experiment=None, save=False):
    # Create a DataFrame with "Id" and "Predicted" columns
    df = pd.DataFrame({'Id': range(len(array)), 'Predicted': array})

    if save:
        assert experiment != None
        # Save the DataFrame to a CSV file
        df.to_csv(f"{experiment}-submission.csv", index=False)

    return df

Using GridSearchCV to get best model

In [8]:
# Doing grid search
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False],
    'copy_X': [True],
    'n_jobs': [None, -1],
}


# Create the Linear Regression regressor
model = LinearRegression()

# Define the scoring metric (MSE)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Create the GridSearchCV object with verbose logging
grid_search = GridSearchCV(model, param_grid, scoring=mse_scorer, cv=10, verbose=4)

# Fit the model to the data (X_train, y_train)
grid_search.fit(X_train, y_train)

# Get the best estimator (model)
best_linear_regressor = grid_search.best_estimator_

# Make predictions on the validation data
y_pred = best_linear_regressor.predict(X_val)

# Calculate the Root Mean Squared Error (RMSE)
rms = mean_squared_error(y_val, y_pred) ** 0.5

# Print the best hyperparameters and RMSE
print("Best Hyperparameters:", grid_search.best_params_)
print(f"Root Mean Squared Error: {rms}")


Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV 1/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-260509.370 total time=   0.3s
[CV 2/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-258574.337 total time=   0.3s
[CV 3/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-270041.336 total time=   0.3s
[CV 4/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-263266.268 total time=   0.3s
[CV 5/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-259633.387 total time=   0.3s
[CV 6/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-264391.432 total time=   0.3s
[CV 7/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-254230.753 total time=   0.3s
[CV 8/10] END copy_X=True, fit_intercept=True, n_jobs=None, positive=True;, score=-267841.125 total time=   0.3s
[CV 9/10] END copy_X=True, fit_inte

Making predictions on X_test using the best linear regressor

In [9]:
y_pred_test = best_linear_regressor.predict(X_test)

save_test_predictions_in_kaggle_format(y_pred_test, "Linear-Regression", True)

Unnamed: 0,Id,Predicted
0,0,3106.846239
1,1,2673.968348
2,2,3323.360515
3,3,1912.389619
4,4,2787.199251
...,...,...
29995,29995,2807.784869
29996,29996,2773.745638
29997,29997,2810.657691
29998,29998,3398.259834
