# Importing required packages

In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error, make_scorer, pairwise_distances
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from utils.data_utils import *

# Loading the cleaned dataset

In [3]:
# Load both cleaned CSVs in one pass: the first becomes train_df, the second X_test.
train_df, X_test = (
    pd.read_csv(csv_name) for csv_name in ('train_clean.csv', 'test_clean.csv')
)

In [4]:
# Separate the target from the predictors:
# y is the 'monthly_rent' column, X is every other column of train_df.
y = train_df['monthly_rent']
X = train_df.drop(columns=['monthly_rent'])

In [5]:
# Hold out 20% of the rows as a validation set; the fixed seed makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# First look at X_train and y_train

In [6]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,rent_approval_date,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall,...,town_pasir ris,town_punggol,town_queenstown,town_sembawang,town_sengkang,town_serangoon,town_tampines,town_toa payoh,town_woodlands,town_yishun
49578,0.866081,0.5,0.39779,0.603774,1.308722,103.796751,0.147207,0.3264,0.507243,0.097611,...,False,False,True,False,False,False,False,False,False,False
50763,0.767289,0.75,0.563536,0.339623,1.346522,103.734843,0.201964,0.068745,0.346423,0.682251,...,False,False,False,False,False,False,False,False,False,False
24147,0.933041,0.75,0.491713,0.490566,1.367566,103.951903,0.260956,0.060712,0.028869,0.249039,...,True,False,False,False,False,False,False,False,False,False
13290,0.599341,0.25,0.21547,0.396226,1.363576,103.745977,0.336514,0.071959,0.14546,0.647535,...,False,False,False,False,False,False,False,False,False,False
17890,0.198683,0.75,0.558011,0.54717,1.346176,103.757834,0.385762,0.162398,0.06242,0.461309,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Preview the first five training targets (same row order as X_train).
y_train.head(n=5)

49578    4000
50763    4000
24147    3550
13290    2750
17890    2000
Name: monthly_rent, dtype: int64

# Fit the model

In [6]:
# Polynomial order of the feature expansion; raise or lower to tune model capacity.
degree = 2

# Fit the expansion on the training rows only, then apply that same fitted
# transformer to both splits so validation data never influences it.
poly = PolynomialFeatures(degree=degree).fit(X_train)
X_train_poly, X_val_poly = poly.transform(X_train), poly.transform(X_val)

# Linear regression on the expanded training features.
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Predict on X_train and X_val

In [7]:
# Run the fitted model over both expanded splits; the tuple order matches
# the unpacking targets (train first, validation second).
y_train_pred, y_val_pred = (
    model.predict(split_features) for split_features in (X_train_poly, X_val_poly)
)

# Calculate RMSE

In [8]:
# RMSE for each split, computed as the square root of the mean squared error.
# The `squared=False` shortcut is avoided on purpose: scikit-learn deprecated
# that keyword in 1.4 and removed it in 1.6, so passing it fails on current
# releases. Taking the root explicitly works on every version and needs no
# additional import.
rms_train = mean_squared_error(y_train, y_train_pred) ** 0.5
rms_val = mean_squared_error(y_val, y_val_pred) ** 0.5

In [9]:
# Report the two errors, one line each (label followed by the value).
for rmse_caption, rmse_value in (("rmse train = ", rms_train), ("rms val = ", rms_val)):
    print(rmse_caption, rmse_value)

rmse train =  484.9438816351702
rms val =  484.0383976317791


# Predict on X_test

In [10]:
# Reuse the transformer fitted on the training rows; it is applied, not refitted, here.
X_test_poly = poly.transform(X=X_test)

In [11]:
# Predictions of the fitted regression for the expanded test features.
y_pred_test = model.predict(X=X_test_poly)

# Save the predictions

In [14]:
# Hand the test predictions to the project helper together with the label
# "Polynomial-Regression".
# NOTE(review): the helper is not defined in this notebook — presumably it comes
# from the wildcard import of utils.data_utils; confirm. The meaning of the
# third positional argument (True) is not visible here either — check the
# helper's signature before changing it.
save_test_predictions_in_kaggle_format(y_pred_test, "Polynomial-Regression", True)

Unnamed: 0,Id,Predicted
0,0,3161.272846
1,1,2730.308002
2,2,3521.933002
3,3,1912.933002
4,4,2629.933002
...,...,...
29995,29995,2872.214252
29996,29996,2794.933002
29997,29997,2767.409565
29998,29998,3218.183002
