In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import HistGradientBoostingRegressor

In [69]:
df_train = pd.read_csv("../data/train_new.csv")
print('Number of train records: {}'.format(len(df_train)))
df_train.head()

Number of train records: 60000


Unnamed: 0,rent_approval_date,town_0,town_1,town_2,town_3,town_4,flat_model,floor_area_sqm,lease_commence_date,latitude,...,flat_type_3-room,flat_type_4-room,flat_type_executive,flat_type_5-room,flat_type_2-room,central region,east region,north region,north-east region,west region
0,0.038835,0,0,0,0,1,2,67.0,1983,1.344518,...,1,0,0,0,0,0,0,0,0,1
1,0.504854,0,0,0,1,0,2,92.0,1978,1.330186,...,0,1,0,0,0,0,1,0,0,0
2,0.529126,0,0,0,1,1,7,67.0,1971,1.332242,...,1,0,0,0,0,1,0,0,0,0
3,0.033981,0,0,1,0,0,9,149.0,1993,1.370239,...,0,0,1,0,0,0,1,0,0,0
4,0.533981,0,0,1,0,1,7,68.0,1972,1.320502,...,1,0,0,0,0,1,0,0,0,0


In [70]:
df_test = pd.read_csv("../data/test_new.csv")
print('Number of test records: {}'.format(len(df_test)))
df_test.head()

Number of test records: 30000


Unnamed: 0,rent_approval_date,town_0,town_1,town_2,town_3,town_4,flat_model,floor_area_sqm,lease_commence_date,latitude,...,flat_type_3-room,flat_type_4-room,flat_type_executive,flat_type_5-room,flat_type_2-room,central region,east region,north region,north-east region,west region
0,0.970874,1,0,1,1,1,7,121.0,1984,1.358411,...,0,0,0,1,0,0,0,0,1,0
1,0.524272,0,1,1,1,0,6,100.0,1999,1.446343,...,0,1,0,0,0,0,0,1,0,0
2,1.0,0,1,1,0,0,2,91.0,1980,1.305719,...,0,1,0,0,0,0,0,0,0,1
3,0.033981,0,0,0,0,1,6,74.0,1986,1.344832,...,1,0,0,0,0,0,0,0,0,1
4,0.495146,0,0,0,0,1,7,121.0,1983,1.345437,...,0,0,0,1,0,0,0,0,0,1


In [115]:
X = df_train.drop('monthly_rent', axis=1)
y =  df_train['monthly_rent']
X_test = df_test

# initialize models

In [110]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso

lr_model =  Pipeline([
    ('scaler', MinMaxScaler()),  # Normalize the features
    ('regressor', LinearRegression())  # Random Forest Regressor model
])

ridge_model = Pipeline([
    ('scaler', MinMaxScaler()),  # Normalize the features
    ('regressor', Ridge(alpha=1.0))  # Random Forest Regressor model
])

lasso_model = Pipeline([
    ('scaler', MinMaxScaler()),  # Normalize the features
    ('regressor', Lasso(alpha=1.0))  # Random Forest Regressor model
])

rf_model = RandomForestRegressor()

In [123]:
from sklearn.model_selection import cross_val_score

lr_scores = cross_val_score(lr_model, X, y, scoring='neg_mean_squared_error', cv=5)
average_lr_mse = -lr_scores.mean()
print('lr', average_lr_mse)

ridge_scores = cross_val_score(ridge_model, X, y, scoring='neg_mean_squared_error', cv=5)
average_ridge_mse = -ridge_scores.mean()
print('ridge', average_ridge_mse)

lasso_scores = cross_val_score(lasso_model, X, y, scoring='neg_mean_squared_error', cv=5)
average_lasso_mse = -lasso_scores.mean()
print('lasso', average_lasso_mse)

lr 266205.7218890253
ridge 266201.5742625672
lasso 266762.8102323966


## simple linear regression seems behave bad, consider trying feature selection first.

# use random forest to find importances of the features

In [104]:
rf_model.fit(X_train, y_train)

RandomForestRegressor()

In [107]:
from sklearn.feature_selection import SelectFromModel

feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
})


In [109]:
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)
feature_importances_df

Unnamed: 0,Feature,Importance
0,rent_approval_date,0.33664
7,floor_area_sqm,0.166977
9,latitude,0.135723
13,min_distance,0.094897
10,longitude,0.083658
8,lease_commence_date,0.056025
11,avg_stock_price,0.04473
12,num_shopping_malls,0.022243
6,flat_model,0.016364
19,central region,0.015206


# Select features with high importance to train linear regression model

In [122]:

selected_features = feature_importances_df['Feature'].head(10).tolist()

X_train_selected = X[selected_features]
X_test_selected =  X_test[selected_features]


lr_scores = cross_val_score(lr_model, X_train_selected, y, scoring='neg_mean_squared_error', cv=5)
average_lr_mse = -lr_scores.mean()
print('lr', average_lr_mse)

ridge_scores = cross_val_score(ridge_model, X_train_selected, y, scoring='neg_mean_squared_error', cv=5)
average_ridge_mse = -ridge_scores.mean()
print('ridge', average_ridge_mse)

lasso_scores = cross_val_score(lasso_model, X_train_selected, y, scoring='neg_mean_squared_error', cv=5)
average_lasso_mse = -lasso_scores.mean()
print('lasso', average_lasso_mse)

lr 279490.80110505165
ridge 279490.4586607524
lasso 279945.37293419


# no improvement. ok..

In [129]:
lr_model.fit(X, y)
y_pred = lr_model.predict(X_test)

In [130]:
df = pd.DataFrame(y_pred, columns =['Predicted'], dtype = float) 
df.head()

Unnamed: 0,Predicted
0,3208.5
1,2620.5
2,3293.0
3,1904.0
4,2890.0


In [131]:
df.to_csv("../data/lr_output.csv", index_label='Id')