# Model Playgrownd

In [2]:
import numpy
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
from sklearn.preprocessing import StandardScaler

In [2]:
def preprocess_data(data):
    def encode(df):
        categorical = ["flat_type", "storey_range", "flat_model"]

        encoder = OneHotEncoder(sparse_output=False)
        encoded_data = encoder.fit_transform(df[categorical])
        encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical))
        df = df.drop(columns=categorical).join(encoded_df)
        return df
    
    def convert_time_columns(df):
        reference_date = datetime(1966, 1, 1)
        df['month'] = pd.to_datetime(df['month'], format='%Y-%m')
        df['month_seconds'] = (df['month'] - reference_date).dt.total_seconds()
        df['lease_commence_date'] = pd.to_datetime(df['lease_commence_date'], format='%Y')
        df['lease_commence_date_seconds'] = (df['lease_commence_date'] - reference_date).dt.total_seconds()
        def calculate_lease_end(row):
            try:
                years, months = 0, 0
                parts = row['remaining_lease'].split()
                if 'years' in parts:
                    years = int(parts[parts.index('years') - 1])
                if 'months' in parts:
                    months = int(parts[parts.index('months') - 1])
                
                start_date = row['month']
                end_date = start_date + pd.DateOffset(years=years, months=months)
                return (end_date - reference_date).total_seconds()
            except Exception as e:
                print(f"Error processing row: {row}, error: {e}")
                return None
        df['remaining_lease_seconds'] = df.apply(calculate_lease_end, axis=1)
        df.drop(columns=['month', 'lease_commence_date', 'remaining_lease'], inplace=True)
        df.rename(columns={
            'month_seconds': 'month',
            'lease_commence_date_seconds': 'lease_commence_date',
            'remaining_lease_seconds': 'remaining_lease'
        }, inplace=True)

        return df
    
    def scale_columns(df):
        to_be_scaled = ['floor_area_sqm', 'month', 'lease_commence_date', 'remaining_lease']
        scaler = StandardScaler()
        df[to_be_scaled] = scaler.fit_transform(df[to_be_scaled])
        return df
    
    def get_coordinates(df):
        def create_address_string(row):
            return f"{row['town']}, {row['street_name']}, block {row['block']}, Singapore"

        def make_full_address(df):
            df['full_address'] = df.apply(create_address_string, axis=1)
            df = df.drop(columns=['town', 'block', 'street_name'])
            return df
        
        coord_df = pd.read_csv("../data/coordinates.csv", index_col='full_address')

        def get_coordinate(full_addr):
            try:
                result = coord_df.loc[full_addr]
                return numpy.float64(result['latitude']), numpy.float64(result['longitude'])
            except KeyError:
                return numpy.nan, numpy.nan
        
        df = make_full_address(df)
        df[['latitude', 'longitude']] = df['full_address'].apply(lambda addr: pd.Series(get_coordinate(addr)))
        df = df.drop(columns= 'full_address')

        return df
    

    data = encode(data)
    data = convert_time_columns(data)
    data = scale_columns(data)
    data = get_coordinates(data)
    data['latitude'] = pd.to_numeric(data['latitude'], errors='coerce')
    data['longitude'] = pd.to_numeric(data['longitude'], errors='coerce')
    data = data.dropna()

    X = data.drop(columns=['resale_price'])
    y = data['resale_price']
    
    return X, y

In [3]:
X = pd.read_csv('../data/preprocessed/X.csv')
y = pd.read_csv('../data/preprocessed/y.csv')
X, y

(       Unnamed: 0  flat_type_1 ROOM  flat_type_2 ROOM  flat_type_3 ROOM  \
 0               0               0.0               0.0               0.0   
 1               1               0.0               0.0               0.0   
 2               2               0.0               0.0               1.0   
 3               3               0.0               0.0               0.0   
 4               4               0.0               0.0               0.0   
 ...           ...               ...               ...               ...   
 36106       36248               0.0               0.0               0.0   
 36107       36249               0.0               0.0               1.0   
 36108       36250               0.0               0.0               0.0   
 36109       36251               0.0               0.0               0.0   
 36110       36252               0.0               0.0               0.0   
 
        flat_type_4 ROOM  flat_type_5 ROOM  flat_type_EXECUTIVE  \
 0                 

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, random_state= 42, shuffle= True)

In [5]:
from sklearn.metrics import mean_squared_error
import numpy as np

def evaluate_model_rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"RMSE: {rmse}")
    return rmse

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the model
rf = RandomForestRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2]
}

# Perform Grid Search
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=3)
grid_search_rf.fit(X_train, y_train)

# Best parameters and score
print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best score for Random Forest:", -grid_search_rf.best_score_)

# Predict and evaluate
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)
evaluate_model_rmse(y_test, y_pred_rf)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=-23104.514 total time=  11.2s
[CV 2/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=-22461.807 total time=  11.1s
[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=-21988.956 total time=  10.8s
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=-23129.725 total time=  10.9s
[CV 5/5] END max_depth=None, min_samples_split=2, n_estimators=100;, score=-22713.766 total time=  10.9s
[CV 1/5] END max_depth=None, min_samples_split=2, n_estimators=200;, score=-23141.565 total time=  22.1s
[CV 2/5] END max_depth=None, min_samples_split=2, n_estimators=200;, score=-22268.408 total time=  21.8s
[CV 3/5] END max_depth=None, min_samples_split=2, n_estimators=200;, score=-22083.275 total time=  21.8s
[CV 4/5] END max_depth=None, min_samples_split=2, n_estimators=200;, score=-23125.344 total time=  2

27413.629108535675

In [1]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp311-cp311-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5
Note: you may need to restart the kernel to use updated packages.


In [10]:
from catboost import CatBoostRegressor

# Define the model
catboost = CatBoostRegressor(verbose=2)

# Define the parameter grid
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [3, 5, 7]
}

# Perform Grid Search
grid_search_catboost = GridSearchCV(estimator=catboost, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose= 2)
grid_search_catboost.fit(X_train, y_train)

# Best parameters and score
print("Best parameters for CatBoost:", grid_search_catboost.best_params_)
print("Best score for CatBoost:", -grid_search_catboost.best_score_)

# Predict and evaluate
y_pred_catboost = grid_search_catboost.best_estimator_.predict(X_test)
evaluate_model_rmse(y_test, y_pred_catboost)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
0:	learn: 172303.8582215	total: 948us	remaining: 93.9ms
2:	learn: 170733.7491422	total: 2.5ms	remaining: 80.8ms
4:	learn: 169207.0725474	total: 3.94ms	remaining: 75ms
6:	learn: 167724.3953623	total: 5.73ms	remaining: 76.1ms
8:	learn: 166224.4683142	total: 7.3ms	remaining: 73.9ms
10:	learn: 164773.8775079	total: 8.9ms	remaining: 72ms
12:	learn: 163373.7355438	total: 10.5ms	remaining: 70.4ms
14:	learn: 161998.3426450	total: 12.3ms	remaining: 69.8ms
16:	learn: 160669.7102090	total: 14.7ms	remaining: 71.7ms
18:	learn: 159334.5088675	total: 16.6ms	remaining: 70.6ms
20:	learn: 158077.1399015	total: 18.3ms	remaining: 68.8ms
22:	learn: 156823.4942462	total: 20ms	remaining: 66.8ms
24:	learn: 155586.2273673	total: 21.9ms	remaining: 65.6ms
26:	learn: 154382.0002171	total: 24.3ms	remaining: 65.7ms
28:	learn: 153210.2241311	total: 26.6ms	remaining: 65.1ms
30:	learn: 152050.1538430	total: 28.4ms	remaining: 63.2ms
32:	learn: 150927.1107062

31341.001716914394

In [15]:
grid_search_catboost.best_estimator_.score(X_test, y_test)

0.967294301060011

In [16]:
from sklearn.ensemble import GradientBoostingRegressor

# Define the model
gbr = GradientBoostingRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform Grid Search
grid_search_gbr = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose= 2)
grid_search_gbr.fit(X_train, y_train)

print("Best parameters for Gradient Boosting:", grid_search_gbr.best_params_)
print("Best score for Gradient Boosting:", -grid_search_gbr.best_score_)

# Predict and evaluate
y_pred_gbr = grid_search_gbr.best_estimator_.predict(X_test)
evaluate_model_rmse(y_test, y_pred_gbr)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=   5.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_e

31521.372664483515

In [18]:
grid_search_gbr.best_estimator_.score(X_test, y_test)

0.96691676794114

In [1]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-nccl-cu12
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.22.3 xgboost-2.1.0
Note: you may need to restart the kernel to use updated packages.


In [11]:
import xgboost as xgb

# Define the model
xgbr = xgb.XGBRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [400, 500, 600, 700],
    'learning_rate': [0.05, 0.1, 0.2, 0.3],
    'max_depth': [5, 7, 9]
}

# Perform Grid Search
grid_search_xgbr = GridSearchCV(estimator=xgbr, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=3)
grid_search_xgbr.fit(X_train, y_train)

# Best parameters and score
print("Best parameters for XGBoost:", grid_search_xgbr.best_params_)
print("Best score for XGBoost:", -grid_search_xgbr.best_score_)

# Predict and evaluate
y_pred_xgbr = grid_search_xgbr.best_estimator_.predict(X_test)
evaluate_model_rmse(y_test, y_pred_xgbr)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END learning_rate=0.05, max_depth=5, n_estimators=400;, score=-38070.848 total time=   0.9s
[CV 2/5] END learning_rate=0.05, max_depth=5, n_estimators=400;, score=-36844.012 total time=   1.0s
[CV 3/5] END learning_rate=0.05, max_depth=5, n_estimators=400;, score=-37118.105 total time=   1.0s
[CV 4/5] END learning_rate=0.05, max_depth=5, n_estimators=400;, score=-37124.519 total time=   0.9s
[CV 5/5] END learning_rate=0.05, max_depth=5, n_estimators=400;, score=-36726.434 total time=   0.9s
[CV 1/5] END learning_rate=0.05, max_depth=5, n_estimators=500;, score=-36903.561 total time=   1.3s
[CV 2/5] END learning_rate=0.05, max_depth=5, n_estimators=500;, score=-35622.688 total time=   1.3s
[CV 3/5] END learning_rate=0.05, max_depth=5, n_estimators=500;, score=-35747.089 total time=   1.2s
[CV 4/5] END learning_rate=0.05, max_depth=5, n_estimators=500;, score=-36024.296 total time=   1.2s
[CV 5/5] END learning_rate=0.

30077.591827930286

In [12]:
grid_search_xgbr.best_estimator_.score(X_test, y_test)

0.9698779995253285

In [2]:
%pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.4.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Installing collected packages: lightgbm
Successfully installed lightgbm-4.4.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
import lightgbm as lgb

# Define the model
lgbm = lgb.LGBMRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Perform Grid Search
grid_search_lgbm = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search_lgbm.fit(X_train, y_train)

# Best parameters and score
print("Best parameters for LightGBM:", grid_search_lgbm.best_params_)
print("Best score for LightGBM:", -grid_search_lgbm.best_score_)

# Predict and evaluate
y_pred_lgbm = grid_search_lgbm.best_estimator_.predict(X_test)
evaluate_model_rmse(y_test, y_pred_lgbm)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1114
[LightGBM] [Info] Number of data points in the train set: 19355, number of used features: 38
[LightGBM] [Info] Start training from score 498373.092140
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1114
[LightGBM] [Info] Number of data points in the train set: 19355, number of used features: 38
[LightGBM] [Info] Start training from score 499650.470899
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 19355, number of used features: 38
[LightGBM] [Info]

31163.94775282567