In [1]:
# import libraries
import numpy as np
import pandas as pd

import matplotlib as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb

from sklearn.model_selection import train_test_split
import time


In [2]:
# Location of the resale flat price CSV, relative to this notebook.
file_path = '../raw_data/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv'

# Read the CSV, parsing the 'month' column as datetimes instead of plain strings.
df = pd.read_csv(
    filepath_or_buffer=file_path,
    parse_dates=['month'],
)

In [3]:
# Separate the target column from the remaining feature columns.
y = df['resale_price']
X = df.drop(columns='resale_price')

# Display the feature frame.
X

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease
0,2017-01-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months
1,2017-01-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months
2,2017-01-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months
3,2017-01-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month
4,2017-01-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months
...,...,...,...,...,...,...,...,...,...,...
157092,2023-07-01,YISHUN,5 ROOM,504B,YISHUN ST 51,04 TO 06,113.0,Improved,2016,91 years 09 months
157093,2023-07-01,YISHUN,5 ROOM,613,YISHUN ST 61,07 TO 09,121.0,Improved,1987,62 years 10 months
157094,2023-07-01,YISHUN,5 ROOM,713,YISHUN ST 71,04 TO 06,122.0,Improved,1987,63 years 01 month
157095,2023-07-01,YISHUN,5 ROOM,758,YISHUN ST 72,04 TO 06,129.0,Improved,1986,62 years 05 months


In [4]:
# Fixed category vocabularies. Their order defines the order of the one-hot
# output columns, which are named '<feature>_<level>'.
_CATEGORY_LEVELS = {
    'town': [
        'ANG MO KIO', 'BEDOK', 'BISHAN', 'BUKIT BATOK', 'BUKIT MERAH',
        'BUKIT PANJANG', 'BUKIT TIMAH', 'CENTRAL AREA', 'CHOA CHU KANG',
        'CLEMENTI', 'GEYLANG', 'HOUGANG', 'JURONG EAST', 'JURONG WEST',
        'KALLANG/WHAMPOA', 'MARINE PARADE', 'PASIR RIS', 'PUNGGOL',
        'QUEENSTOWN', 'SEMBAWANG', 'SENGKANG', 'SERANGOON', 'TAMPINES',
        'TOA PAYOH', 'WOODLANDS', 'YISHUN',
    ],
    'flat_type': [
        '1 ROOM', '2 ROOM', '3 ROOM', '4 ROOM', '5 ROOM', 'EXECUTIVE',
        'MULTI-GENERATION',
    ],
    'flat_model': [
        '2-room', '3Gen', 'Adjoined flat', 'Apartment', 'DBSS', 'Improved',
        'Improved-Maisonette', 'Maisonette', 'Model A',
        'Model A-Maisonette', 'Model A2', 'Multi Generation',
        'New Generation', 'Premium Apartment', 'Premium Apartment Loft',
        'Premium Maisonette', 'Simplified', 'Standard', 'Terrace',
        'Type S1', 'Type S2',
    ],
}

# (offset, divisor) pairs applied as (value - offset) / divisor.
# NOTE(review): the divisor is the maximum, not (max - min), so this is not a
# true min-max scaling. The numbers are kept exactly as they were because any
# model fitted on this function's output depends on them.
_SCALING = {
    'floor_area_sqm': (31, 249),
    'storey_range_low': (1, 49),
    'remaining_lease_years': (42.58, 99),
}


def _extract_number(text, pattern):
    """Return the first capture of `pattern` in each value as a numeric Series (NaN when absent)."""
    return pd.to_numeric(text.astype(str).str.extract(pattern, expand=False))


def preprocessor(df):
    """
    Turn raw resale-flat rows into the numeric feature frame used for training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'month', 'town', 'flat_type', 'flat_model',
        'storey_range', 'floor_area_sqm' and 'remaining_lease'. Any other
        column is ignored. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same index as `df` with 58 columns:
        'month' (passed through unchanged), the scaled 'floor_area_sqm',
        'storey_range_low' and 'remaining_lease_years', followed by 54
        float one-hot columns for town, flat_type and flat_model.

    Raises
    ------
    ValueError
        If 'storey_range' or 'remaining_lease' holds no parsable number, or
        if a categorical column holds a value (including a missing value)
        outside its fixed vocabulary.
    """
    # Parse by pattern rather than by fixed character positions so both
    # zero-padded ('04 months') and plain ('6 months') spellings are accepted.
    storey_low = _extract_number(df['storey_range'], r'(\d+)')
    lease_years = _extract_number(df['remaining_lease'], r'(\d+)\s*year')
    # A lease given in whole years has no month part; treat that as 0 months.
    lease_months = _extract_number(df['remaining_lease'], r'(\d+)\s*month').fillna(0)

    for column, parsed in (('storey_range', storey_low), ('remaining_lease', lease_years)):
        if parsed.isna().any():
            raise ValueError(f"could not parse a number from column '{column}'")

    numeric = {
        'floor_area_sqm': pd.to_numeric(df['floor_area_sqm']),
        'storey_range_low': storey_low,
        'remaining_lease_years': lease_years + lease_months / 12,
    }

    # Start from a copy so the caller's frame never gains helper columns.
    processed = df[['month']].copy()
    for name, values in numeric.items():
        offset, divisor = _SCALING[name]
        processed[name] = ((values - offset) / divisor).to_numpy()

    indicators = {}
    for feature, levels in _CATEGORY_LEVELS.items():
        values = df[feature]
        unknown = values[~values.isin(levels)]
        if not unknown.empty:
            found = sorted(set(map(str, unknown)))
            raise ValueError(f"Found unknown categories {found} in column '{feature}'")
        for level in levels:
            indicators[f'{feature}_{level}'] = (values == level).to_numpy(dtype=float)

    # Build the one-hot frame on df's own index so the concat below lines up
    # row for row even when the index is not 0..n-1 (e.g. after a split).
    onehot = pd.DataFrame(indicators, index=df.index)
    return pd.concat([processed, onehot], axis=1)

In [9]:
# Attributes of the single example flat used as a prediction query.
month = '2023-07-01'
town = 'CHOA CHU KANG'
flat_type = '5 ROOM'
storey_range = '10 TO 12'
floor_area_sqm = 100
flat_model = 'Improved'
remaining_lease_year = 92
remaining_lease_month = 6

# One-element lists per column so the dict maps directly onto a one-row frame.
# The lease is rendered as text in the '<years> years <months> months' form.
data_input = {
    'month': [month],
    'town': [town],
    'flat_type': [flat_type],
    'storey_range': [storey_range],
    'floor_area_sqm': [floor_area_sqm],
    'flat_model': [flat_model],
    'remaining_lease': [f'{remaining_lease_year} years {remaining_lease_month} months'],
}
data_df = pd.DataFrame(data=data_input)
def user_input_processor(x):
    '''
    Prepare a user-supplied query frame for prediction.

    Parameters
    ----------
    x : pandas.DataFrame
        Query rows, e.g. data_df. Columns that are part of the raw dataset
        layout but missing from x ('block', 'street_name',
        'lease_commence_date', ...) are added as empty columns; columns
        outside that layout are discarded.

    Returns
    -------
    pandas.DataFrame
        The result of passing the re-laid-out rows to preprocessor().
    '''
    original_feature = ['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
                        'floor_area_sqm', 'flat_model', 'lease_commence_date', 'remaining_lease']
    # reindex lays the query out in the raw column order in one step and
    # keeps the column dtypes; the index is reset so rows are numbered 0..n-1.
    predict_df = x.reindex(columns=original_feature).reset_index(drop=True)
    final_predict_df = preprocessor(predict_df)
    return final_predict_df

In [11]:
# Inspect the column dtypes of the one-row query frame. 'month' is a plain
# string (object) here, whereas the CSV loader parses 'month' as datetimes.
data_df.dtypes

month              object
town               object
flat_type          object
storey_range       object
floor_area_sqm      int64
flat_model         object
remaining_lease    object
dtype: object

In [6]:
# Preprocess the full feature frame and check the number of output columns:
# 4 leading columns (month + 3 scaled numerics) plus 54 one-hot columns.
a=preprocessor(X)
a.columns.shape

(58,)

In [8]:
# Process the one-row query frame and show the resulting column names.
# Requires `user_input_processor` to be defined by an earlier cell.
b=user_input_processor(data_df)
b.columns

Index(['month', 'floor_area_sqm', 'storey_range_low', 'remaining_lease_years',
       'town_ANG MO KIO', 'town_BEDOK', 'town_BISHAN', 'town_BUKIT BATOK',
       'town_BUKIT MERAH', 'town_BUKIT PANJANG', 'town_BUKIT TIMAH',
       'town_CENTRAL AREA', 'town_CHOA CHU KANG', 'town_CLEMENTI',
       'town_GEYLANG', 'town_HOUGANG', 'town_JURONG EAST', 'town_JURONG WEST',
       'town_KALLANG/WHAMPOA', 'town_MARINE PARADE', 'town_PASIR RIS',
       'town_PUNGGOL', 'town_QUEENSTOWN', 'town_SEMBAWANG', 'town_SENGKANG',
       'town_SERANGOON', 'town_TAMPINES', 'town_TOA PAYOH', 'town_WOODLANDS',
       'town_YISHUN', 'flat_type_1 ROOM', 'flat_type_2 ROOM',
       'flat_type_3 ROOM', 'flat_type_4 ROOM', 'flat_type_5 ROOM',
       'flat_type_EXECUTIVE', 'flat_type_MULTI-GENERATION',
       'flat_model_2-room', 'flat_model_3Gen', 'flat_model_Adjoined flat',
       'flat_model_Apartment', 'flat_model_DBSS', 'flat_model_Improved',
       'flat_model_Improved-Maisonette', 'flat_model_Maisonette',
  