In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

import matplotlib.pyplot as plt

from utils import preprocess

In [None]:
# Enable IPython's autoreload extension so that edits to external Python modules
# (such as the local `utils.preprocess` helpers imported above) are picked up
# without restarting the kernel;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
# Mode 2: modules are reloaded automatically before code is executed
# (see the IPython autoreload documentation for the exact rules).
%autoreload 2

In [None]:
# Directory roots. They are plain strings with a trailing slash so that file
# names can be appended with `+` below.
DATA_DIR = './data/'
IMG_DIR = './img/'
PROCESSED_DATA_DIR = './processed_data/'

# Raw input files.
TRAIN_FILE = DATA_DIR + 'train.csv'
# The test path must not alias the training file.
# NOTE(review): assumes the test split is stored as 'test.csv' next to
# 'train.csv' -- confirm against the contents of DATA_DIR.
TEST_FILE = DATA_DIR + 'test.csv'

# Output of this notebook: the cleaned and encoded training table.
PROCESSED_TRAIN_FILE = PROCESSED_DATA_DIR + 'processed_train.csv'

# Auxiliary reference data shipped alongside the main dataset.
AUX_DATA_DIR = DATA_DIR + 'auxiliary-data/'
SUBZONE_FILE = AUX_DATA_DIR + 'sg-subzones.csv'

# Preprocess

In [None]:
# Bin edges for built_year; consecutive edges form one interval each
# (pandas' default for from_tuples is right-closed intervals).
_year_edges = [0, 1980, 1990, 2000, 2010, 2020, 3000]
YEAR_BINS = pd.IntervalIndex.from_tuples(
    list(zip(_year_edges[:-1], _year_edges[1:]))
)

# One short code per bin, in the same order as YEAR_BINS.
_year_codes = ['y0', 'y1', 'y2', 'y3', 'y4', 'y5']

# Interval -> code, passed to preprocess.discretize_built_year.
YEAR_LABELS = {interval: code for interval, code in zip(YEAR_BINS, _year_codes)}

# Code -> ordinal float, passed to preprocess.encode_built_year.
YEAR_DICT = {code: float(position) for position, code in enumerate(_year_codes)}

In [None]:
# Load the subzone reference table (sg-subzones.csv) from the auxiliary data folder.
df_subzone = pd.read_csv(filepath_or_buffer=SUBZONE_FILE)

In [None]:
# Distinct values of the 'planning_area' column; passed to
# preprocess.encode_planning_area further down.
PLANNING_AREA_LIST = df_subzone.loc[:, 'planning_area'].unique()

In [None]:
# Load the raw training listings; every later cell works on this frame.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_FILE)

In [None]:
# Fix NumPy's global seed before preprocessing (presumably some helpers draw
# random values -- the seed keeps their output repeatable).
np.random.seed(5228)

# Ordered cleaning pipeline. Each entry is (helper, extra positional args) and
# is invoked as helper(df_train, *extra_args). Return values are discarded, so
# the helpers are relied on to modify df_train in place. The order is
# significant -- do not rearrange.
cleaning_steps = [
    # property_type
    (preprocess.process_property_type, ()),
    # tenure, then fill missing tenure
    (preprocess.universalize_tenure, ()),
    (preprocess.fillna_by_grouping, ('tenure', 'property_type')),
    # built year: fill, discretize, then fill what is still missing
    (preprocess.fillna_by_property_name, ('built_year',)),
    (preprocess.discretize_built_year, (YEAR_BINS, YEAR_LABELS)),
    (preprocess.fillna_by_grouping, ('built_year', 'property_type')),
    (preprocess.fill_conservation_house_built_year, ('y0',)),
    # size_sqft
    (preprocess.fill_zero_sqft, ()),
    (preprocess.fix_abnormal_sqft, ()),
    (preprocess.convert_sqm_to_sqft, ()),
    # num_beds
    # TODO: compassvale plains
    (preprocess.fill_na_num_beds, ()),
    # num_baths, then fix odd bed/bath counts
    (preprocess.map_value_by_most_common, ('num_baths', 'num_beds')),
    (preprocess.fix_abnormal_beds_baths_number, ()),
    # price: fix extreme values, then remove 0-price rows
    (preprocess.fix_super_high_price, ()),
    (preprocess.remove_price_zero_records, ()),
    # lat & lng
    (preprocess.fix_abnormal_geo_location, ()),
    # subzone
    (preprocess.map_subzone_by_geo_location_knn, ()),
    # planning_area
    (preprocess.map_value_by_most_common, ('planning_area', 'subzone')),
]

for clean_step, extra_args in cleaning_steps:
    clean_step(df_train, *extra_args)

In [None]:
# Columns discarded before the encoding step. Each label is listed exactly once.
dropped_columns = [
    "listing_id",
    "title",
    "address",
    "property_name",
    "floor_level",
    "elevation",
    "available_unit_types",
    "total_num_units",
    "furnishing",
    "property_details_url",
]
# In-place drop, consistent with the in-place helpers used elsewhere in this
# notebook. Note: re-running this cell on the same frame raises KeyError
# because the columns are already gone -- reload df_train first.
df_train.drop(columns=dropped_columns, inplace=True)

In [None]:
# add additional price per sqft as new label
# df_train['sqft_price'] = df_train['price'] / df_train['size_sqft']

In [None]:
# Encode the categorical features.

# subzone + property_type: the helper returns a pair -- the updated frame and
# the encoding dictionary -- which is unpacked on the following line.
encoded_result = preprocess.target_encode_property_type_subzone(df_train)
df_train, subzone_property_type_encoding_dict = encoded_result

# built_year: encoded using the YEAR_DICT mapping
preprocess.encode_built_year(df_train, YEAR_DICT)

# tenure
preprocess.encode_tenure(df_train)

# planning_area: the helper returns the frame, so df_train is rebound
df_train = preprocess.encode_planning_area(df_train, PLANNING_AREA_LIST)

In [None]:
# Preview the first five processed rows.
df_train.head(5)

In [None]:
# Sanity check: show every row that still has at least one missing value.
df_train.loc[df_train.isna().any(axis=1)]

In [None]:
# Create the output directory (and any missing parents) if needed, then
# write the processed training table without the index column.
processed_dir = Path(PROCESSED_DATA_DIR)
processed_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(path_or_buf=PROCESSED_TRAIN_FILE, index=False)