In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from utils import preprocess

In [None]:
# Enable IPython's autoreload extension so that edits to imported project modules
# (e.g. utils.preprocess) are picked up without restarting the kernel;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

In [None]:
# Root folders. Each value ends with a slash so file names can be appended directly.
DATA_DIR = './data/'
IMG_DIR = './img/'
PROCESSED_DATA_DIR = './processed_data/'
AUX_DATA_DIR = f'{DATA_DIR}auxiliary-data/'

# Raw train / test listings.
TRAIN_FILE = f'{DATA_DIR}train.csv'
TEST_FILE = f'{DATA_DIR}test.csv'

# Outputs of this notebook.
PROCESSED_TRAIN_FILE = f'{PROCESSED_DATA_DIR}processed_train.csv'
PROCESSED_TEST_FILE = f'{PROCESSED_DATA_DIR}processed_test.csv'
PROCESSED_RECOMMENDATION_TRAIN_FILE = f'{PROCESSED_DATA_DIR}processed_recommendation_train.csv'

# Auxiliary location tables.
# NOTE(review): 'commerical' is spelled this way in the file names; presumably it
# matches the files on disk, so it must not be "corrected" here — confirm.
SUBZONE_FILE = f'{AUX_DATA_DIR}sg-subzones.csv'
COMMERCIAL_CENTER_FILE = f'{AUX_DATA_DIR}sg-commerical-centres.csv'
MRT_FILE = f'{AUX_DATA_DIR}sg-mrt-stations.csv'
PRIMARY_SCHOOL_FILE = f'{AUX_DATA_DIR}sg-primary-schools.csv'
SECONDARY_SCHOOL_FILE = f'{AUX_DATA_DIR}sg-secondary-schools.csv'
SHOPPING_MALL_FILE = f'{AUX_DATA_DIR}sg-shopping-malls.csv'

# Cached distance arrays (one .npy file per auxiliary table).
COMMERCIAL_CENTER_DIS_FILE = f'{PROCESSED_DATA_DIR}sg-commerical-centres.npy'
MRT_DIS_FILE = f'{PROCESSED_DATA_DIR}sg-mrt-stations.npy'
PRIMARY_SCHOOL_DIS_FILE = f'{PROCESSED_DATA_DIR}sg-primary-schools.npy'
SECONDARY_SCHOOL_DIS_FILE = f'{PROCESSED_DATA_DIR}sg-secondary-schools.npy'
SHOPPING_MALL_DIS_FILE = f'{PROCESSED_DATA_DIR}sg-shopping-malls.npy'

# Preprocess

In [None]:
# Built-year buckets: each pair of consecutive edges becomes one interval
# (right-closed, the IntervalIndex.from_tuples default), labelled y0..y5 in order.
_year_edges = [0, 1980, 1990, 2000, 2010, 2020, 3000]
_year_codes = ['y0', 'y1', 'y2', 'y3', 'y4', 'y5']

YEAR_BINS = pd.IntervalIndex.from_tuples(list(zip(_year_edges[:-1], _year_edges[1:])))
# interval -> bucket code
YEAR_LABELS = {interval: code for interval, code in zip(YEAR_BINS, _year_codes)}
# bucket code -> ordinal position as a float (y0 -> 0.0, ..., y5 -> 5.0)
YEAR_DICT = {code: float(position) for position, code in enumerate(_year_codes)}

In [None]:
# Subzone lookup table; keep the distinct planning-area names for the encoding step.
# Note: despite its name, PLANNING_AREA_LIST is the array returned by Series.unique().
df_subzone = pd.read_csv(SUBZONE_FILE)
PLANNING_AREA_LIST = df_subzone.loc[:, 'planning_area'].unique()

In [None]:
# Read the raw train and test listings.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_FILE, TEST_FILE))

### Preprocess df_train

In [None]:
# Cleaning steps for the training frame, applied strictly in the listed order.
# Each helper is called with df_train as its only argument and its return value
# is ignored (presumably they modify df_train in place — confirm in utils.preprocess).
# NOTE(review): the test-set cell converts sqm -> sqft *before* fixing abnormal
# sizes, the opposite order from here — confirm the difference is intentional.
train_cleaning_steps = (
    # size_sqft
    preprocess.fill_zero_sqft,
    preprocess.fix_abnormal_sqft,
    preprocess.convert_sqm_to_sqft,
    # num_beds
    preprocess.fill_na_num_beds,
    # TODO: compassvale plains
    # odd num_beds / num_baths values
    preprocess.fix_abnormal_beds_baths_number,
    # price: drop zero-price rows first, then handle extremely high prices
    preprocess.remove_price_zero_records,
    preprocess.fix_super_high_price,
)
for clean_step in train_cleaning_steps:
    clean_step(df_train)

### Preprocess df_test

In [None]:
# Cleaning steps for the test frame, applied in the listed order; each helper
# receives df_test as its only argument and its return value is ignored.
test_cleaning_steps = (
    # size_sqft
    preprocess.convert_sqm_to_sqft_for_test,
    preprocess.fix_abnormal_sqft_for_test,
    # num_beds
    preprocess.fill_na_num_beds_for_test,
)
for clean_step in test_cleaning_steps:
    clean_step(df_test)

### Combine & Preprocess

In [None]:
# Stack train on top of test so the shared preprocessing runs once over both.
# The number of training rows is remembered so the frames can be split apart
# again by position later on.
train_test_delimiter = df_train.shape[0]
# Test rows have no price; give them an invalid placeholder so the column exists.
df_test['price'] = -1.0
# NOTE(review): concat keeps each frame's own index labels, so labels may repeat
# between the train and test parts of df_all — confirm no helper relies on
# unique index labels.
df_all = pd.concat([df_train, df_test])

In [None]:
# Seed NumPy's global RNG so that any random choices made by the helpers below are
# reproducible. NOTE(review): which helpers actually draw random numbers is not
# visible from this notebook — confirm in utils.preprocess. Because of the seed,
# the order of the calls in this cell should not be changed casually.
np.random.seed(5228)

# num_baths, for both train and test rows; the helper is given 'num_baths' and
# 'num_beds' (presumably it derives num_baths from the most common value per
# num_beds — confirm).
preprocess.map_value_by_most_common(df_all, 'num_baths', 'num_beds')
# property_type: clean-up of the property_type column
preprocess.process_property_type(df_all)
# tenure: bring the tenure values into a common form
preprocess.universalize_tenure(df_all)
# tenure: fill missing values, using property_type as the grouping column
preprocess.fillna_by_grouping(df_all, 'tenure', 'property_type')
# built_year: fill missing values by property name first, then bucket the years
# with YEAR_BINS / YEAR_LABELS
preprocess.fillna_by_property_name(df_all, 'built_year')
preprocess.discretize_built_year(df_all, YEAR_BINS, YEAR_LABELS)
# built_year: fill what is still missing, using property_type as the grouping
# column; conservation houses get the 'y0' bucket
preprocess.fillna_by_grouping(df_all, 'built_year', 'property_type')
preprocess.fill_conservation_house_built_year(df_all, 'y0')
# lat & lng: repair abnormal coordinates
preprocess.fix_abnormal_geo_location(df_all)
# subzone: runs after the coordinate fix above (judging by the name, subzones are
# assigned from lat/lng with a k-nearest-neighbours lookup — confirm)
preprocess.map_subzone_by_geo_location_knn(df_all)
# planning_area: same helper as for num_baths, here given 'planning_area' and 'subzone'
preprocess.map_value_by_most_common(df_all, 'planning_area', 'subzone')

### Process Auxiliary Data

In [None]:
# One row per property_name holding the mean latitude / longitude of its listings.
location_aggregations = {'lat': ('lat', 'mean'), 'lng': ('lng', 'mean')}
df_property_name_location = df_all.groupby('property_name').agg(**location_aggregations)

In [None]:
# Read the five auxiliary location tables; targets and paths are paired by position.
(
    df_commercial_center,
    df_mrt,
    df_primary_school,
    df_secondary_school,
    df_shopping_mall,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        COMMERCIAL_CENTER_FILE,
        MRT_FILE,
        PRIMARY_SCHOOL_FILE,
        SECONDARY_SCHOOL_FILE,
        SHOPPING_MALL_FILE,
    )
)

In [None]:
# Disabled one-off step that builds the distance caches. Each commented pair below
# computes an array with preprocess.calculate_distance_km(df_property_name_location,
# <auxiliary table>) and stores it with np.save under the matching *_DIS_FILE path.
# Uncomment to (re)generate the .npy files. PROCESSED_DATA_DIR must already exist,
# because np.save does not create missing directories.
# # commercial center
# distance_commercial_center = preprocess.calculate_distance_km(df_property_name_location, df_commercial_center)
# np.save(COMMERCIAL_CENTER_DIS_FILE, distance_commercial_center)
# # mrt
# distance_mrt = preprocess.calculate_distance_km(df_property_name_location, df_mrt)
# np.save(MRT_DIS_FILE, distance_mrt)
# # primary school
# distance_primary_school = preprocess.calculate_distance_km(df_property_name_location, df_primary_school)
# np.save(PRIMARY_SCHOOL_DIS_FILE, distance_primary_school)
# # secondary school
# distance_secondary_school = preprocess.calculate_distance_km(df_property_name_location, df_secondary_school)
# np.save(SECONDARY_SCHOOL_DIS_FILE, distance_secondary_school)
# # shopping mall
# distance_shopping_mall = preprocess.calculate_distance_km(df_property_name_location, df_shopping_mall)
# np.save(SHOPPING_MALL_DIS_FILE, distance_shopping_mall)

In [None]:
def _load_or_build_distance(cache_file, df_targets):
    """Return the cached distance array for one auxiliary table, building it if missing.

    If ``cache_file`` exists it is loaded with ``np.load`` and returned as is.
    Otherwise the array is computed with
    ``preprocess.calculate_distance_km(df_property_name_location, df_targets)``,
    written to ``cache_file`` with ``np.save`` (creating the parent directory
    first) and then read back, so both paths return exactly what a cached run
    would see. This keeps the notebook runnable from a fresh checkout.

    Args:
        cache_file: path of the ``.npy`` cache file.
        df_targets: auxiliary table passed to ``calculate_distance_km``.

    Returns:
        The array stored in ``cache_file``.
    """
    cache_path = Path(cache_file)
    if not cache_path.exists():
        distances = preprocess.calculate_distance_km(df_property_name_location, df_targets)
        # np.save does not create missing directories.
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(cache_file, distances)
    # NOTE(review): an existing cache is trusted as is; delete the .npy files to
    # force a rebuild after the property or auxiliary data changes.
    return np.load(cache_file)


distance_commercial_center = _load_or_build_distance(COMMERCIAL_CENTER_DIS_FILE, df_commercial_center)
distance_mrt = _load_or_build_distance(MRT_DIS_FILE, df_mrt)
distance_primary_school = _load_or_build_distance(PRIMARY_SCHOOL_DIS_FILE, df_primary_school)
distance_secondary_school = _load_or_build_distance(SECONDARY_SCHOOL_DIS_FILE, df_secondary_school)
distance_shopping_mall = _load_or_build_distance(SHOPPING_MALL_DIS_FILE, df_shopping_mall)

In [None]:
def _count_in_range(distances, lower, upper, column):
    """Call preprocess.populate_num_targets_within_range for df_all with the given bounds.

    `lower` / `upper` are passed through unchanged (judging by the column names
    they are kilometres — confirm); `column` is the name handed to the helper,
    presumably the feature column it fills in df_all.
    """
    preprocess.populate_num_targets_within_range(
        df_all, df_property_name_location, distances, lower, upper, column
    )


# The calls below run in a fixed order, which also fixes the order in which the
# feature names are handed to the helpers.
# commercial centres
_count_in_range(distance_commercial_center, 0, 5, 'num_cc_5km')
# MRT stations
_count_in_range(distance_mrt, 0, 0.8, 'num_mrt_800m')
preprocess.populate_distance_to_nearest_target(
    df_all, df_property_name_location, distance_mrt, 'nearest_mrt_km'
)
# primary schools
_count_in_range(distance_primary_school, 0, 1, 'num_ps_1km')
_count_in_range(distance_primary_school, 1, 2, 'num_ps_1km_to_2km')
# secondary schools
_count_in_range(distance_secondary_school, 0, 1, 'num_ss_1km')
_count_in_range(distance_secondary_school, 1, 2, 'num_ss_1km_to_2km')
# shopping malls
_count_in_range(distance_shopping_mall, 0, 3, 'num_sm_3km')
# Rearrange columns: this helper returns the frame, so df_all is rebound to its result.
df_all = preprocess.move_price_to_last_column(df_all)

### Drop unused columns

In [None]:
# Columns that are not carried forward into the encoded feature set.
# Each name appears exactly once.
dropped_columns = [
    "title",
    "address",
    "property_name",
    "floor_level",
    "elevation",
    "available_unit_types",
    "total_num_units",
    "furnishing",
    "property_details_url",
]
# Rebind instead of mutating in place. As before, a missing column raises KeyError,
# which is also what happens if this cell is run a second time.
df_all = df_all.drop(columns=dropped_columns)

### Encode Categorical Features

In [None]:
# built_year and tenure: these helpers are called for their effect on df_all
# (return values ignored).
preprocess.encode_built_year(df_all, YEAR_DICT)
preprocess.encode_tenure(df_all)
# planning_area first, then property_type: both helpers return a frame, so the
# result of the first is fed straight into the second and df_all is rebound.
df_all = preprocess.encode_property_type(
    preprocess.encode_planning_area(df_all, PLANNING_AREA_LIST)
)

In [None]:
# Undo the earlier concat by position: the first `train_test_delimiter` rows are
# the training listings, the rest are the test listings.
df_train = df_all.iloc[:train_test_delimiter].copy()
df_test = df_all.iloc[train_test_delimiter:].copy()
# The test rows only carried a placeholder price; remove the column again.
df_test = df_test.drop(columns=['price'])

In [None]:
# Build the per-subzone and per-property_type encoding maps from the training frame;
# both are passed to the test-set encoder below.
# NOTE(review): these two calls come before target_encode_property_type_subzone(df_train).
# If that helper rewrites the subzone / property_type columns of df_train, the
# order matters — keep it unless confirmed otherwise.
subzone_encoding_dict = preprocess.generate_subzone_encoding_map(df_train)
property_type_encoding_dict = preprocess.generate_property_type_encoding_map(df_train)
# Combined (property_type, subzone) target encoding.
# Train: encode df_train and keep the returned mapping for the test set.
subzone_property_type_encoding_dict = preprocess.target_encode_property_type_subzone(df_train)
# Test: encode df_test with the combined mapping plus the two single-column maps
# (presumably used as fallbacks for combinations not seen in training — confirm).
preprocess.target_encode_property_type_subzone_for_test(df_test, subzone_property_type_encoding_dict, subzone_encoding_dict, property_type_encoding_dict)

In [None]:
# Reorder the columns of both frames with the same helper so the one-hot encoded
# columns end up in the last positions.
df_train, df_test = (
    preprocess.rearrange_columns(frame) for frame in (df_train, df_test)
)

In [None]:
# Snapshot for the recommender system, taken while listing_id is still present.
df_train_recommendation = df_train.copy()
Path(PROCESSED_DATA_DIR).mkdir(exist_ok=True, parents=True)
# Export is currently disabled; uncomment to write the snapshot.
# df_train_recommendation.to_csv(PROCESSED_RECOMMENDATION_TRAIN_FILE, index=False)

# listing_id is only needed for the recommendation data; remove it from the
# train and test frames now that the snapshot has been taken.
for frame in (df_train, df_test):
    frame.drop(columns=['listing_id'], inplace=True)

In [None]:
# Preview the first five processed training rows.
df_train.head(n=5)

In [None]:
# Preview the first five processed test rows.
df_test.head(n=5)

In [None]:
# Sanity check: number of rows that still contain at least one missing value.
print('Records with NA value in train data:', int(df_train.isna().any(axis=1).sum()))
print('Records with NA value in test data:', int(df_test.isna().any(axis=1).sum()))

In [None]:
# Make sure the output folder exists (no error if it is already there).
Path(PROCESSED_DATA_DIR).mkdir(exist_ok=True, parents=True)
# Writing the processed frames is currently disabled; uncomment to export them.
# df_train.to_csv(PROCESSED_TRAIN_FILE, index=False)
# df_test.to_csv(PROCESSED_TEST_FILE, index=False)