# Importing required packages

In [29]:
import pandas as pd
import sys
import os
from process import *
from process_coe_prices import *
from process_stock_prices import *
from process_main_dataset import *
from process_location_info import *

# Defining Local paths to datasets

In [10]:
# Root folder of the project data, relative to this notebook.
datasets_path = "../../datasets"
# All auxiliary CSV files share this sub-folder.
auxiliary_path = datasets_path + "/auxiliary-data"

# Main train / test splits.
train_input_path = datasets_path + "/train.csv"
test_input_path = datasets_path + "/test.csv"

# Location-related auxiliary data (MRT stations, malls, primary schools).
mrt_input_path = auxiliary_path + "/sg-mrt-existing-stations.csv"
mrt_planned_input_path = auxiliary_path + "/sg-mrt-planned-stations.csv"
mall_input_path = auxiliary_path + "/sg-shopping-malls.csv"
school_input_path = auxiliary_path + "/sg-primary-schools.csv"

# Economic auxiliary data (COE prices, stock prices).
coe_input_path = auxiliary_path + "/sg-coe-prices.csv"
stocks_input_path = auxiliary_path + "/sg-stock-prices.csv"

# Getting the data from the auxiliary datasets

For more details about how we process the individual auxiliary datasets, refer to the ipynb notebooks in `/data/auxiliary datasets`.

Once we get the values, we add them as new columns to the original train dataset.

In [11]:
# Derive the extra features from the auxiliary datasets.
# The calls are kept in this order because each helper prints a progress line when it finishes.

# COE prices
df_coe = transform_coe_prices(coe_input_path)

# Stock prices
df_stocks = transform_stock_prices(stocks_input_path)

# Training data enriched with distances to the nearest existing MRT, planned MRT, school and mall
df_with_locations = extract_distance_columns_from_aux_mrt_school_mall(
    train_input_path,
    mrt_input_path,
    mrt_planned_input_path,
    mall_input_path,
    school_input_path,
)

# Combine the location-enriched training data with the COE and stock features
df_dirty = merge_dataframes(df_with_locations, df_coe, df_stocks)

Finished processing the COE Auxiliary dataset
Finished processing the Stock prices Auxiliary dataset
Finished calculating the distance to nearest existing mrt
Finished calculating the distance to nearest planned mrt
Finished calculating the distance to nearest school
Finished calculating the distance to nearest mall


## Checking what the dataset looks like now

In [12]:
# Preview the first rows of the merged frame, including the new distance, COE and stock columns.
df_dirty.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,...,subzone,planning_area,region,monthly_rent,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall,coe_price_indicator,stock_price
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983.0,1.344518,...,yuhua east,jurong east,west region,1600.0,699.127003,675.092874,334.846135,1202.673513,0.559383,0.888712
1,2021-09,punggol,203a,punggol field,4 room,model a,85.0,yes,2004.0,1.398524,...,matilda,punggol,north-east region,2400.0,683.672274,766.323178,476.014557,704.576366,0.559383,0.888712
2,2021-09,choa chu kang,458,choa chu kang avenue 4,executive,premium apartment,142.0,yes,2000.0,1.378101,...,peng siang,choa chu kang,west region,1800.0,1231.831781,1208.372363,213.468076,936.378087,0.559383,0.888712
3,2021-09,sembawang,340a,sembawang close,5 room,premium apartment,110.0,yes,2001.0,1.447534,...,sembawang central,sembawang,north region,2100.0,299.913615,8571.934879,444.346799,189.552356,0.559383,0.888712
4,2021-09,bishan,117,bishan street 12,5 room,improved,121.0,yes,1987.0,1.347017,...,bishan east,bishan,central region,2750.0,445.013019,2188.355884,618.812729,402.450976,0.559383,0.888712


# Cleaning the dataset


## 1. Deleting unnecessary columns

### 1.1 elevation


In [13]:
# Drop "elevation" from the merged frame; the column count before and after is printed below.
df_delete = delete_column(df_dirty, "elevation")

Number of columns BEFORE elevation column deletion = 22
Number of columns AFTER elevation column deletion = 21


### 1.2 furnished

In [14]:
# Drop "furnished", continuing from the frame produced by the previous deletion.
df_delete = delete_column(df_delete, "furnished")


Number of columns BEFORE furnished column deletion = 21
Number of columns AFTER furnished column deletion = 20


### 1.3 planning_area

In [15]:
# Drop "planning_area", continuing from the frame produced by the previous deletion.
df_delete = delete_column(df_delete, "planning_area")


Number of columns BEFORE planning_area column deletion = 20
Number of columns AFTER planning_area column deletion = 19


### 1.4 block

In [16]:
# Drop "block", continuing from the frame produced by the previous deletion.
df_delete = delete_column(df_delete, "block")


Number of columns BEFORE block column deletion = 19
Number of columns AFTER block column deletion = 18


### 1.5 street_name

In [17]:
# Drop "street_name", continuing from the frame produced by the previous deletion.
df_delete = delete_column(df_delete, "street_name")


Number of columns BEFORE street_name column deletion = 18
Number of columns AFTER street_name column deletion = 17


### 1.6 subzone

In [18]:
# Drop "subzone"; this is the last deletion and leaves 16 columns (see the printed count).
df_delete = delete_column(df_delete, "subzone")

Number of columns BEFORE subzone column deletion = 17
Number of columns AFTER subzone column deletion = 16


## 2. Converting all string values to lower

This converts every string value in the dataset to lowercase.

In [19]:
def _lowercase_if_text(value):
    """Return ``value`` lower-cased when it is a ``str``; otherwise return it unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


# Element-wise pass over every column; non-string cells come back untouched.
# Written as apply + Series.map so it does not rely on DataFrame.applymap.
df_lower_case = df_delete.apply(lambda column: column.map(_lowercase_if_text))

In [20]:
# Preview the frame after the column deletions and lower-casing.
df_lower_case.head()

Unnamed: 0,rent_approval_date,town,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,region,monthly_rent,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall,coe_price_indicator,stock_price
0,2021-09,jurong east,3 room,new generation,67.0,1983.0,1.344518,103.73863,west region,1600.0,699.127003,675.092874,334.846135,1202.673513,0.559383,0.888712
1,2021-09,punggol,4 room,model a,85.0,2004.0,1.398524,103.903863,north-east region,2400.0,683.672274,766.323178,476.014557,704.576366,0.559383,0.888712
2,2021-09,choa chu kang,executive,premium apartment,142.0,2000.0,1.378101,103.736002,west region,1800.0,1231.831781,1208.372363,213.468076,936.378087,0.559383,0.888712
3,2021-09,sembawang,5 room,premium apartment,110.0,2001.0,1.447534,103.817892,north region,2100.0,299.913615,8571.934879,444.346799,189.552356,0.559383,0.888712
4,2021-09,bishan,5 room,improved,121.0,1987.0,1.347017,103.849987,central region,2750.0,445.013019,2188.355884,618.812729,402.450976,0.559383,0.888712


## 3. Converting the date to a Unix timestamp

The rationale behind doing this is to ensure that we get a numerical value for the date, since the date might also have something to do with the prediction of monthly rent.

In [21]:
# Parse the "YYYY-MM" approval month as the first day of that month.
# The strings are read from df_delete (the frame df_lower_case was built from, same rows
# and index) instead of from df_lower_case itself: this cell overwrites the column with
# integers, so reading it back on a re-run would fail when '-01' is appended.
approval_month_start = pd.to_datetime(df_delete['rent_approval_date'] + '-01', format='%Y-%m-%d')

# Whole seconds elapsed since the Unix epoch (1970-01-01 00:00:00).
df_lower_case['rent_approval_date'] = (approval_month_start - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

In [22]:
# Sanity check: the column should now hold integer epoch seconds
# (1630454400 corresponds to 2021-09-01 00:00:00).
df_lower_case['rent_approval_date'].head()

0    1630454400
1    1630454400
2    1630454400
3    1630454400
4    1630454400
Name: rent_approval_date, dtype: int64

## 4. Finding an ordinal value for flat type and flat model

### 4.1 Ordinalizing flat type

Normalize the flat_type column by replacing each string with its number of rooms.

E.g. "3 room" will be "3". The only exception is the "executive" type, which is replaced with an ordinal value of 6.


In [23]:
# Replace the textual flat_type values with their ordinal codes using the project helper
# (brought in by the star imports at the top of the notebook).
df_ordinal_flat_type = ordinalize_flat_type(df_lower_case)



In [24]:
# Check the encoded values.
# NOTE(review): the output reports dtype object, not a numeric dtype — confirm that
# downstream consumers cast flat_type before using it as a numeric feature.
df_ordinal_flat_type["flat_type"].head()

0    3
1    4
2    6
3    5
4    5
Name: flat_type, dtype: object

### 4.2 Ordinalizing the flat model (while also considering flat type)

In [31]:
# Work on a copy so the frame from step 4.1 is not modified through an alias;
# this keeps the earlier cells' results intact and makes this cell safe to re-run.
df = df_ordinal_flat_type.copy()

# Key identifying each (flat_type, flat_model) pair, e.g. "3" + "newgeneration".
df['combined_column'] = df['flat_type'].astype(str) + df['flat_model'].str.replace(' ', '', regex=False)

# Smallest and largest floor area observed for every combination, computed in one pass.
# sort=False keeps the groups in order of first appearance, so ties in the ranking
# below are broken by first appearance in the data.
area_range = df.groupby('combined_column', sort=False)['floor_area_sqm'].agg(['min', 'max'])

# Size score per combination: min + max, i.e. twice the mid-range floor area.
# The constant factor does not affect the ordering, so the sum is used directly.
flat_combos = (area_range['min'] + area_range['max']).to_dict()

# Order the combinations from smallest to largest size score (sorted() is stable).
sort_flat_combos_by_area = sorted(flat_combos.items(), key=lambda combo: combo[1])

# The ordinal of a combination is its position in that ordering.
flat_combo_ordinal = {
    flat_combo: rank
    for rank, (flat_combo, _size_score) in enumerate(sort_flat_combos_by_area)
}

# Look up each row's ordinal in one vectorised step.
df['flat_type_model'] = df['combined_column'].map(flat_combo_ordinal)

# Fail loudly if any row could not be ranked (e.g. a missing flat_model).
assert df['flat_type_model'].notna().all(), "some rows have no flat_type_model ordinal"

# The helper key and the raw flat_model column are no longer needed.
df = delete_column(df, 'combined_column')
df_flat_type_ordinality = delete_column(df, 'flat_model')

Number of columns BEFORE combined_column column deletion = 18
Number of columns AFTER combined_column column deletion = 17
Number of columns BEFORE flat_model column deletion = 17
Number of columns AFTER flat_model column deletion = 16


In [32]:
# Check the combined flat type / flat model ordinal for the first rows.
df_flat_type_ordinality["flat_type_model"].head()

0    13
1    21
2    32
3    25
4    28
Name: flat_type_model, dtype: int64

## 5. One hot encoding

### 5.1 One hot encoding region

In [33]:
# Expand the categorical "region" column into one indicator column per region value.
df_onehot = pd.get_dummies(
    df_flat_type_ordinality,
    columns=['region'],
    prefix=['region'],
)

### 5.2 One hot encoding town

In [34]:
# Build the encoded frame from df_flat_type_ordinality rather than from df_onehot itself:
# encoding 'town' consumes that column, so feeding df_onehot back into get_dummies would
# raise a KeyError when this cell is run a second time.
# Listing 'region' before 'town' gives the same column layout as encoding region first
# and town afterwards: untouched columns, then region_* columns, then town_* columns.
df_onehot = pd.get_dummies(
    df_flat_type_ordinality,
    columns=['region', 'town'],
    prefix=['region', 'town'],
)

## 6. Removing duplicate rows

In [35]:
# Remove duplicate rows with the project helper; it prints the number of duplicates
# and the row count before and after (see the output below).
df_no_duplicate = duplicate(df_onehot)

Number of duplicates =  523
Number of rows before duplicate deletion =  60000
Number of rows after duplicate deletion =  59477


# Ensuring that columns are normalized

The following columns need to be normalized:
1. rent_approval_date: as it is now a unix timestamp
2. lease_commence_date: as it is just a year value
3. floor_area_sqm
4. all the distance values computed to mrt, school and malls
5. coe_price_indicator (from auxiliary dataset)
6. stock_price (from auxiliary dataset)

In [None]:
# Columns to rescale before the final one, in the order listed in the section text above.
columns_before_stock_price = (
    "rent_approval_date",
    "lease_commence_date",
    "floor_area_sqm",
    "distance_to_nearest_existing_mrt",
    "distance_to_nearest_planned_mrt",
    "distance_to_nearest_school",
    "distance_to_nearest_mall",
    "coe_price_indicator",
)

# Thread the de-duplicated frame through normalize_column, one column at a time.
df_unnormalized = df_no_duplicate
for column_name in columns_before_stock_price:
    df_unnormalized = normalize_column(df_unnormalized, column_name)

# Normalizing the last column yields the final frame.
df_normalized = normalize_column(df_unnormalized, "stock_price")

# Checking what the final cleaned dataset looks like

In [37]:
# Preview the final frame: normalized numeric columns plus the one-hot town indicator columns.
df_normalized.head()

Unnamed: 0,rent_approval_date,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,monthly_rent,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,...,town_pasir ris,town_punggol,town_queenstown,town_sembawang,town_sengkang,town_serangoon,town_tampines,town_toa payoh,town_woodlands,town_yishun
0,0.26674,3,0.18232,0.320755,1.344518,103.73863,1600.0,0.2719,0.067848,0.143355,...,0,0,0,0,0,0,0,0,0,0
1,0.26674,4,0.281768,0.716981,1.398524,103.903863,2400.0,0.265562,0.077554,0.212643,...,0,1,0,0,0,0,0,0,0,0
2,0.26674,6,0.596685,0.641509,1.378101,103.736002,1800.0,0.490368,0.124587,0.08378,...,0,0,0,0,0,0,0,0,0,0
3,0.26674,5,0.41989,0.660377,1.447534,103.817892,2100.0,0.108178,0.908049,0.1971,...,0,0,0,1,0,0,0,0,0,0
4,0.26674,5,0.480663,0.396226,1.347017,103.849987,2750.0,0.167685,0.228855,0.282732,...,0,0,0,0,0,0,0,0,0,0
