# Importing required packages

In [5]:
import pandas as pd
import sys
import os
from process import *
from process_coe_prices import *
from process_stock_prices import *
from process_main_dataset import *
from process_location_info import *

# Defining local paths to the datasets

In [10]:
# Root folder of the datasets, relative to this notebook, and the
# sub-folder that holds the auxiliary CSV files.
datasets_path = "../../datasets"
auxiliary_path = f"{datasets_path}/auxiliary-data"

# Main train / test splits.
train_input_path = datasets_path + "/train.csv"
test_input_path = datasets_path + "/test.csv"

# Auxiliary datasets: MRT stations (existing and planned), shopping malls,
# primary schools, COE prices and stock prices.
mrt_input_path = f"{auxiliary_path}/sg-mrt-existing-stations.csv"
mrt_planned_input_path = f"{auxiliary_path}/sg-mrt-planned-stations.csv"
mall_input_path = f"{auxiliary_path}/sg-shopping-malls.csv"
school_input_path = f"{auxiliary_path}/sg-primary-schools.csv"
coe_input_path = f"{auxiliary_path}/sg-coe-prices.csv"
stocks_input_path = f"{auxiliary_path}/sg-stock-prices.csv"

# Getting the data from the auxiliary datasets

For more details about how we process the individual auxiliary datasets, refer to the `.ipynb` notebooks in `/data/auxiliary datasets`.

Once we have these values, we add them as new columns to the original train dataset.

In [11]:
# Derive extra features from the auxiliary datasets and attach them to the train set.

# COE prices
df_coe = transform_coe_prices(coe_input_path)

# Stock prices
df_stocks = transform_stock_prices(stocks_input_path)

# Train set processed together with the MRT (existing + planned), mall and
# school files; judging by the helper's name and its log output this adds the
# distance-to-nearest columns.
df_with_locations = extract_distance_columns_from_aux_mrt_school_mall(
    train_input_path,
    mrt_input_path,
    mrt_planned_input_path,
    mall_input_path,
    school_input_path,
)

# Combine the location-enriched train set with the COE and stock frames.
df_dirty = merge_dataframes(df_with_locations, df_coe, df_stocks)

Finished processing the COE Auxiliary dataset
Finished processing the Stock prices Auxiliary dataset
Finished calculating the distance to nearest existing mrt
Finished calculating the distance to nearest planned mrt
Finished calculating the distance to nearest school
Finished calculating the distance to nearest mall


## Checking what the dataset looks like now

In [12]:
# Preview the first five rows of the merged (not yet cleaned) frame.
df_dirty.head(n=5)

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,...,subzone,planning_area,region,monthly_rent,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall,coe_price_indicator,stock_price
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983.0,1.344518,...,yuhua east,jurong east,west region,1600.0,699.127003,675.092874,334.846135,1202.673513,0.559383,0.888712
1,2021-09,punggol,203a,punggol field,4 room,model a,85.0,yes,2004.0,1.398524,...,matilda,punggol,north-east region,2400.0,683.672274,766.323178,476.014557,704.576366,0.559383,0.888712
2,2021-09,choa chu kang,458,choa chu kang avenue 4,executive,premium apartment,142.0,yes,2000.0,1.378101,...,peng siang,choa chu kang,west region,1800.0,1231.831781,1208.372363,213.468076,936.378087,0.559383,0.888712
3,2021-09,sembawang,340a,sembawang close,5 room,premium apartment,110.0,yes,2001.0,1.447534,...,sembawang central,sembawang,north region,2100.0,299.913615,8571.934879,444.346799,189.552356,0.559383,0.888712
4,2021-09,bishan,117,bishan street 12,5 room,improved,121.0,yes,1987.0,1.347017,...,bishan east,bishan,central region,2750.0,445.013019,2188.355884,618.812729,402.450976,0.559383,0.888712


# Cleaning the dataset


## 1. Deleting unnecessary columns

## 2. Converting all string values to lowercase

## 3. Converting the date to a Unix timestamp

## 4. Finding an ordinal value for flat type and flat model

## 5. One hot encoding

## 6. Removing duplicate rows

# Ensuring that columns are normalized

The following columns need to be normalized:
1. rent_approval_date: as it is now a unix timestamp
2. lease_commence_date: as it is just a year value
3. floor_area_sqm
4. all the computed distance values to the nearest MRT stations (existing and planned), schools and malls
5. coe_price_indicator (from auxiliary dataset)
6. stock_price (from auxiliary dataset)

In [None]:
# Normalize each column listed in the markdown above, one column at a time.
#
# NOTE(review): `df_no_duplicate` is not assigned anywhere in the visible part
# of this notebook (the cleaning sections contain no code cells), so this cell
# raises NameError on a fresh "Restart & Run All" — confirm the cleaning cells
# are restored before relying on it.
columns_to_normalize = [
    "rent_approval_date",
    "lease_commence_date",
    "floor_area_sqm",
    "distance_to_nearest_existing_mrt",
    "distance_to_nearest_planned_mrt",
    "distance_to_nearest_school",
    "distance_to_nearest_mall",
    "coe_price_indicator",
    "stock_price",
]

df_normalized = df_no_duplicate
for column_name in columns_to_normalize:
    # `df_unnormalized` is kept only for backward compatibility: after the loop
    # it is bound to the frame as it was just before the last column was
    # processed, which is exactly what the original chain of calls left behind.
    df_unnormalized = df_normalized
    # presumably returns the frame with `column_name` rescaled — confirm
    # against the definition of normalize_column in the process module
    df_normalized = normalize_column(df_normalized, column_name)

# Checking what the final cleaned dataset looks like

In [None]:
# Preview the first five rows of the cleaned and normalized frame.
df_normalized.head(n=5)