# Import requried dependencies

In [1]:
import os
import pandas as pd
import numpy as np
import math

# Defining Local paths to datasets

In [2]:
datasets_path = "../../../datasets"
auxillary_path = f"{datasets_path}/auxiliary-data"
transformed_path = f"{datasets_path}/transformed"

train_inp_path = f"{datasets_path}/train.csv"
mrt_existing_inp_path = f"{auxillary_path}/sg-mrt-existing-stations.csv"
mrt_planned_inp_path = f"{auxillary_path}/sg-mrt-planned-stations.csv"
primary_schools_inp_path = f"{auxillary_path}/sg-primary-schools.csv"
shopping_malls_inp_path = f"{auxillary_path}/sg-shopping-malls.csv"

# What are we planning to do ?

Since we have the following information:
1. existing mrts and their latitude & longitudes
2. planned mrts and their latitude & longitudes
3. primary schools and their latitude & longitudes
4. shopping malls and their latitude & longitudes

It is fair to assume that the prices also get influenced by the number of such locations in its vicinity. For e.g. a house next to a mall could potentially be pricy.

Since we also have the latitude and longitude for each house, we can find out the distance to the nearest existing/planned mrt, school and shopping mall and add those distances in meters as new columns to our dataset.

Our regression models should then have enough context to potentially learn some unseen rules from these newly added columns as well.

<b> How do we find distances given a latitude and longitude? </b>

There were mainly two options which immediately came to our mind.
1. Using google maps api to find out the walking distance to each point
2. Using an approximation of "haversine distance" to compute the distance along the surface area of earth

Option #1 was not feasible given the API prices therefore we are going for a much simpler haversine distance computation

In [3]:
# For computing haversine distances
def haversine_distance(lat1, lon1, lat2, lon2):
    '''
    Compute the distances between 2 lat and long

    :param lat1:
    :param lon1:
    :param lat2:
    :param lon2:
    :return:
    '''
    # Radius of the Earth in kilometers
    earth_radius_km = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Haversine formula
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Calculate the distance
    distance = earth_radius_km * c

    #returns distance in meters
    return distance * 1000


# 0. Reading all the dataframes and doing minor preprocessing steps

In [4]:
# read all the input dfs
df_house = pd.read_csv(train_inp_path)
df_mrt = pd.read_csv(mrt_existing_inp_path)
df_mrt_planned = pd.read_csv(mrt_planned_inp_path)
df_mall = pd.read_csv(shopping_malls_inp_path)
df_school = pd.read_csv(primary_schools_inp_path)


In [5]:
# remove df where the opening year is TBA
df_mrt_planned = df_mrt_planned[df_mrt_planned['opening_year'] != 'TBA']

# get list of coordinates for everything
lat_long_of_houses = list(zip(df_house['latitude'], df_house['longitude']))
lat_long_of_mrts = list(zip(df_mrt['latitude'], df_mrt['longitude']))
lat_long_of_mrts_planned = list(zip(df_mrt_planned['latitude'], df_mrt_planned['longitude']))
lat_long_of_malls = list(zip(df_mall['latitude'], df_mall['longitude']))
lat_long_of_school = list(zip(df_school['latitude'], df_school['longitude']))

# needed when calculating for planned mrts
house_rent_approval_year = df_house['rent_approval_date'].tolist()
mrt_opening_year = df_mrt_planned['opening_year'].tolist()

# 1. Computing the distances of each house to its nearest existing MRT


In [6]:
def calculate_distance_to_nearest_mrt(list1, list2):
    '''
    Calculate Haversine distances between all pairs of points in two lists.
    Each list contains points represented as (latitude, longitude) tuples.

    :param list1:
    :param list2:
    :return:
    '''

    distances_to_closest_mrt_for_all_houses = []

    for point1 in list1:
        lat1, lon1 = point1
        distances_to_all_mrts = []
        for point2 in list2:
            lat2, lon2 = point2
            # distance = (geodesic(point1, point2).kilometers ) * 1000
            distance = haversine_distance(lat1, lon1, lat2, lon2)
            distances_to_all_mrts.append(distance)
        distance_to_closest_mrt = min(distances_to_all_mrts)
        distances_to_closest_mrt_for_all_houses.append(distance_to_closest_mrt)

    return distances_to_closest_mrt_for_all_houses


In [8]:
# calculate all distances
distances_to_closest_mrt_for_all_houses = calculate_distance_to_nearest_mrt(lat_long_of_houses, lat_long_of_mrts)
df_house['distance_to_nearest_existing_mrt'] = distances_to_closest_mrt_for_all_houses


# 2. Computing the distances of each house to its nearest planned MRT

Since none of these MRT's exist currently they should not be given the same weightage as existing MRTs.

Hence, we defined a damping factor called "alpha" which accounts for the expected future distance to the planned MRT while also considering the years of construction left to inaugurate the MRT.
<br>
$distance\_to\_planned\_mrt = estimated\_real\_world\_distance + 100e^{years\_left\_for\_opening}$
<br>
The further in future a particular planned MRT is to be opened, the distance to that MRT is also increased accordingly by considering the wait time.

This way the values here would represent some kind of "future reward" which the model can learn.

In [16]:
def apply_alpha(distance, years):
    '''
    apply some kind of weight to ensure that future planned MRTs have more distance (to indicate the patience level of the customer)

    :param distance:
    :param years:
    :return:
    '''
    distance = distance + (math.exp(years) * 100)
    return distance


def calculate_distance_to_nearest_mrt_planned(lat_long_of_houses, house_rent_approval_year, lat_long_of_mrts, mrt_opening_year):
    '''
    Calculate Haversine distances between all pairs of points in two lists.
    Each list contains points represented as (latitude, longitude) tuples.

    :param lat_long_of_houses:
    :param house_rent_approval_year:
    :param lat_long_of_mrts:
    :param mrt_opening_year:
    :return:
    '''

    num_houses = len(lat_long_of_houses)
    num_mrt = len(lat_long_of_mrts)
    distances_to_closest_mrt_for_all_houses = []

    for house_index in range(num_houses):
        point1 = lat_long_of_houses[house_index]
        lat1, lon1 = point1
        distances_to_all_mrts = []

        for mrt_index in range(num_mrt):
            point2 = lat_long_of_mrts[mrt_index]
            lat2, lon2 = point2
            distance = haversine_distance(lat1, lon1, lat2, lon2)
            distances_to_all_mrts.append(distance)

        min_index = distances_to_all_mrts.index(min(distances_to_all_mrts))
        distance_to_closest_mrt = distances_to_all_mrts[min_index]
        rent_approval_date = house_rent_approval_year[house_index]
        rent_approval_year = rent_approval_date.split('-')[0]
        metro_opening_year = mrt_opening_year[min_index]

        years_left_for_metro_opening = int(rent_approval_year) - int(metro_opening_year)
        distance_to_closest_mrt = apply_alpha(distance_to_closest_mrt, years_left_for_metro_opening)
        distances_to_closest_mrt_for_all_houses.append(distance_to_closest_mrt)

    return distances_to_closest_mrt_for_all_houses



In [10]:
distances_to_closest_mrt_planned_for_all_houses = calculate_distance_to_nearest_mrt_planned(lat_long_of_houses, house_rent_approval_year, lat_long_of_mrts_planned,mrt_opening_year)
df_house['distance_to_nearest_planned_mrt'] = distances_to_closest_mrt_planned_for_all_houses

# 3. Computing the distances of each house to its nearest primary school

In [11]:
def calculate_distance_to_nearest_school(list1, list2):
    '''
    Calculate Haversine distances between all pairs of points in two lists.
    Each list contains points represented as (latitude, longitude) tuples.

    :param list1:
    :param list2:
    :return:
    '''

    distances_to_closest_school_for_all_houses = []

    for point1 in list1:
        lat1, lon1 = point1
        distances_to_all_school = []
        for point2 in list2:
            lat2, lon2 = point2
            distance = haversine_distance(lat1, lon1, lat2, lon2)
            distances_to_all_school.append(distance)
        distance_to_closest_school = min(distances_to_all_school)
        distances_to_closest_school_for_all_houses.append(distance_to_closest_school)

    return distances_to_closest_school_for_all_houses



In [12]:
distances_to_closest_school_for_all_houses = calculate_distance_to_nearest_school(lat_long_of_houses, lat_long_of_school)
df_house['distance_to_nearest_school'] = distances_to_closest_school_for_all_houses

# 4. Computing the distances of each house to its nearest shopping mall

In [13]:
def calculate_distance_to_nearest_mall(list1, list2):
    '''
    Calculate Haversine distances between all pairs of points in two lists.
    Each list contains points represented as (latitude, longitude) tuples.

    :param list1:
    :param list2:
    :return:
    '''

    distances_to_closest_mall_for_all_houses = []

    for point1 in list1:
        lat1, lon1 = point1
        distances_to_all_mall = []
        for point2 in list2:
            lat2, lon2 = point2
            distance = haversine_distance(lat1, lon1, lat2, lon2)
            distances_to_all_mall.append(distance)
        distance_to_closest_mall = min(distances_to_all_mall)
        distances_to_closest_mall_for_all_houses.append(distance_to_closest_mall)

    return distances_to_closest_mall_for_all_houses


In [14]:
distances_to_closest_mall_for_all_houses = calculate_distance_to_nearest_mall(lat_long_of_houses, lat_long_of_malls)
df_house['distance_to_nearest_mall'] = distances_to_closest_mall_for_all_houses

# 5. Putting it all together

In [15]:
df_house.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent,distance_to_nearest_existing_mrt,distance_to_nearest_planned_mrt,distance_to_nearest_school,distance_to_nearest_mall
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600,699.127003,675.092874,334.846135,1202.673513
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250,898.991282,904.343701,607.716465,1114.338361
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900,218.603012,3716.441532,425.76016,468.296504
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850,1546.040421,516.218553,564.969272,402.358778
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100,187.855816,2831.638339,271.723351,1073.353709
