In [1]:
import pandas as pd
from geopy.distance import geodesic

### Read train data and test data

In [2]:
# Load the training split, report its row count, then preview the first rows.
df_train = pd.read_csv("train_new.csv")
n_train_records = len(df_train)
print('Number of train records: {}'.format(n_train_records))
df_train.head()

Number of train records: 60000


Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3-room,new generation,67.0,1983,1.344518,103.73863,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,1978,1.330186,103.938717,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,1972,1.320502,103.863341,bendemeer,kallang,central region,2100


In [3]:
# Load the test split, report its row count, then preview the first rows.
df_test = pd.read_csv("test_new.csv")
n_test_records = len(df_test)
print('Number of test records: {}'.format(n_test_records))
df_test.head()

Number of test records: 30000


Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,1984,1.358411,103.891722,lorong ah soo,hougang,north-east region
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,1999,1.446343,103.820817,sembawang central,sembawang,north region
2,2023-07,clementi,708,Clementi West Street 2,4-room,new generation,91.0,1980,1.305719,103.762168,clementi west,clementi,west region
3,2021-08,jurong east,351,Jurong East Street 31,3-room,model a,74.0,1986,1.344832,103.730778,yuhua west,jurong east,west region
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,1983,1.345437,103.735241,yuhua west,jurong east,west region


### Drop columns: 'block', 'street_name', 'subzone', 'planning_area'

In [4]:
# Remove the same four columns from both splits; errors='ignore' tolerates
# columns that are already absent (e.g. when this cell is re-run).
columns_to_drop = ['block', 'street_name', 'subzone', 'planning_area']
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')
df_test = df_test.drop(columns=columns_to_drop, errors='ignore')

### Read existing MRT stations data

In [5]:
# Load the station table, report its row count, then preview the first rows.
df_mrt = pd.read_csv("auxiliary-data/sg-mrt-existing-stations.csv")
n_mrt_records = len(df_mrt)
print('Number of mrt station records: {}'.format(n_mrt_records))
df_mrt.head()

Number of mrt station records: 162


Unnamed: 0,code,name,opening_year,latitude,longitude
0,NS1,Jurong East,1990,1.333295,103.742154
1,NS2,Bukit Batok,1990,1.349035,103.749526
2,NS3,Bukit Gombak,1990,1.358663,103.751913
3,NS4,Choa Chu Kang,1990,1.385363,103.744371
4,NS5,Yew Tee,1996,1.397476,103.747418


### Calculate the distance (km) to the nearest MRT station

In [6]:
def calculate_min_distance(row):
    """Return the geodesic distance (km) from ``row`` to its closest station.

    Written to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'latitude'`` and ``'longitude'`` entries
        (for example, one row of a DataFrame).

    Returns
    -------
    float
        The smallest distance in kilometres between ``row`` and any station
        in the module-level ``df_mrt`` frame, or ``float('inf')`` when
        ``df_mrt`` has no rows.
    """
    # The row's own location is the same for every station, so build it once
    # instead of once per loop iteration.
    origin = (row['latitude'], row['longitude'])

    min_distance = float('inf')
    # Iterate the two coordinate columns directly: ``iterrows`` would build a
    # full Series for every station on every call, which is pure overhead here.
    for station in zip(df_mrt['latitude'], df_mrt['longitude']):
        distance = geodesic(origin, station).kilometers
        if distance < min_distance:
            min_distance = distance
    return min_distance

In [7]:
# Row-wise pass over the training frame: one nearest-station distance per row.
train_min_distance = df_train.apply(calculate_min_distance, axis=1)
df_train['min_distance'] = train_min_distance

In [8]:
# Row-wise pass over the test frame: one nearest-station distance per row.
test_min_distance = df_test.apply(calculate_min_distance, axis=1)
df_test['min_distance'] = test_min_distance

In [9]:
# Preview the first rows of the training frame, which now carries `min_distance`.
df_train.head()

Unnamed: 0,rent_approval_date,town,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,region,monthly_rent,min_distance
0,2021-09,jurong east,3-room,new generation,67.0,1983,1.344518,103.73863,west region,1600,0.699301
1,2022-05,bedok,4-room,new generation,92.0,1978,1.330186,103.938717,east region,2250,0.899077
2,2022-10,toa payoh,3-room,improved,67.0,1971,1.332242,103.845643,central region,1900,0.218801
3,2021-08,pasir ris,executive,apartment,149.0,1993,1.370239,103.962894,east region,2850,1.547366
4,2022-11,kallang/whampoa,3-room,improved,68.0,1972,1.320502,103.863341,central region,2100,0.18793


In [10]:
# Preview the first rows of the test frame, which now carries `min_distance`.
df_test.head()

Unnamed: 0,rent_approval_date,town,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,region,min_distance
0,2023-01,hougang,5-room,improved,121.0,1984,1.358411,103.891722,north-east region,0.82103
1,2022-09,sembawang,4-room,model a,100.0,1999,1.446343,103.820817,north region,0.306192
2,2023-07,clementi,4-room,new generation,91.0,1980,1.305719,103.762168,west region,1.092259
3,2021-08,jurong east,3-room,model a,74.0,1986,1.344832,103.730778,west region,0.361691
4,2022-03,jurong east,5-room,improved,121.0,1983,1.345437,103.735241,west region,0.447607


In [11]:
# Write both augmented frames to CSV without the pandas index column.
for frame, out_path in (
    (df_train, "train_mrt_dis.csv"),
    (df_test, "test_mrt_dis.csv"),
):
    frame.to_csv(out_path, index=False)

In [None]:
# from math import radians, cos, sin, asin, sqrt
# def distance(lat1, lat2, lon1, lon2):
     
#     # The math module contains a function named
#     # radians which converts from degrees to radians.
#     lon1 = radians(lon1)
#     lon2 = radians(lon2)
#     lat1 = radians(lat1)
#     lat2 = radians(lat2)
      
#     # Haversine formula
#     dlon = lon2 - lon1
#     dlat = lat2 - lat1
#     a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
 
#     c = 2 * asin(sqrt(a))
    
#     # Radius of earth in kilometers. Use 3956 for miles
#     r = 6371
      
#     # calculate the result
#     return(c * r)
     
     
# # driver code
# lat1 = df_train.iloc[4]['latitude']
# lat2 = df_mrt.iloc[0]['latitude']
# lon1 = df_train.iloc[4]['longitude']
# lon2 = df_mrt.iloc[0]['longitude']
# print(lat1, lat2, lon1, lon2)
# print(distance(lat1, lat2, lon1, lon2), "K.M")

In [None]:
# result = []
# for i in range(len(df_train)):
#     min_dis = None
#     for j in range(len(df_mrt)):
#         lat1 = df_train.iloc[i]['latitude']
#         lat2 = df_mrt.iloc[j]['latitude']
#         lon1 = df_train.iloc[i]['longitude']
#         lon2 = df_mrt.iloc[j]['longitude']
#         curr_dis = distance(lat1, lat2, lon1, lon2)
#         if min_dis is None or curr_dis < min_dis:
#             min_dis = curr_dis
#     result.append(min_dis)