In [88]:
import json # library to handle JSON files
import os # read API credentials from the environment
import warnings # warn when API credentials are missing

import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
import requests # library to handle requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [89]:
# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be revoked/rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    # Warn rather than raise so cells that never call the API can still run.
    warnings.warn(
        'Foursquare credentials are not set: export FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET before running the API cells.'
    )
VERSION = '20180604' # sent as the `v` query parameter of the API request
LIMIT = 100 # sent as the `limit` query parameter of the API request

In [90]:
# Load the cleaned neighborhood sample from disk and preview its first rows.
neighborhoods_subset = pd.read_csv(filepath_or_buffer="sg_random_samles_clean.csv")
neighborhoods_subset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Neighborhood,Latitude,Longitude
0,0,BEDOKVILLE,1.323681,103.947566
1,1,BRADDELL HEIGHTS ESTATE,1.349106,103.865046
2,2,BUKIT LOYANG ESTATE,1.361056,103.964245
3,3,CAPITOL PARK,1.328713,103.817978
4,4,CASHEW GREEN,1.373039,103.770301


In [91]:
# Take the row labelled 0 (BEDOKVILLE in this dataset) as the reference location.
neighborhood_name, neighborhood_latitude, neighborhood_longitude = (
    neighborhoods_subset.loc[0, column]
    for column in ('Neighborhood', 'Latitude', 'Longitude')
)
# Query settings: `radius` and `LIMIT` are passed as the `radius` and `limit`
# parameters of the Foursquare request.
radius = 500
LIMIT = 100

In [92]:
# Build the Foursquare "explore" request for venues around the selected location.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
if not response.ok:
    # Fail here with the HTTP status instead of a confusing KeyError later on.
    # The URL is deliberately left out of the message because it embeds the
    # client secret.
    raise RuntimeError(
        'Foursquare request failed with HTTP status {}'.format(response.status_code)
    )
results = response.json()

In [93]:
# Pull the venue items out of the first group of the API response.
first_group = results['response']['groups'][0]
venues = first_group['items']
# Flatten the nested venue records into a tabular dataframe.
nearby_venues = json_normalize(venues)

In [94]:
# Keep only the venue name, its category list and its coordinates.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues[filtered_columns]
# The categories column is left as the raw list of category dicts returned by
# the API; no category-name extraction is applied in this cell.
# Strip the dotted prefixes so that e.g. 'venue.location.lat' becomes 'lat'.
nearby_venues.columns = [column.rsplit(".", 1)[-1] for column in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,New Changi Eating House,"[{'id': '4bf58dd8d48988d142941735', 'name': 'A...",1.323117,103.945954
1,Amy's Laksa,"[{'id': '4bf58dd8d48988d1d1941735', 'name': 'N...",1.323111,103.945957
2,Mama Shop @ Blk 165,"[{'id': '4bf58dd8d48988d118951735', 'name': 'G...",1.320112,103.946199
3,Tanah Merah Playground,"[{'id': '4bf58dd8d48988d1e7941735', 'name': 'P...",1.327528,103.946284
4,Changi Naval Base Pick Up Point @ Tanah Merah,"[{'id': '4bf58dd8d48988d1fe931735', 'name': 'B...",1.327365,103.945903


In [95]:
# Centre the map on the selected neighborhood.
venues_map = folium.Map(location=[neighborhood_latitude, neighborhood_longitude], zoom_start=15)
# add a red circle marker to represent the selected neighborhood; the popup uses
# the name read from the dataframe so it stays correct if another row is chosen
folium.CircleMarker(
    [neighborhood_latitude, neighborhood_longitude],
    radius=10,
    color='red',
    popup=str(neighborhood_name),
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)
# add all venues as blue circle markers
for lat, lng, categories in zip(nearby_venues.lat, nearby_venues.lng, nearby_venues.categories):
    # `categories` is the raw list of category dicts from the API; show the
    # first category's name rather than the repr of the whole list.
    if isinstance(categories, (list, tuple)) and categories:
        label = str(categories[0].get('name', 'Uncategorized'))
    elif isinstance(categories, str):
        # Already reduced to a plain category name.
        label = categories
    else:
        # Empty list or missing value.
        label = 'Uncategorized'
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)
display(venues_map)

In [96]:
# Load the full geo dataset that the 500-row sample is later drawn from.
neighborhoods = pd.read_csv('singapore_geo.csv')

In [97]:
# Report the number of distinct postcodes and the total number of rows.
distinct_postcode_count = len(neighborhoods['Postcode'].unique())
total_row_count = neighborhoods.shape[0]
summary = 'The dataframe has {} postcodes and {} neighborhoods.'.format(
    distinct_postcode_count, total_row_count
)
print(summary)

The dataframe has 85265 postcodes and 155744 neighborhoods.


In [98]:
# Preview the first rows of the full dataset.
neighborhoods.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
0,0,18906,SINGAPORE CHINESE CULTURAL CENTRE,1.275829,103.849576
1,1,18907,TEMPORARY SITE OFFICE,1.27495,103.851665
2,3,18915,TEMPORARY SITE OFFICE,1.273682,103.860075
3,4,18925,CITIBANK TRADE_BRANCH,1.276424,103.854759
4,5,18925,DBS Marina Bay MRT Station,1.276427,103.854598


In [99]:
# Randomly select 500 rows (or every row when fewer than 500 are available).
# A fixed seed makes the draw identical on every Restart & Run All, so the
# downstream tables and counts are reproducible.
SAMPLE_SIZE = 500
RANDOM_SEED = 42
neighborhoods_subset = neighborhoods.sample(
    n=min(SAMPLE_SIZE, len(neighborhoods)),
    random_state=RANDOM_SEED,
)
neighborhoods_subset.head()

Unnamed: 0.1,Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
34839,52820,419910,FRANKEL ESTATE,1.319396,103.914566
127685,194594,588905,MAYFAIR PARK,1.34075,103.779755
98572,156599,465607,TANAH MERAH GREEN,1.329652,103.943983
39790,65432,458411,OPERA ESTATE,1.319086,103.926208
95575,152847,456648,CASA NALLUR,1.311367,103.932649


In [100]:
# Discard the leftover index columns whose names start with "Unnamed".
is_unnamed_column = neighborhoods_subset.columns.str.startswith('Unnamed')
neighborhoods_subset = neighborhoods_subset.loc[:, ~is_unnamed_column]
print(neighborhoods_subset.shape)
neighborhoods_subset.head()

(500, 4)


Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
34839,419910,FRANKEL ESTATE,1.319396,103.914566
127685,588905,MAYFAIR PARK,1.34075,103.779755
98572,465607,TANAH MERAH GREEN,1.329652,103.943983
39790,458411,OPERA ESTATE,1.319086,103.926208
95575,456648,CASA NALLUR,1.311367,103.932649


In [101]:
# Confirm the (rows, columns) size of the sample after dropping the index columns.
neighborhoods_subset.shape

(500, 4)

In [102]:
# Rows named CONSERVATION AREA refer to several different locations, therefore
# they are excluded from the analysis.
to_drop = ['CONSERVATION', 'AREA']
is_conservation_area = neighborhoods_subset.Neighborhood.str.contains("CONSERVATION AREA")
neighborhoods_subset = neighborhoods_subset[~is_conservation_area]
neighborhoods_subset.shape

(465, 4)

In [103]:
# Preview the sample after the CONSERVATION AREA rows have been removed.
neighborhoods_subset.head()

Unnamed: 0,Postcode,Neighborhood,Latitude,Longitude
34839,419910,FRANKEL ESTATE,1.319396,103.914566
127685,588905,MAYFAIR PARK,1.34075,103.779755
98572,465607,TANAH MERAH GREEN,1.329652,103.943983
39790,458411,OPERA ESTATE,1.319086,103.926208
95575,456648,CASA NALLUR,1.311367,103.932649


In [104]:
# Cleaning step: count how many sampled rows share each neighborhood name.
# After count(), every non-key column holds a number of non-null rows rather
# than its original values.
rows_per_neighborhood = neighborhoods_subset.groupby('Neighborhood').count()
group_by_occurence = rows_per_neighborhood.reset_index()
group_by_occurence.head()

Unnamed: 0,Neighborhood,Postcode,Latitude,Longitude
0,24-HOUR WALK-IN CLINIC (MOUNT ALVERNIA HOSPITAL),1,1,1
1,ADELPHI PARK ESTATE,1,1,1
2,AFFLUENCE COURT,1,1,1
3,ALEGRIA,1,1,1
4,ALIWAL PARK HOTEL,1,1,1


In [105]:
# group_by_occurence comes from groupby(...).count(), so its "Latitude" column is
# a per-neighborhood row count; sorting on it lists the most repeated names first.
group_by_occurence.sort_values(by="Latitude",ascending=False).head()

Unnamed: 0,Neighborhood,Postcode,Latitude,Longitude
205,OPERA ESTATE,14,14,14
249,SERANGOON GARDEN ESTATE,8,8,8
125,HDB-JURONG WEST,7,7,7
92,FABER HILLS,7,7,7
154,KEMBANGAN ESTATE,6,6,6


In [106]:
# Neighborhood names that occur more than once in the sample
# ("Latitude" holds the row count here).
df_tmp = group_by_occurence.loc[group_by_occurence['Latitude'] > 1]

In [107]:
# Number of neighborhood names that appear more than once.
df_tmp.shape

(57, 4)

In [108]:
# Neighborhood names that occur exactly once in the sample
# ("Latitude" holds the row count here).
df_tmp_2 = group_by_occurence.loc[group_by_occurence['Latitude'] == 1]
df_tmp_2.head()

Unnamed: 0,Neighborhood,Postcode,Latitude,Longitude
0,24-HOUR WALK-IN CLINIC (MOUNT ALVERNIA HOSPITAL),1,1,1
1,ADELPHI PARK ESTATE,1,1,1
2,AFFLUENCE COURT,1,1,1
3,ALEGRIA,1,1,1
4,ALIWAL PARK HOTEL,1,1,1


In [109]:
def subset_data_frame(input_df1, source_df=None):
    """Return one (Neighborhood, Latitude, Longitude) row per name in ``input_df1``.

    For every value of ``input_df1.Neighborhood`` (order and repeats preserved),
    the coordinates of the first row of ``source_df`` carrying that name are
    used.

    Parameters
    ----------
    input_df1 : pandas.DataFrame
        Frame whose ``Neighborhood`` column lists the names to look up.
    source_df : pandas.DataFrame, optional
        Frame with ``Neighborhood``, ``Latitude`` and ``Longitude`` columns that
        supplies the coordinates. Defaults to the notebook-level
        ``neighborhoods_subset``, which is what this function always used
        before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``Neighborhood``, ``Latitude``, ``Longitude`` (in that order)
        with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If a requested name has no row in ``source_df``. The exception type is
        kept from the original positional lookup for backward compatibility.
    """
    if source_df is None:
        # Resolved at call time so the current notebook state is used.
        source_df = neighborhoods_subset

    # Keep only the first row per name, then build the lookup tables once
    # instead of re-filtering the whole frame for every requested name.
    first_rows = source_df.drop_duplicates(subset="Neighborhood", keep="first")
    latitude_by_name = dict(zip(first_rows["Neighborhood"], first_rows["Latitude"]))
    longitude_by_name = dict(zip(first_rows["Neighborhood"], first_rows["Longitude"]))

    names = list(input_df1.Neighborhood)
    missing = [name for name in names if name not in latitude_by_name]
    if missing:
        raise IndexError(
            "No coordinates found for neighborhood(s): {}".format(missing)
        )

    df_temp_coord = pd.DataFrame(
        {
            "Neighborhood": names,
            "Latitude": [latitude_by_name[name] for name in names],
            "Longitude": [longitude_by_name[name] for name in names],
        },
        columns=["Neighborhood", "Latitude", "Longitude"],
    )

    return df_temp_coord

In [110]:
# Coordinates (taken from the first matching row) for names that occur more than once.
df_part1 = subset_data_frame(df_tmp)

In [111]:
# Coordinates for names that occur exactly once in the sample.
df_part2 = subset_data_frame(df_tmp_2)
df_part2.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,24-HOUR WALK-IN CLINIC (MOUNT ALVERNIA HOSPITAL),1.341499,103.837774
1,ADELPHI PARK ESTATE,1.355015,103.828192
2,AFFLUENCE COURT,1.35015,103.883062
3,ALEGRIA,1.324303,103.841018
4,ALIWAL PARK HOTEL,1.303419,103.860235


In [112]:
# Stack the two coordinate tables. DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0; pd.concat is the supported equivalent. Row labels are left
# untouched (no ignore_index) to match what append produced.
# NOTE: df_part1 is reassigned, so re-running this cell stacks df_part2 again.
df_part1 = pd.concat([df_part1, df_part2])
df_part1.shape

(345, 3)