In [1]:
import pandas as pd

In [2]:
import numpy as np

### Get stations data

In [3]:
# Load the station records; later cells read the 'lat', 'lng' and 'desc' columns.
stations = pd.read_json(path_or_buf='./bkk-stations.json')

In [4]:
# Centre the maps on the mean station coordinate.
bkk_lat = stations['lat'].mean()
bkk_lng = stations['lng'].mean()

In [5]:
import folium
# Base map centred on the mean station coordinate.
map_bkk = folium.Map(location=[bkk_lat, bkk_lng], zoom_start=12)

# One blue circle marker per station; the popup shows the station's 'desc' value.
station_rows = zip(stations['lat'], stations['lng'], stations['desc'])
for lat, lng, name in station_rows:
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=folium.Popup(name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_bkk)

map_bkk

### Get the reference list of all categories

In [6]:
import requests

In [7]:
import os

# Foursquare credentials are read from the environment so that real keys never
# have to be typed into (and committed with) the notebook. When the variables
# are unset they fall back to empty strings.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [8]:
# Download the full Foursquare category tree.
categories_url = 'https://api.foursquare.com/v2/venues/categories?v={}&client_id={}&client_secret={}'.format(VERSION,CLIENT_ID,CLIENT_SECRET)
categories_response = requests.get(categories_url, timeout=30)
# Fail with the HTTP error (bad credentials, quota, ...) instead of an opaque
# KeyError when the error payload has no 'categories' entry.
categories_response.raise_for_status()
raw_categories = categories_response.json()['response']['categories']

In [9]:
def flatten_categories(category, main_category):
    """Flatten a category tree into a list of rows.

    Parameters
    ----------
    category : dict
        Category node with 'id' and 'shortName' keys and, optionally, a
        'categories' list holding its sub-categories.
    main_category : dict
        The top-level category the node belongs to; only its 'id' is read.

    Returns
    -------
    list of [id, shortName, main_id]
        One row for `category` followed by one row per descendant, in
        depth-first order. Every row carries the id of `main_category`.
    """
    output = [[category['id'], category['shortName'], main_category['id']]]
    # A leaf may carry an empty list, None, or no 'categories' key at all.
    for sub_category in category.get('categories') or []:
        output.extend(flatten_categories(sub_category, main_category))
    return output

In [10]:
# Flatten every top-level category tree into [id, shortName, main_id] rows and
# stack the rows of all trees into a single table indexed by category id.
category_rows = np.concatenate(
    [flatten_categories(top_level, top_level) for top_level in raw_categories],
    axis=0,
)
all_categories = pd.DataFrame(category_rows, columns=['id', 'desc', 'main_id']).set_index('id')

# The top-level categories are exactly the ids that occur in the 'main_id' column.
main_category_ids = all_categories['main_id'].unique()
main_categories = all_categories.loc[main_category_ids, ['desc']]

In [11]:
# Keep every top-level category except 'Food'.
is_not_food = main_categories['desc'].ne('Food')
selected_categories = main_categories.loc[is_not_food]

### Get venues near each station

In [12]:
def searchNearbyVenues(ref, lat, lng, categoryIds, radius=650, LIMIT=100, timeout=30):
    """Search Foursquare for venues around one reference point.

    Parameters
    ----------
    ref
        Label of the reference point. It is printed as a progress marker and
        stored in the 'reference' column of every returned row.
    lat, lng
        Coordinates of the search centre, sent as the `ll` query parameter.
    categoryIds : str
        Value of the `categoryId` query parameter (the caller passes a
        comma-separated list of category ids).
    radius
        Value of the `radius` query parameter.
    LIMIT
        Value of the `limit` query parameter.
    timeout
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue that has at least one category, with the
        columns 'reference', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category ID'
        (the id of the venue's first listed category). The frame is empty, but
        keeps these columns, when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    print(ref)

    # Let requests build and encode the query string.
    response = requests.get(
        'https://api.foursquare.com/v2/venues/search',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
            'intent': 'browse',
            'categoryId': categoryIds,
        },
        timeout=timeout,
    )
    # Surface authentication / quota problems as an HTTP error instead of a
    # KeyError on the missing 'venues' entry.
    response.raise_for_status()
    results = response.json()['response']['venues']

    # Keep only the relevant fields. Venues without any category are skipped
    # because they cannot be assigned a category id.
    venues_list = [(
        ref,
        lat,
        lng,
        v['name'],
        v['location']['lat'],
        v['location']['lng'],
        v['categories'][0]['id']) for v in results if v.get('categories')]

    nearby_venues = pd.DataFrame(venues_list,
                                 columns = ['reference',
                                            'Neighborhood Latitude',
                                            'Neighborhood Longitude',
                                            'Venue',
                                            'Venue Latitude',
                                            'Venue Longitude',
                                            'Venue Category ID'])

    return nearby_venues

In [13]:
# The category filter is the same for every station, so build the
# comma-separated id string once and reuse it for each search.
category_filter = ','.join(selected_categories.index.values)
venues = [
    searchNearbyVenues(station.desc, station.lat, station.lng, category_filter)
    for _, station in stations.iterrows()
]

HUA
SAM
LUM
KHO
SIR
PET
RAM
CUL
HUI
SUT
RAT
LAT
PHA
KAM
BAN
N8
N7
N5
N4
N3
N2
N1
CEN
E1
E2
E3
E4
E5
E6
E7
E8
E9
E10
E11
E12
E13
E14
W1
S1
S2
S3
S5
S6
S7
S8
S9
S10
S11
S12


In [14]:
# Stack the per-station frames into one table with a fresh 0..n-1 index.
venues = pd.concat(venues, axis=0, ignore_index=True)

In [15]:
# Replace each venue's category id with the id of its top-level category.
venues['Venue Category ID'] = venues['Venue Category ID'].map(all_categories['main_id'])

# Drop venues that resolve to the 'Food' top-level category. The explicit
# copy makes the result an independent frame, so the column assignment in a
# later cell does not raise SettingWithCopyWarning.
food_category_id = main_categories[main_categories['desc']=='Food'].index[0]
venues = venues[venues['Venue Category ID']!=food_category_id].copy()

In [16]:
# One evenly spaced hue per selected category. Dividing the colour wheel by
# the number of categories (rather than a fixed 3) keeps hues from wrapping
# around, so every category gets its own colour.
n_categories = len(selected_categories)
palette = ["hsl({}, 100%, 50%)".format(360*x/n_categories) for x in range(n_categories)]
map_venues = folium.Map(location=[bkk_lat, bkk_lng], zoom_start=12)

# Draw every venue as a small circle coloured by its top-level category.
for lat, lng, cate in zip(venues['Venue Latitude'], venues['Venue Longitude'], venues['Venue Category ID']):
    color = palette[selected_categories.index.get_loc(cate)]
    folium.Circle(
        [lat, lng],
        radius=5,
        opacity=0.7,
        color=color,
    ).add_to(map_venues)

map_venues

In [17]:
# Attach the human-readable name of each venue's top-level category.
venues['Category'] = venues['Venue Category ID'].map(selected_categories['desc'])

# Count venues per (station, category) and pivot the categories into columns;
# combinations that never occur become 0.
venue_counts = venues.groupby(['reference', 'Category'])['Venue'].count()
stations_x_categories = venue_counts.unstack('Category').fillna(0)

The "Event" category appears to be an anomaly.

In [18]:
# Total number of venues per category, summed over all stations.
stations_x_categories.sum(axis=0)

Category
Arts & Entertainment      74.0
College & Education       95.0
Event                      2.0
Nightlife                134.0
Outdoors & Recreation    182.0
Professional             590.0
Residence                243.0
Shops                    608.0
Travel                   390.0
dtype: float64

An event is unlikely to be a permanent venue anyway, so remove the "Event" category.

In [19]:
# Remove the 'Event' feature column and the matching category row.
# Re-assigning (instead of inplace=True) with errors='ignore' keeps the cell
# re-runnable and also tolerates a run in which no 'Event' column exists.
stations_x_categories = stations_x_categories.drop(columns=['Event'], errors='ignore')
selected_categories = selected_categories[selected_categories['desc']!='Event']

Normalize the features by dividing each category column by its maximum.

In [20]:
# Scale each category column by its own maximum.
column_max = stations_x_categories.max()
stations_x_categories = stations_x_categories.div(column_max, axis='columns')

In [21]:
# Display the max-normalised station-by-category matrix used as clustering input.
stations_x_categories

Category,Arts & Entertainment,College & Education,Nightlife,Outdoors & Recreation,Professional,Residence,Shops,Travel
reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BAN,0.0,0.0,0.0,0.3,1.0,0.083333,0.416667,0.26087
CEN,1.0,0.0,0.1,0.4,0.2,0.0,1.0,0.217391
CUL,0.6,0.0,0.4,0.5,0.52,0.333333,0.666667,0.086957
E1,0.2,0.0,0.4,0.5,0.28,0.0,0.75,0.565217
E10,0.0,0.0,0.1,0.3,0.48,0.666667,0.75,0.173913
E11,0.0,0.125,0.1,0.2,0.48,0.916667,0.708333,0.173913
E12,0.1,0.125,0.2,0.8,0.28,0.583333,0.625,0.304348
E13,0.0,0.0,0.0,0.7,0.68,0.25,0.416667,0.130435
E14,0.0,0.041667,0.0,0.2,0.28,0.75,0.5,0.304348
E2,0.1,0.0,0.5,0.2,0.44,0.083333,0.375,0.826087


In [22]:
from sklearn.cluster import KMeans
# Number of clusters to fit.
kclusters = 4
# n_init is pinned explicitly because its default differs between
# scikit-learn versions; together with the fixed random_state this keeps the
# clustering reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(stations_x_categories)

In [23]:
# Cluster label of every station, indexed by the station reference.
station_index = stations_x_categories.index.rename('reference')
prediction = pd.DataFrame({'cluster': kmeans.labels_}, index=station_index)

In [24]:
# Attach each station's cluster label to its record, matching on 'desc'.
stations_by_desc = stations.set_index('desc')
stations_x_prediction = stations_by_desc.join(prediction, how='left').reset_index()

In [25]:
map_predicted = folium.Map(location=[bkk_lat, bkk_lng], zoom_start=12)
# One evenly spaced hue per cluster.
palette = ["hsl({}, 100%, 50%)".format(360*x/kclusters) for x in range(0,kclusters)]
for lat, lng, name, cluster in zip(stations_x_prediction['lat'], stations_x_prediction['lng'], stations_x_prediction['desc'], stations_x_prediction['cluster']):
    # A station without any venue rows has no entry in `prediction`, so the
    # join leaves its cluster as NaN and turns the whole column into floats.
    # Draw such stations in gray and cast the rest to int for list indexing.
    marker_color = 'gray' if pd.isna(cluster) else palette[int(cluster)]
    label = folium.Popup(name, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7
    ).add_to(map_predicted)

map_predicted

### Describe the meaning of each cluster

In [26]:
from scipy import stats

In [27]:
# Feature matrix with the cluster label of each station appended.
stations_x_categories_prediction = stations_x_categories.join(prediction)
cluster_labels = stations_x_categories_prediction['cluster']

# For every (cluster, category) pair run a one-way ANOVA comparing the
# category values of stations inside the cluster with those outside it.
distinction_categories = []
for k in range(kclusters):
    in_cluster = cluster_labels == k
    members = stations_x_categories_prediction[in_cluster]
    others = stations_x_categories_prediction[~in_cluster]
    for c in selected_categories['desc']:
        f_val, p_val = stats.f_oneway(members[c], others[c])
        distinction_categories.append((k, c, f_val, p_val))

# Keep only the pairs with P below 0.05 and list them per cluster, largest F first.
summary = pd.DataFrame(distinction_categories, columns=['cluster', 'category', 'F', 'P'])
summary = summary.loc[summary['P'].lt(0.05)]
summary.sort_values(by=['cluster', 'F'], ascending=False)

Unnamed: 0,cluster,category,F,P
28,3,Professional,26.061741,5.891631e-06
29,3,Residence,18.271038,9.288891e-05
30,3,Shops,17.559918,0.0001216915
31,3,Travel,16.312009,0.0001971431
26,3,Nightlife,6.215488,0.01624311
21,2,Residence,91.370287,1.345993e-12
23,2,Travel,11.554251,0.001386561
18,2,Nightlife,6.767196,0.01237659
16,2,Arts & Entertainment,4.13855,0.04758152
8,1,Arts & Entertainment,37.466271,1.765371e-07


In [28]:
# Widen text columns so the joined station lists are not cut off at the default width.
pd.set_option('display.max_colwidth', 80)

# Comma-separated list of member stations per cluster, highest cluster id first.
members_by_cluster = prediction.reset_index().groupby('cluster').agg(','.join)
members_by_cluster.sort_index(ascending=False)

Unnamed: 0_level_0,reference
cluster,Unnamed: 1_level_1
3,"BAN,E2,E3,E4,HUA,KHO,LUM,N2,N3,N4,N8,S1,S2,S3,S5,S6,SIR"
2,"E10,E11,E12,E13,E14,E6,E7,E8,E9,HUI,KAM,LAT,N5,N7,RAM,RAT,S10,S11,S7,S8,S9,SUT"
1,"CEN,CUL,E1,E5,N1,PHA,W1"
0,"PET,S12,SAM"
