### importing libraries

In [202]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

### scraping and cleaning

after scraping html text from the url, i extracted the column names and the rows data into a list and converted it into a dataframe. then i removed unassigned boroughs and checked if there were any unassigned neighborhoods. since there were no unassigned neighborhoods after clearing unassigned boroughs, we can skip the step to reassign neighborhoods

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_text = requests.get(url).text
soup = BeautifulSoup(html_text,'lxml')
table = soup.find("table", attrs = {'class':'wikitable'}) #class="wikitable sortable"
table_tr = table.tbody.find_all('tr')
columns = ['Postal Code','Borough','Neighborhood'] #declaring column names
rows = []
for data in table_tr[1:]: #skipping first row because they contain headers
    row = [] 
    row.append(data.find_all('td')[0].text.replace('\n','').strip()) #postal code of each row
    row.append(data.find_all('td')[1].text.replace('\n','').strip()) #borough of each row
    row.append(data.find_all('td')[2].text.replace('\n','').strip()) #neighborhood of each row
    rows.append(row) #adding the list of the three attributes to another list so it can be used as input for DF
df = pd.DataFrame(rows,columns=columns)
df = df[~(df['Borough']=='Not assigned')] #removing unassigned boroughs
unassigned_neighborhoods = 0 #counting unassigned neighborhoods after removing unassigned boroughs
for i, row in df.iterrows():
    if row.Neighborhood == 'Not assigned':
        unassigned_neighborhoods+=1
print('Unassigned Neighborhoods = ',unassigned_neighborhoods)
df.head()

Unassigned Neighborhoods =  0


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [3]:
df.shape

(103, 3)

### extracting geospatial data

In [4]:
!wget -q -O 'geodata.csv' https://cocl.us/Geospatial_data

In [5]:
df_geo = pd.read_csv('geodata.csv')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [44]:
df_merged = df.join(df_geo.set_index('Postal Code'),on='Postal Code')
df_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### displaying all postal codes in toronto

In [45]:
df_merged = df_merged[df_merged.Neighborhood.str.contains('Toronto')]
df_merged.reset_index(inplace=True,drop=True)
df_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4J,East York,"East Toronto, Broadview North (Old East York)",43.685347,-79.338106
1,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
2,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
3,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
4,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
5,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores",43.605647,-79.501321
6,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558


In [46]:
latlng = Nominatim(user_agent='toronto').geocode('Toronto')
latitude = latlng.latitude-0.15
longitude = latlng.longitude+0.1
map_toronto = folium.Map(location=[latitude,longitude],zoom_start=10)
for lat, lng, postal, borough in zip(df_merged['Latitude'],df_merged['Longitude'],df_merged['Postal Code'],df_merged['Borough']):
    pop = '{},{}'.format(postal,borough)
    label = folium.Popup(pop)
    folium.CircleMarker(
    [lat,lng],
    radius=3,
    popup=label,
    color='blue',
    fill=True).add_to(map_toronto)
map_toronto

### clustering of venues by category

this cell contains id and secret and hence it's deprecated

In [215]:
CLIENT_ID = 'client_id' # your Foursquare ID
CLIENT_SECRET = 'client_secret' # your Foursquare Secret
ACCESS_TOKEN = 'access_token'
VERSION = '20180605' # Foursquare API version

to understand the json response format, a single toronto neighborhood is called

In [61]:
lat = df_merged.loc[0,'Latitude']
lng = df_merged.loc[0,'Longitude']
radius = 500
limit = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    ACCESS_TOKEN,
    VERSION,
    lat,
    lng,
    radius,
    limit
)


results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f0545820ff35d2a391be16e'},
 'notifications': [{'type': 'notificationTray', 'item': {'unreadCount': 0}}],
 'response': {'headerLocation': 'Greektown',
  'headerFullLocation': 'Greektown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.6898470045, 'lng': -79.33189528390383},
   'sw': {'lat': 43.6808469955, 'lng': -79.34431771609616}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '591e690841868654556b5a00',
       'name': 'Ben Martin Cocher EPK BTS Toronto',
       'location': {'address': '28 Inwood Ave',
        'lat': 43.6853932,
        'lng': -79.33633739999999,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.6853932,
          'lng': -79.3363

In [70]:
#figuring out the values by traversing through json
results['response']['groups'][0]['items'][0]['venue']['name']

'Ben Martin Cocher EPK BTS Toronto'

In [71]:
#making df for venues
venues = []
for lat,lng,postal in zip(df_merged['Latitude'],df_merged['Longitude'],df_merged['Postal Code']):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&oauth_token={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        ACCESS_TOKEN,
        VERSION,
        lat,
        lng,
        radius,
        limit
    )
    results = requests.get(url).json()['response']['groups'][0]['items']
    for result in results:
        venues.append([postal,lat,lng,result['venue']['name'],result['venue']['location']['lat'],result['venue']['location']['lng'],result['venue']['categories'][0]['name']])
columns = ['Postal Code','Neighborhood Latitude','Neighborhood Longitude','Venue','Latitude','Longitude','Category']        
df_venues = pd.DataFrame(venues,columns=columns)

In [72]:
df_venues.head()

Unnamed: 0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Latitude,Longitude,Category
0,M4J,43.685347,-79.338106,Ben Martin Cocher EPK BTS Toronto,43.685393,-79.336337,Film Studio
1,M4J,43.685347,-79.338106,Aldwych Park,43.684901,-79.341091,Park
2,M4J,43.685347,-79.338106,Sammon Convenience,43.686951,-79.335007,Convenience Store
3,M4J,43.685347,-79.338106,Donlands Subway Station,43.68096,-79.337759,Metro Station
4,M5J,43.640816,-79.381752,Harbourfront,43.639526,-79.380688,Neighborhood


In [73]:
df_venues.shape

(347, 7)

In [74]:
len(df_venues['Category'].unique())

137

### making dictionaries out of each category count per neighborhood

In [150]:
out_df = pd.DataFrame()
for code in df_venues['Postal Code'].unique():
    temp = dict(df_venues[df_venues['Postal Code']==code]['Category'].value_counts().head(10))
    temp2 = {'Postal Code':code}
    temp3 = temp2.update(temp)
    print(df_venues[df_venues['Postal Code']==code]['Postal Code'].unique())
    print(temp2)
    out_df = out_df.append(temp2,ignore_index=True)
    print('-----')
out_df.fillna(0,inplace=True)
out_df.set_index('Postal Code')

['M4J']
{'Postal Code': 'M4J', 'Metro Station': 1, 'Film Studio': 1, 'Convenience Store': 1, 'Park': 1}
-----
['M5J']
{'Postal Code': 'M5J', 'Coffee Shop': 12, 'Aquarium': 5, 'Hotel': 4, 'Café': 4, 'Restaurant': 3, 'Brewery': 3, 'Sporting Goods Shop': 3, 'Scenic Lookout': 3, 'Fried Chicken Joint': 3, 'Music Venue': 2}
-----
['M5K']
{'Postal Code': 'M5K', 'Coffee Shop': 14, 'Café': 7, 'Restaurant': 6, 'Hotel': 6, 'Seafood Restaurant': 3, 'Deli / Bodega': 3, 'Japanese Restaurant': 2, 'Pizza Place': 2, 'Tea Room': 2, 'Salad Place': 2}
-----
['M4R']
{'Postal Code': 'M4R', 'Clothing Store': 5, 'Cosmetics Shop': 3, 'Spa': 2, 'Sporting Goods Shop': 2, 'Salon / Barbershop': 2, 'Coffee Shop': 2, 'Restaurant': 1, 'Metro Station': 1, 'Fast Food Restaurant': 1, 'Café': 1}
-----
['M5S']
{'Postal Code': 'M5S', 'Café': 7, 'Sandwich Place': 4, 'Coffee Shop': 3, 'Restaurant': 2, 'Bakery': 2, 'Bookstore': 2, 'Japanese Restaurant': 2, 'Italian Restaurant': 2, 'Bar': 2, 'Pub': 2}
-----
['M8V']
{'Postal Co

Unnamed: 0_level_0,Convenience Store,Film Studio,Metro Station,Park,Aquarium,Brewery,Café,Coffee Shop,Fried Chicken Joint,Hotel,...,Business Service,Gym,Liquor Store,Auto Workshop,Burrito Place,Butcher,Gym / Fitness Center,Light Rail Station,Recording Studio,Skate Park
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M4J,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M5J,0.0,0.0,0.0,0.0,5.0,3.0,4.0,12.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M5K,0.0,0.0,0.0,0.0,0.0,0.0,7.0,14.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M4R,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M5S,0.0,0.0,0.0,0.0,0.0,0.0,7.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M8V,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,...,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M7Y,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0


Swapping columns

In [162]:
out_df.reset_index()
columns = out_df.columns.tolist()
index = columns.index('Postal Code')
columns = [columns[index]]+columns[:index]+columns[index+1:]
out_df = out_df[columns]
out_df

Unnamed: 0,Postal Code,Convenience Store,Film Studio,Metro Station,Park,Aquarium,Brewery,Café,Coffee Shop,Fried Chicken Joint,...,Business Service,Gym,Liquor Store,Auto Workshop,Burrito Place,Butcher,Gym / Fitness Center,Light Rail Station,Recording Studio,Skate Park
0,M4J,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M5J,0.0,0.0,0.0,0.0,5.0,3.0,4.0,12.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M5K,0.0,0.0,0.0,0.0,0.0,0.0,7.0,14.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4R,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M5S,0.0,0.0,0.0,0.0,0.0,0.0,7.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M8V,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,...,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M7Y,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0


#### Normalizing

In [178]:
df_index = out_df.loc[:,'Postal Code']
df_test = out_df.loc[:,out_df.columns!='Postal Code'].div(7)
df_test.index = df_index
df_test.reset_index()

Unnamed: 0,Postal Code,Convenience Store,Film Studio,Metro Station,Park,Aquarium,Brewery,Café,Coffee Shop,Fried Chicken Joint,...,Business Service,Gym,Liquor Store,Auto Workshop,Burrito Place,Butcher,Gym / Fitness Center,Light Rail Station,Recording Studio,Skate Park
0,M4J,0.142857,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M5J,0.0,0.0,0.0,0.0,0.714286,0.428571,0.571429,1.714286,0.428571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M5K,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4R,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.285714,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M5S,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.428571,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M8V,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.285714,0.142857,...,0.285714,0.285714,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M7Y,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.142857,0.142857,0.142857,0.285714,0.285714,0.142857,0.142857


In [184]:
df_test.index

Index(['M4J', 'M5J', 'M5K', 'M4R', 'M5S', 'M8V', 'M7Y'], dtype='object', name='Postal Code')

### k Means
since i have only 7 neighborhoods in toronto unlike nyc manhattan, 4 clusters is high to start

In [187]:
kmeans = KMeans(n_clusters=4,random_state=0).fit(df_test)
kmeans.labels_

array([2, 3, 1, 2, 0, 2, 2], dtype=int32)

In [188]:
df_test['Clusters'] = kmeans.labels_
df_test

Unnamed: 0_level_0,Convenience Store,Film Studio,Metro Station,Park,Aquarium,Brewery,Café,Coffee Shop,Fried Chicken Joint,Hotel,...,Gym,Liquor Store,Auto Workshop,Burrito Place,Butcher,Gym / Fitness Center,Light Rail Station,Recording Studio,Skate Park,Clusters
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M4J,0.142857,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
M5J,0.0,0.0,0.0,0.0,0.714286,0.428571,0.571429,1.714286,0.428571,0.571429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
M5K,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.857143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
M4R,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.285714,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
M5S,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.428571,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
M8V,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.285714,0.142857,0.0,...,0.285714,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
M7Y,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.142857,0.142857,0.142857,0.285714,0.285714,0.142857,0.142857,2


### joining data with latlong for mapping

In [196]:
df_final = df_merged.join(df_test,on=df_test.index)

In [197]:
df_final

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Convenience Store,Film Studio,Metro Station,Park,Aquarium,...,Gym,Liquor Store,Auto Workshop,Burrito Place,Butcher,Gym / Fitness Center,Light Rail Station,Recording Studio,Skate Park,Clusters
0,M4J,East York,"East Toronto, Broadview North (Old East York)",43.685347,-79.338106,0.142857,0.142857,0.142857,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,0.0,0.0,0.0,0.0,0.714286,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,0.0,0.0,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores",43.605647,-79.501321,0.0,0.0,0.0,0.0,0.0,...,0.285714,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
6,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.142857,0.142857,0.142857,0.285714,0.285714,0.142857,0.142857,2


### mapping
visualisation code used per nyc lab

In [211]:
map_clusters = folium.Map(location=[latitude+0.15, longitude], zoom_start=11)
x = np.arange(4)
ys = [i + x + (i*x)**2 for i in range(4)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
for lat, lon, poi, cluster in zip(df_final['Latitude'], df_final['Longitude'], df_final['Postal Code'], df_final['Clusters']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

In [212]:
map_clusters