# Segmenting and Clustering Neighborhoods in Toronto

### Task 1: Scraping the table

#### (a) Import packages

In [1]:
import numpy as np
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup

#### (b) Read table into json and then pandas dataframe

In [2]:
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
postalcodes = pd.read_html(str(table))
postalcodes_json = postalcodes[0].to_json(orient='records')
postalcodes = pd.read_json(postalcodes_json)

#### (c) Remove missing boroughs, sort and reset index

In [3]:
postalcodes = postalcodes.query('Borough != "Not assigned"')
postalcodes.sort_values(by = "Postal Code", axis = 0, ascending = True, inplace = True)
postalcodes.reset_index(drop =True, inplace = True)
postalcodes = postalcodes.rename(columns = {'Postal Code':'PostalCode'})
postalcodes

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


#### (d) Print the shape of the dataframe

In [4]:
print(postalcodes.shape)

(103, 3)


### Task 2: Get location data

After being unable to get results using the Geocoder package, I'm using the CSV provided.

#### (a) Read CSV of postcode locations

In [5]:
# Read the CSV
coords = pd.read_csv('https://cocl.us/Geospatial_data')

# Rename Postal Code column so it matches df - this avoids using left_on and right_on in merge
coords.columns = ['PostalCode', 'Latitude', 'Longitude']
coords

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


#### (b) Merge the two dataframes

In [6]:
torontodf = pd.merge(postalcodes, coords, how='left', on='PostalCode')
torontodf.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Task 3: Explore and cluster the neighborhoods in Toronto

#### (a) Import packages

In [7]:
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors

#### (b) Get Toronto latitude and longitude

In [8]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
torontolocation = geolocator.geocode(address)
torontolatitude = torontolocation.latitude
torontolongitude = torontolocation.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(torontolatitude, torontolongitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


#### (c) Create map with postcodes marked

In [9]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[torontolatitude, torontolongitude], zoom_start=11)

# add markers to map
for lat, lng, postal_code, borough, neighbourhood in zip(torontodf['Latitude'], torontodf['Longitude'], torontodf['PostalCode'], torontodf['Borough'], torontodf['Neighbourhood']):
    label = '{}, {}, {}'.format(postal_code, neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### (d) Define Foursquare credentials and version

In [10]:
CLIENT_ID = 'VKSYDPNJNRYWDFOGX343VHXJVXTCSYEA2SB4D5OB3GGXO5KS' # your Foursquare ID
CLIENT_SECRET = '4F3YJSXCMZRM2FE0N3OZ5PQRPAQGP0ITB2PJQ2M1MLV2EYY1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: VKSYDPNJNRYWDFOGX343VHXJVXTCSYEA2SB4D5OB3GGXO5KS
CLIENT_SECRET:4F3YJSXCMZRM2FE0N3OZ5PQRPAQGP0ITB2PJQ2M1MLV2EYY1


#### (e) Define function for getting venues

In [11]:
def getNearbyVenues(postal_codes, boroughs, neighbourhoods, latitudes, longitudes, radius=500):
    
    venues_list=[]
    LIMIT = 100
    
    for postal_code, borough, neighbourhood, lat, lng in zip(postal_codes, boroughs, neighbourhoods, latitudes, longitudes):
        print('{}, {}, {}'.format(postal_code, borough, neighbourhood))
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            postal_code,
            borough,
            neighbourhood,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode',
                  'Borough',
                  'Neighbourhood',
                  'PostalCode Latitude', 
                  'PostalCode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### (e) Get Toronto venues

In [12]:
toronto_venues = getNearbyVenues(postal_codes=torontodf['PostalCode'],
                                 boroughs=torontodf['Borough'],
                                 neighbourhoods=torontodf['Neighbourhood'],
                                   latitudes=torontodf['Latitude'],
                                   longitudes=torontodf['Longitude']
                                  )

M1B, Scarborough, Malvern, Rouge


KeyError: 'groups'

#### (f) Investigate results

In [None]:
# Get shape of venue dataframe
print(toronto_venues.shape)

# Show top of venue dataframe
toronto_venues

In [None]:
# Show number of results per area
toronto_venues.groupby('PostalCode').count()['Borough']

In [None]:
#Show the number of unique venue categories
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

#### (g) Analyse each postal code

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['PostalCode'] = toronto_venues['PostalCode'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# show top of resulting dataframe
toronto_onehot.head()

In [None]:
# Get size of toronto_onehot dataframe
toronto_onehot.shape

In [None]:
# Group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
toronto_grouped

In [None]:
# Get size of grouped dataframe
toronto_grouped.shape

#### (h) Cluster by Postal Code

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('PostalCode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

#### (i) Merge cluster data and visualise

In [None]:
# Function to return most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
# Get most common venues in each PostalCode
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postalcodes_venues_sorted = pd.DataFrame(columns=columns)
postalcodes_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

for ind in np.arange(toronto_grouped.shape[0]):
    postalcodes_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

postalcodes_venues_sorted.head()

In [None]:
# add clustering labels
postalcodes_venues_sorted.insert(0, 'Cluster Labels', np.array(kmeans.labels_, dtype='int'))
postalcodes_venues_sorted.head()

In [None]:
toronto_merged = torontodf

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(postalcodes_venues_sorted.set_index('PostalCode'), on='PostalCode')
toronto_merged['Cluster Labels'] = pd.to_numeric(toronto_merged['Cluster Labels'], downcast='integer')

# Drop rows with NA - i.e. where no venues were found on Foursquare
toronto_merged = toronto_merged.dropna()

#Show dataframe
toronto_merged 

In [None]:
toronto_merged['Cluster Labels'].value_counts()

In [None]:
map_clusters = folium.Map(location=[torontolatitude, torontolongitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### (j) Examine clusters

In [None]:
# Cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
# Cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
# Cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
# Cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

In [None]:
# Cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]