# This is Part 1

In [1]:
import numpy as np
import pandas as pd


scrape wiki table to dataframe

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html parses every HTML table on the page into a DataFrame;
# the first one is used as the postal-code table.
wiki_tables = pd.read_html(wiki_url)
toronto_df = wiki_tables[0]


Drop rows whose Borough is 'Not assigned', and give every 'Not assigned' Neighbourhood the name of its Borough

In [3]:
# Keep only rows with an assigned borough. The explicit .copy() makes the
# result an independent frame, so the assignment below is guaranteed to
# modify it (no SettingWithCopyWarning, and correct under Copy-on-Write).
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned'].copy()

# Column labels are reused later when the per-postal-code frame is created.
columns = toronto_df.columns

# A neighbourhood that is 'Not assigned' takes the name of its borough.
# A single .loc call replaces the original chained indexing
# (toronto_df.Neighbourhood[mask] = ...), which may write into a temporary.
unassigned = toronto_df.Neighbourhood == 'Not assigned'
toronto_df.loc[unassigned, 'Neighbourhood'] = toronto_df.loc[unassigned, 'Borough']
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


Find the unique postal codes and create an empty dataframe with one row per postal code

In [4]:
# Distinct postal codes, in order of first appearance in toronto_df.
postalcode_list = pd.unique(toronto_df['Postcode']).tolist()

# Placeholder frame: same columns as toronto_df, one (still empty) row
# per distinct postal code.
n_postalcodes = len(postalcode_list)
postalcode_df = pd.DataFrame(index=range(n_postalcodes), columns=columns)


fill the dataframe with the correctly formatted values

In [5]:
# Collapse toronto_df to one row per postal code. Within each postal code the
# distinct boroughs and the distinct neighbourhoods are joined into
# comma-separated strings.
#
# This replaces a row-by-row loop that wrote through `postalcode_df.iloc[i].X = ...`.
# That form assigns into a temporary row Series (chained assignment) and is not
# guaranteed to reach the frame; it also re-scanned toronto_df twice per code.
# sort=False keeps the postal codes in order of first appearance, matching
# the order produced by .unique().
postalcode_df = (
    toronto_df
    .groupby('Postcode', sort=False)[['Borough', 'Neighbourhood']]
    .agg(lambda values: ', '.join(values.unique()))
    .reset_index()
)

postalcode_df

    

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


In [6]:
# Sanity check: (rows, columns) of the per-postal-code frame.
postalcode_df.shape

(103, 3)

# Part 2

I can't get the geocoder library to work, so the coordinates are read from the provided CSV file instead:

In [7]:
# Load the coordinates and rename the key column to 'Postcode' so it
# matches the column name used in postalcode_df.
latlong_df = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'Postcode'})
)
latlong_df

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [8]:
# Attach latitude/longitude to every postal code. The join is an inner join
# (the pandas default), so only codes present in both frames are kept.
postalloc_df = postalcode_df.merge(latlong_df, how='inner', on='Postcode')
postalloc_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509


# Part 3
I will cluster the neighborhoods by location using agglomerative clustering with 4 clusters.

In [9]:
import folium
import seaborn as sns

In [None]:
# Import the clustering algorithm and feed it the location data.

In [11]:
from sklearn.cluster import AgglomerativeClustering

# Coordinates of every postal code as plain arrays.
lats = np.asarray(postalloc_df['Latitude'].tolist())
longs = np.asarray(postalloc_df['Longitude'].tolist())

# pos has one row of latitudes and one row of longitudes; its transpose
# holds one (latitude, longitude) sample per postal code, which is the
# layout passed to fit().
pos = np.stack([lats, longs])
samples = pos.T

# Average-linkage agglomerative clustering into 4 groups; fit() returns the
# fitted estimator itself.
agglom = AgglomerativeClustering(n_clusters=4, linkage='average').fit(samples)

# Store each postal code's cluster label next to its coordinates.
postalloc_df['Distance_label'] = agglom.labels_

plot markers with color according to agglomeration label

In [12]:
# Centre the map on the mean coordinate of all postal codes.
# `center_location` was previously not defined anywhere in the notebook, so
# this cell raised NameError on a fresh kernel; it is now derived from the data.
center_location = [
    float(postalloc_df.Latitude.mean()),
    float(postalloc_df.Longitude.mean()),
]
map_toronto = folium.Map(location=center_location, zoom_start=10)

# Cluster labels are used as indices into this list, so it needs one
# colour per cluster.
colors = ['black', 'red', 'blue', 'green']

# One small filled circle per postal code, coloured by its cluster label.
for entry in postalloc_df.itertuples(index=False):
    color = colors[entry.Distance_label]
    folium.CircleMarker(
        location=(entry.Latitude, entry.Longitude),
        radius=2,
        color=color,
        fill=True,  # boolean flag (was the string 'true')
        fill_color=color,
        fill_opacity=1,
    ).add_to(map_toronto)

map_toronto