In [1]:
import pandas as pd
import numpy as np
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

**Import the data by scraping the Wikipedia page**

In [2]:
# Download the Wikipedia page and parse the HTML tables it contains
# (the result is indexed below, one entry per parsed table).
df = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)

# The first parsed table is the one targeted by the scraping.
data_table = df[0]
data_table.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**Remove data with "Not assigned" value for borough**


In [3]:
# Keep only the rows whose borough is assigned, then renumber the rows from 0
assigned_rows = data_table["Borough"] != "Not assigned"
data_table = data_table.loc[assigned_rows].reset_index(drop=True)

In [4]:
# Preview the first five rows after dropping the unassigned boroughs
data_table.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Impute every "Not assigned" neighbourhood with the borough of that same row.
# A row-aligned boolean mask is required here: calling Series.replace on the
# whole column would hand every placeholder the same borough value, and a
# chained in-place replace on a column selection is not guaranteed to write
# back to data_table.
unassigned = data_table["Neighbourhood"] == "Not assigned"
data_table.loc[unassigned, "Neighbourhood"] = data_table.loc[unassigned, "Borough"]

**Number of samples**

In [6]:
# Report how many rows the cleaned table holds
print("Number of rows in the dataframe : ", len(data_table))

Number of rows in the dataframe :  103


In [None]:
# Plain Python list of the postal codes, in table order
postal_code = [code for code in data_table["Postal Code"]]

**Import Geospatial data**

In [10]:
# Load the coordinates table (Postal Code, Latitude, Longitude) from the local CSV file
new_df = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
new_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Attach latitude/longitude to each postal code.
# how="inner" spells out the default join: only postal codes present in both tables are kept.
data = data_table.merge(right=new_df, how="inner", on="Postal Code")
data.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


**Keep only the boroughs containing the word "Toronto"**

In [None]:
# We only work with the boroughs whose name contains the word "Toronto".
# The test is vectorised over the whole column:
#   regex=False -> plain substring match, the same check as Python's `in`
#   na=False    -> a missing borough counts as "no match" instead of raising
is_toronto = data["Borough"].str.contains("Toronto", regex=False, na=False)

# index labels of the matching rows
index_to_keep = list(data.index[is_toronto])

# filter the data and renumber the rows from 0
Toronto_data = data.loc[index_to_keep, :].reset_index(drop=True)
Toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [None]:
# Create map of Toronto using latitude and longitude values #

# Column values consumed by the marker loop below; these names are also
# read again by the cluster-map cell further down the notebook.
borough = Toronto_data["Borough"]
neighbourhood = Toronto_data["Neighbourhood"]
latitude = list(Toronto_data["Latitude"])
longitude = list(Toronto_data["Longitude"])

# initialize the map, centred on the first row of Toronto_data
map_toronto = folium.Map(location=[latitude[0], longitude[0]], zoom_start=10)

# add one circle marker per row
for lat, lng, brgh, neigh in zip(latitude, longitude, borough, neighbourhood):
    # Popup text reads "<neighbourhood>, <borough>"
    label = folium.Popup(f"{neigh}, {brgh}", parse_html=True)
    # NOTE: parse_html is an option of Popup (set above); CircleMarker does not
    # define it, so it is not passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color="blue",
        fill=True,
        fill_color="#3186cc",
        fill_opacity=0.7,
    ).add_to(map_toronto)

# display the map
map_toronto

In [None]:
# Preview the first five rows of the Toronto-only table
Toronto_data.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [None]:
# How many rows does the Toronto-only dataframe hold?
print(f"Number of rows in Toronto data : {len(Toronto_data)}")

Number of rows in Toronto data : 39


**One-hot encoding of the Postal Code and Borough columns**

In [None]:
# One-hot encode the two categorical columns; each indicator column is named
# "<source column>_<value>" (e.g. "Borough_East Toronto").
categorical_cols = ["Borough", "Postal Code"]
Toronto_onehot = pd.get_dummies(Toronto_data[categorical_cols])
Toronto_onehot.head(n=5)

Unnamed: 0,Borough_Central Toronto,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Postal Code_M4E,Postal Code_M4K,Postal Code_M4L,Postal Code_M4M,Postal Code_M4N,Postal Code_M4P,...,Postal Code_M5X,Postal Code_M6G,Postal Code_M6H,Postal Code_M6J,Postal Code_M6K,Postal Code_M6P,Postal Code_M6R,Postal Code_M6S,Postal Code_M7A,Postal Code_M7Y
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Carry the descriptive columns over unchanged, appended after the indicator columns
for passthrough_col in ("Neighbourhood", "Latitude", "Longitude"):
    Toronto_onehot[passthrough_col] = Toronto_data[passthrough_col]

Toronto_onehot.head(n=5)

Unnamed: 0,Borough_Central Toronto,Borough_Downtown Toronto,Borough_East Toronto,Borough_West Toronto,Postal Code_M4E,Postal Code_M4K,Postal Code_M4L,Postal Code_M4M,Postal Code_M4N,Postal Code_M4P,...,Postal Code_M6J,Postal Code_M6K,Postal Code_M6P,Postal Code_M6R,Postal Code_M6S,Postal Code_M7A,Postal Code_M7Y,Neighbourhood,Latitude,Longitude
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"Regent Park, Harbourfront",43.65426,-79.360636
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"Garden District, Ryerson",43.657162,-79.378937
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,St. James Town,43.651494,-79.375418
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,The Beaches,43.676357,-79.293031


**Cluster the data into 4 clusters**

In [None]:
# Cluster the one-hot encoded rows with k-means (no normalisation is applied).
# KMeans itself is imported in the notebook's import cell.

# Only the indicator columns are features; the descriptive columns that were
# copied into Toronto_onehot are excluded.
non_feature_cols = ("Neighbourhood", "Latitude", "Longitude")
features = [feat for feat in Toronto_onehot.columns if feat not in non_feature_cols]

kclusters = 4

# k-means starts from random centroids, so without a seed the label numbers
# (and possibly the groups) change from run to run. Pinning random_state keeps
# the labels stable on the same data and library version; n_init is set
# explicitly so the number of restarts does not depend on the installed
# scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10)
kmeans.fit(Toronto_onehot[features])

# labels assigned to the first five rows
kmeans.labels_[0:5]


array([0, 0, 0, 0, 3], dtype=int32)

In [None]:
# Attach each row's k-means label to the Toronto dataframe #
cluster_assignments = kmeans.labels_
Toronto_data["cluster_labels"] = cluster_assignments
Toronto_data.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,cluster_labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,3


In [None]:

# initialize the cluster map, centred on the first row of Toronto_data
map_clusters = folium.Map(
    location=[
        float(Toronto_data["Latitude"].iloc[0]),
        float(Toronto_data["Longitude"].iloc[0]),
    ],
    zoom_start=10,
)

# colors: one hex colour per cluster, evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map, reading every value straight from Toronto_data so
# coordinates, names and labels are guaranteed to come from the same rows
for lat, lon, neigh, cluster in zip(
    Toronto_data["Latitude"],
    Toronto_data["Longitude"],
    Toronto_data["Neighbourhood"],
    Toronto_data["cluster_labels"],
):
    # Popup text reads "<neighbourhood> Cluster <label>"
    label = folium.Popup(f"{neigh} Cluster {cluster}", parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(map_clusters)

# display the cluster map
map_clusters

In [None]:
# Cluster 1 (label 0) : Downtown Toronto rows in the recorded output

Toronto_data.loc[Toronto_data["cluster_labels"] == 0]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,cluster_labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,0
10,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,0
13,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,0


In [None]:
# Cluster 2 (label 1) : Central Toronto rows in the recorded output
Toronto_data.loc[Toronto_data["cluster_labels"] == 1]


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,cluster_labels
18,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1
19,M5N,Central Toronto,Roselawn,43.711695,-79.416936,1
20,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1
21,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307,1
23,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678,1
24,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,1
26,M4S,Central Toronto,Davisville,43.704324,-79.38879,1
29,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,1
31,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,1


In [None]:
# Cluster 3 (label 2) : West Toronto rows in the recorded output
Toronto_data.loc[Toronto_data["cluster_labels"] == 2]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,cluster_labels
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,2
11,M6J,West Toronto,"Little Portugal, Trinity",43.647927,-79.41975,2
14,M6K,West Toronto,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191,2
22,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763,2
25,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325,2
28,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445,2


In [None]:
# Cluster 4 (label 3) : East Toronto rows in the recorded output
Toronto_data.loc[Toronto_data["cluster_labels"] == 3]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,cluster_labels
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,3
12,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,3
15,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,3
17,M4M,East Toronto,Studio District,43.659526,-79.340923,3
38,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,3
