# Segmenting and Clustering Neighborhoods in Toronto

Note: To see the maps generated in Part 3, use this link: https://nbviewer.jupyter.org/github/oscar-philomath/Coursera_Capstone/blob/main/Segmenting%20and%20Clustering%20Neighborhoods%20in%20Toronto.ipynb

## Part 1:

### The goal of this part is to read in the data and format it to create a usable data frame

In [39]:
#Import Needed Libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup 
from pandas.io.json import json_normalize
from IPython.display import display_html

In [40]:
#Import the data from Wikipedia using Beautiful Soup
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Reuse `url` instead of repeating the literal, bound the wait with a timeout,
# and fail loudly on an HTTP error rather than silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
# Print the page <title> as a quick check that the expected page was fetched
print(soup.title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [41]:
#Create the table and parse it to create a pandas data frame in the format we want
table_contents = []
# soup.find returns the first <table> on the page; each <td> in it is parsed as one postal-code entry
table = soup.find('table')
for td in table.find_all('td'):
    # Cells whose <span> reads 'Not assigned' carry no borough/neighborhood and are skipped
    if td.span.text == 'Not assigned':
        continue
    # The span text is split on '(' : the part before is the borough,
    # the part after is the neighborhood list (separated by ' /')
    span_parts = td.span.text.split('(')
    table_contents.append({
        'PostalCode': td.p.text[:3],
        'Borough': span_parts[0],
        'Neighborhood': span_parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' '),
    })

#Add the correct Boroughs
# A few borough strings come out fused with extra text; map them to readable names
df = pd.DataFrame(table_contents)
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

In [42]:
#Preview the first five rows to confirm the parse produced the PostalCode, Borough and Neighborhood columns
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [44]:
#Last part: check the (rows, columns) shape of the parsed data frame
df.shape

(103, 3)

## Part 2

### The goal of this part is to add the latitude and longitude coordinates for each neighborhood


In [57]:
#We start by importing the CSV file that maps each postal code to its latitude and longitude:
coordinates = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv')
#Preview the first rows (columns: 'Postal Code', 'Latitude', 'Longitude')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [112]:
#Let's create a copy of our data frame from the previous part for this part.
#A plain assignment would only alias `df`; .copy() makes df2 an independent frame.
df2 = df.copy()

In [113]:
#We use pandas' merge method to combine the two data frames

#First, we format: align the key column name with df's 'PostalCode'.
#Reassigning (instead of inplace=True) keeps this cell safe to re-run.
coordinates = coordinates.rename(columns={'Postal Code':'PostalCode'})

#Now we merge. We always merge from the Part 1 frame `df` (not from df2 itself),
#so re-running this cell never produces duplicated Latitude_x/Latitude_y columns.
df2 = pd.merge(df, coordinates, on='PostalCode')
df2.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Part 3

### The goal of this part is to explore Toronto

### Note that I had issues getting Foursquare Developer API credentials, so I will be using Folium instead.

In [122]:
#We first create a new data frame for this part that only contains the boroughs whose name includes 'Toronto'.
#.copy() makes df3 an independent frame rather than a slice of df2, so later cells can
#add or drop columns on it without SettingWithCopyWarning or ambiguity about df2.
df3 = df2[df2['Borough'].str.contains('Toronto',regex=False)].copy()
print(df3.shape)
df3.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [67]:
!pip install folium #You may need to install folium as I had to

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 8.8 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [68]:
#We need to create some maps, so we import the needed libraries
import folium
from IPython.display import Image 
#from IPython.core.display import HTML 
import matplotlib.cm as cm
import matplotlib.colors as colors

In [115]:
#Create a map of Toronto, centered on [43.6590, -79.3490] at zoom level 13
map_toronto = folium.Map(location=[43.6590,-79.3490],zoom_start=13)

#Add one circle marker per row of df3, with a "Neighborhood, Borough" popup
for lat,lng,borough,neighbourhood in zip(df3['Latitude'],df3['Longitude'],df3['Borough'],df3['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html belongs to the Popup; it is not a CircleMarker option,
    # so the stray parse_html=False previously passed to CircleMarker is dropped.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=10,
        popup=label,
        color='red',
        fill=True,
        fill_color='white',
        fill_opacity=0.7,
    ).add_to(map_toronto)

In [94]:
#Let's see our map (a bare last expression makes the notebook render the folium map inline)
map_toronto

### Now we do some clustering (using K-Means)

In [97]:
#First we import K-Means from sk-learn
from sklearn.cluster import KMeans

In [140]:
#Let's see what happens when k is only 1
k=1
#Cluster on the coordinates only. Selecting the two feature columns explicitly
#(instead of dropping the others with a positional axis argument, which newer
#pandas rejects) also keeps a leftover 'Cluster Labels' column out of the features.
toronto_k_df = df3[['Latitude','Longitude']]
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_k_df)
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [144]:
# Build an independent frame for k=1 instead of aliasing df3: drop() returns a new
# frame, so adding the labels here never mutates df3. errors='ignore' makes the cell
# safe to re-run whether or not df3 already carries a 'Cluster Labels' column.
df3_k1 = df3.drop(columns=['Cluster Labels'], errors='ignore')
# Append the labels as the last column
df3_k1.insert(len(df3_k1.columns), 'Cluster Labels', kmeans.labels_)
df3_k1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0


In [152]:
# create map
map_clusters1 = folium.Map(location=[43.6590,-79.3490],zoom_start=13)

# set color scheme for the clusters: k evenly spaced colors from the rainbow colormap, as hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map: one circle per row, outlined in its cluster's color
for lat, lon, cluster in zip(df3_k1['Latitude'], df3_k1['Longitude'], df3_k1['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color='white',
        fill_opacity=0.9).add_to(map_clusters1)

map_clusters1

In [148]:
#Let's see what happens when k is 3
k=3
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_k_df)

# Build an independent frame for k=3 instead of aliasing df3: drop() returns a new
# frame, so adding the labels here never mutates df3. errors='ignore' makes the cell
# safe to re-run whether or not df3 already carries a 'Cluster Labels' column.
df3_k3 = df3.drop(columns=['Cluster Labels'], errors='ignore')
# Append the labels as the last column
df3_k3.insert(len(df3_k3.columns), 'Cluster Labels', kmeans.labels_)
df3_k3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0


In [151]:
# create map
map_clusters3 = folium.Map(location=[43.6590,-79.3490],zoom_start=13)

# set color scheme for the clusters: k evenly spaced colors from the rainbow colormap, as hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map: one circle per row, outlined in its cluster's color
for lat, lon, cluster in zip(df3_k3['Latitude'], df3_k3['Longitude'], df3_k3['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color='white',
        fill_opacity=0.9).add_to(map_clusters3)

map_clusters3

In [153]:
#Let's see what happens when k is 5
k=5
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_k_df)

# Build an independent frame for k=5 instead of aliasing df3: drop() returns a new
# frame, so adding the labels here never mutates df3. errors='ignore' makes the cell
# safe to re-run whether or not df3 already carries a 'Cluster Labels' column.
df3_k5 = df3.drop(columns=['Cluster Labels'], errors='ignore')
# Append the labels as the last column
df3_k5.insert(len(df3_k5.columns), 'Cluster Labels', kmeans.labels_)

# create map
map_clusters5 = folium.Map(location=[43.6590,-79.3490],zoom_start=13)

# set color scheme for the clusters: k evenly spaced colors from the rainbow colormap, as hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map. Coordinates and labels are all read from df3_k5
# (the coordinates were previously read from df3_k3, a copy-paste slip).
for lat, lon, cluster in zip(df3_k5['Latitude'], df3_k5['Longitude'], df3_k5['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color='white',
        fill_opacity=0.9).add_to(map_clusters5)

map_clusters5

In [155]:
#Lastly, let's see k = 7
k=7
kmeans = KMeans(n_clusters = k,random_state=0).fit(toronto_k_df)

# Build an independent frame for k=7 instead of aliasing df3: drop() returns a new
# frame, so adding the labels here never mutates df3. errors='ignore' makes the cell
# safe to re-run whether or not df3 already carries a 'Cluster Labels' column.
df3_k7 = df3.drop(columns=['Cluster Labels'], errors='ignore')
# Append the labels as the last column
df3_k7.insert(len(df3_k7.columns), 'Cluster Labels', kmeans.labels_)

# create map
map_clusters7 = folium.Map(location=[43.6590,-79.3490],zoom_start=13)

# set color scheme for the clusters: k evenly spaced colors from the rainbow colormap, as hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map. Coordinates and labels are all read from df3_k7
# (the coordinates were previously read from df3_k3, a copy-paste slip).
for lat, lon, cluster in zip(df3_k7['Latitude'], df3_k7['Longitude'], df3_k7['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color='black',
        fill_opacity=0.9).add_to(map_clusters7)

map_clusters7