# Notebook for "Applied Data Science Capstone"

In [1]:
import pandas as pd
import numpy as np
# Print the course greeting as the first cell's output.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Scrape neighbourhoods

In [3]:
# read_html returns a list with every table parsed from the page; keep the first.
wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df = wiki_tables[0]

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [4]:
# Keep only the rows whose borough is assigned. The explicit copy makes `df`
# an independent frame, so the in-place `.loc` assignment that follows does not
# operate on a slice of the scraped table (avoids SettingWithCopyWarning).
df = df[df["Borough"] != "Not assigned"].copy()

If a cell has a borough but its neighbourhood is "Not assigned", then the neighbourhood will be the same as the borough.

In [5]:
# Rows without a neighbourhood name inherit the name of their borough.
lacks_neighbourhood = df["Neighbourhood"].eq("Not assigned")
df.loc[lacks_neighbourhood, "Neighbourhood"] = df["Borough"]

More than one neighborhood can exist in one postal code area.

For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.

These two rows will be combined into one row with the neighborhoods separated with a comma.

In [6]:
# Collapse rows that share a postal code and borough into a single row whose
# remaining columns are joined with ", ".
df = (
    df.groupby(["Postal Code", "Borough"])
    .agg(lambda names: ", ".join(names.values))
    .reset_index()
)

In [7]:
# Show the first five rows, then the (rows, columns) shape.
display(df.head(5))
print(df.shape)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


(103, 3)


## Get coordinates for each neighbourhood

In [9]:
import geocoder

In [10]:
def get_coords(postal_code, max_attempts=10, geocode=None):
    """Look up the [latitude, longitude] pair for a Toronto postal code.

    Parameters
    ----------
    postal_code : str
        Postal code to look up; it is formatted into the query
        ``'<postal_code>, Toronto, Ontario'``.
    max_attempts : int, optional
        Maximum number of geocoding requests to make before giving up.
        Bounding the retries keeps the cell from hanging forever when the
        service never returns a result.
    geocode : callable, optional
        Function that takes the query string and returns an object exposing a
        ``latlng`` attribute. Defaults to ``geocoder.google``.

    Returns
    -------
    The ``latlng`` value of the first response for which it is not ``None``.

    Raises
    ------
    RuntimeError
        If no attempt yields coordinates.
    """
    if geocode is None:
        # Resolve the default lazily so callers can supply their own backend.
        geocode = geocoder.google

    query = '{}, Toronto, Ontario'.format(postal_code)

    # retry until coordinates come back or the attempt budget is used up
    for _ in range(max_attempts):
        lat_lng_coords = geocode(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

In [11]:
# get_coords("M5G")

Since the geocoder lookup didn't work out, use the provided CSV file of coordinates instead.

In [12]:
# Load the provided table of coordinates per postal code and preview it.
codes_csv = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
codes_csv.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Attach the coordinates to each postal code row, joining on the shared
# "Postal Code" column, then preview the result and its shape.
df_wc = pd.merge(df, codes_csv, on="Postal Code")
display(df_wc.head(5))
print(df_wc.shape)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


(103, 5)


In [14]:
# Persist the merged table without writing the row index as a column.
df_wc.to_csv(path_or_buf="toronto_neighbourhoods.csv", index=False)

## Explore the neighbourhood data

In [15]:
# Plain assignment: `neighbourhoods` is another name for the same DataFrame
# object as `df_wc`, not a copy.
neighbourhoods = df_wc

In [16]:
# Report the number of distinct boroughs and the number of rows in the frame.
borough_count = len(neighbourhoods['Borough'].unique())
row_count = neighbourhoods.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.


In [18]:
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [19]:
# Geocode the city itself; the resulting latitude/longitude centre the maps.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; fail with a clear
    # message instead of an AttributeError on `.latitude` below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [20]:
# create map of Toronto centred on the geocoded latitude and longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row, with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, neighbourhood in zip(
    neighbourhoods['Latitude'],
    neighbourhoods['Longitude'],
    neighbourhoods['Borough'],
    neighbourhoods['Neighbourhood'],
):
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html=True makes the popup treat the label as text rather than markup
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

## Cluster neighbourhoods

In [22]:
from sklearn.cluster import KMeans

In [23]:
# number of clusters to fit
kclusters = 5

# feature matrix: the two coordinate columns plus one indicator column per borough
coordinate_features = neighbourhoods[["Latitude", "Longitude"]]
borough_indicators = pd.get_dummies(neighbourhoods["Borough"])
neighbourhoods_clustering = pd.concat([coordinate_features, borough_indicators], axis=1)
display(neighbourhoods_clustering)

# fit k-means with a fixed random_state
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(neighbourhoods_clustering)

# show the cluster labels assigned to the first ten rows
kmeans.labels_[:10]

Unnamed: 0,Latitude,Longitude,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,Toronto/York,West Toronto,York
0,43.806686,-79.194353,0,0,0,0,0,0,0,1,0,0,0
1,43.784535,-79.160497,0,0,0,0,0,0,0,1,0,0,0
2,43.763573,-79.188711,0,0,0,0,0,0,0,1,0,0,0
3,43.770992,-79.216917,0,0,0,0,0,0,0,1,0,0,0
4,43.773136,-79.239476,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,43.706876,-79.518188,0,0,0,0,0,0,0,0,0,0,1
99,43.696319,-79.532242,0,0,0,0,1,0,0,0,0,0,0
100,43.688905,-79.554724,0,0,0,0,1,0,0,0,0,0,0
101,43.739416,-79.588437,0,0,0,0,1,0,0,0,0,0,0


array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [24]:
# Work on a copy so that `neighbourhoods` keeps its original columns.
neighbourhoods_clusters = neighbourhoods.copy()
# Put each row's k-means label in the leading column.
neighbourhoods_clusters.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)

neighbourhoods_clusters.head(5)  # the labels appear as the first column

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,2,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,2,M1G,Scarborough,Woburn,43.770992,-79.216917
4,2,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [26]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [27]:
# create map centred on the geocoded city coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(
    neighbourhoods_clusters['Latitude'],
    neighbourhoods_clusters['Longitude'],
    neighbourhoods_clusters['Neighbourhood'],
    neighbourhoods_clusters['Cluster Labels'],
):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels start at 0, so they index the palette directly
    # (no -1 offset that would wrap cluster 0 around to the last colour).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters