# Notebook for "Applied Data Science Capstone"

In [1]:
import pandas as pd
import numpy as np
# Smoke test: prints a greeting once the imports above have run.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Scrape neighbourhoods

In [2]:
!pip install lxml



In [38]:
# Parse the HTML tables on the Wikipedia page and keep the first one,
# which holds the postal code / borough / neighbourhood listing.
df = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)[0]

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [39]:
# Keep only the rows whose borough is assigned. The explicit .copy() makes
# the result an independent frame, so later in-place assignments on it
# (e.g. through .loc) do not raise pandas' SettingWithCopyWarning.
df = df[df["Borough"] != "Not assigned"].copy()

If a cell has a borough but its neighbourhood is "Not assigned", the neighbourhood is set to the borough name.

In [40]:
# Replace the "Not assigned" placeholder with the borough name of the same row.
df.loc[
    df["Neighbourhood"].eq("Not assigned"),  # rows still carrying the placeholder
    "Neighbourhood",
] = df["Borough"]  # aligned on the index, so each row receives its own borough

More than one neighborhood can exist in one postal code area.

For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.

These two rows will be combined into one row with the neighborhoods separated with a comma.

In [41]:
# Collapse rows that share a postal code and borough into a single row;
# the values of the remaining column(s) are joined with ", ".
df = (
    df.groupby(["Postal Code", "Borough"])
    .agg(lambda names: ", ".join(names))
    .reset_index()
)

In [42]:
# Show the first rows of the grouped table, then its (rows, columns) shape.
display(df.head(n=5))
print(df.shape)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


(103, 3)


## Get coordinates for each neighbourhood

In [45]:
!pip install geocoder



In [46]:
import geocoder

In [48]:
def get_coords(postal_code, max_attempts=10, retry_delay=1.0):
    """Look up the coordinates of a Toronto postal code with geocoder.

    Queries ``geocoder.google`` for "<postal_code>, Toronto, Ontario" and
    retries while the response carries no coordinates. The number of
    attempts is bounded so that a service which never answers cannot hang
    the notebook in an endless loop.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. "M5G".
    max_attempts : int, optional
        Maximum number of requests to make before giving up (default 10).
    retry_delay : float, optional
        Seconds to pause between two failed attempts (default 1.0).

    Returns
    -------
    The ``latlng`` attribute of the first response where it is not None
    (presumably a [latitude, longitude] pair -- confirm against the
    geocoder documentation).

    Raises
    ------
    RuntimeError
        If none of the ``max_attempts`` requests returned coordinates.
    """
    import time  # local import: only needed for the pause between retries

    query = '{}, Toronto, Ontario'.format(postal_code)

    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # Pause before the next request, but not after the final failure.
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        "No coordinates returned for {!r} after {} attempt(s)".format(
            query, max_attempts
        )
    )

In [50]:
# Call left disabled: the geocoder lookup didn't work out (see the note below).
# get_coords("M5G")

Since the geocoder lookup didn't work out, the coordinates are read from the provided CSV file (`Geospatial_Coordinates.csv`) instead.

In [51]:
# Load the provided postal code -> latitude/longitude table and preview it.
codes_csv = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
codes_csv.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [53]:
# Attach the coordinates to each postal code. Both frames name the key
# column "Postal Code", so a single `on=` replaces the left_on/right_on pair.
df_wc = df.merge(codes_csv, how="inner", on="Postal Code")

# Preview the merged table and report its (rows, columns) shape.
display(df_wc.head(n=5))
print(df_wc.shape)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


(103, 5)
