# Applied Data Science Capstone

#### Stefan Ubovic

### Peer-graded assignment: Scraping postcodes from Wikipedia

In [1]:
import pandas as pd
import numpy as np

Download and read HTML using urllib:

In [2]:
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl

# NOTE: this relaxed SSL context is built here but is NOT passed to urlopen()
# below, so the request itself runs with urllib's default HTTPS settings.
# Pass `context=ctx` explicitly only if certificate errors must be ignored.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Use the response as a context manager so the underlying connection is closed
# as soon as the body has been read, instead of staying open until garbage
# collection.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as fhand:
    html = fhand.read().decode()

Parse HTML using BeautifulSoup:

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# Every <td> element on the page, in document order. This deliberately keeps
# cells from all tables, not just the postcode table; rows that are not
# postcodes are removed in the wrangling step.
cells = soup('td')

column_names = ['Post Code', 'Borough', 'Neighbourhood']
n_columns = len(column_names)

# Trailing whitespace (e.g. the newline that ends each cell) is stripped.
cell_texts = [cell.text.rstrip() for cell in cells]

# Number of complete rows. Floor division drops a trailing partial row, which
# would otherwise be written past the end of the frame and raise an error.
rows = len(cell_texts) // n_columns

# Build the frame once from row records instead of assigning cell by cell.
records = [
    cell_texts[start:start + n_columns]
    for start in range(0, rows * n_columns, n_columns)
]
neighbourhoods_raw = pd.DataFrame(records, columns=column_names)

neighbourhoods_raw

Unnamed: 0,Post Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
186,E,G,H
187,J,K,L
188,M,N,P
189,R,S,T


Wrangle dataframe: remove invalid values and unassigned boroughs

In [4]:
# Keep only rows whose first column looks like a postcode of the form "M3A"
# (letter M, a digit, an uppercase letter). Filtering on the pattern avoids
# depending on hard-coded row positions, which break (KeyError or junk rows
# left behind) whenever the number of scraped cells changes.
is_postcode = neighbourhoods_raw['Post Code'].astype(str).str.match(r'M\d[A-Z]$', na=False)

# Drop postcodes where the borough is not assigned.
has_borough = neighbourhoods_raw['Borough'] != 'Not assigned'

# .copy() makes the result an independent frame so later column assignments
# do not operate on a filtered view of neighbourhoods_raw.
neighbourhoods = neighbourhoods_raw[is_postcode & has_borough].copy()
neighbourhoods

Unnamed: 0,Post Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Check if any neighbourhoods are still unassigned:

In [5]:
# Confirm that no placeholder neighbourhood names survived the wrangling step.
has_unassigned = neighbourhoods['Neighbourhood'].eq('Not assigned').any()

message = 'Further wrangling required.' if has_unassigned else "There are no 'Not assigned' values."
print(message)

There are no 'Not assigned' values.


In [6]:
# Report how many postcodes remain after wrangling (one row per postcode).
assigned_count = len(neighbourhoods.index)
print('There are {} assigned postcodes in Toronto.'.format(assigned_count))

There are 103 assigned postcodes in Toronto.


### Peer-graded assignment: Downloading latitude and the longitude coordinates using Geocoder

Add columns for latitude and longitude to dataframe:

In [7]:
# Add empty coordinate columns. assign() returns a new frame rather than
# writing into the existing one, which avoids SettingWithCopyWarning when
# `neighbourhoods` was produced by filtering another frame. Re-running the
# cell simply resets both columns to NaN.
neighbourhoods = neighbourhoods.assign(Latitude=np.nan, Longitude=np.nan)
neighbourhoods

Unnamed: 0,Post Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",,
165,M4Y,Downtown Toronto,Church and Wellesley,,
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",,
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",,


Fill in latitude and longitude columns with values using geocoder:

In [12]:
import time

import geocoder # import geocoder

# Upper bound on lookups per postcode, and the pause between them, so that a
# service outage cannot hang the notebook in an endless tight loop.
MAX_GEOCODE_ATTEMPTS = 5
GEOCODE_RETRY_DELAY_SECONDS = 1.0


def geocode_postcode(postcode, max_attempts=MAX_GEOCODE_ATTEMPTS, delay=GEOCODE_RETRY_DELAY_SECONDS):
    """Look up coordinates for a Toronto postcode via the ArcGIS geocoder.

    Parameters
    ----------
    postcode : str
        Postcode to look up; queried as '<postcode>, Toronto, Ontario'.
    max_attempts : int
        Maximum number of requests to make before giving up.
    delay : float
        Seconds to wait between unsuccessful attempts.

    Returns
    -------
    The `latlng` value reported by the geocoder (indexed below as
    [latitude, longitude]), or None if every attempt came back empty.
    """
    for attempt in range(1, max_attempts + 1):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng
        # Treat None (and any empty result) as a failed lookup.
        if lat_lng_coords:
            return lat_lng_coords
        if attempt < max_attempts:
            time.sleep(delay)
    return None


for postcode in neighbourhoods['Post Code']:

    lat_lng_coords = geocode_postcode(postcode)

    if lat_lng_coords is None:
        # Leave the coordinates as NaN and report the failure instead of
        # retrying forever.
        print(postcode, '...', 'no coordinates after {} attempts'.format(MAX_GEOCODE_ATTEMPTS))
        continue

    # Write the coordinates into every row carrying this postcode.
    postcode_mask = neighbourhoods['Post Code'] == postcode
    neighbourhoods.loc[postcode_mask, 'Latitude'] = lat_lng_coords[0]
    neighbourhoods.loc[postcode_mask, 'Longitude'] = lat_lng_coords[1]

    print(postcode,'...','({},{})'.format(lat_lng_coords[0], lat_lng_coords[1]))

M3A ... (43.75188000000003,-79.33035999999998)
M4A ... (43.73042000000004,-79.31281999999999)
M5A ... (43.655140000000074,-79.36264999999997)
M6A ... (43.72321000000005,-79.45140999999995)
M7A ... (43.66449000000006,-79.39301999999998)
M9A ... (43.66277000000008,-79.52830999999998)
M1B ... (43.81153000000006,-79.19551999999999)
M3B ... (43.74929000000003,-79.36168999999995)
M4B ... (43.707940000000065,-79.31159999999994)
M5B ... (43.65736000000004,-79.37817999999999)
M6B ... (43.70799000000005,-79.44837999999999)
M9B ... (43.65279000000004,-79.55405999999994)
M1C ... (43.78564000000006,-79.15870999999999)
M3C ... (43.72184000000004,-79.34339999999997)
M4C ... (43.68970000000007,-79.30679999999995)
M5C ... (43.65143000000006,-79.37556999999998)
M6C ... (43.69211000000007,-79.43035999999995)
M9C ... (43.648900000000026,-79.57824999999997)
M1E ... (43.765750000000025,-79.17519999999996)
M4E ... (43.67703000000006,-79.29541999999998)
M5E ... (43.64531000000005,-79.37367999999998)
M6E ... (

In [13]:
# Display the frame again now that the geocoding loop has written the
# Latitude and Longitude columns.
neighbourhoods

Unnamed: 0,Post Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.75188,-79.33036
3,M4A,North York,Victoria Village,43.73042,-79.31282
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72321,-79.45141
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.39302
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65369,-79.51112
165,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38130
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63288,-79.48955
