In [5]:
!conda install -c anaconda beautifulsoup4

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.8.0       |           py36_0         147 KB  anaconda
    openssl-1.1.1              |       h7b6447c_0         5.0 MB  anaconda
    ca-certificates-2019.8.28  |                0         132 KB  anaconda
    certifi-2019.9.11          |           py36_0         154 KB  anaconda
    ------------------------------------------------------------
                                           Total:         5.4 MB

The following packages will be UPDATED:

    beautifulsoup4:  4.7.1-py36_1      --> 4.8.0-py36_0     anaconda
    ca-certificates: 2019.5.15-1       --> 2019.8.28-0      anaconda
    certifi:         2019.6.16-py36_1  --> 2019.9.11-py36_0 anaconda
    openssl:         1.1.1d-h7b

In [1]:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
from pandas.io.html import read_html
import numpy as np

## Scrape Wikipedia Page for Table containing Postcode, Borough, Neighbourhood

In [2]:
# Wikipedia article listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Read the body inside a `with` block so the HTTP connection is closed as soon
# as the download finishes. `page` holds the raw HTML as bytes, which
# BeautifulSoup(page, "lxml") accepts if the page ever needs manual parsing.
with urllib.request.urlopen(url) as response:
    page = response.read()

### Extract the table from the Wikipedia page into a DataFrame

In [3]:
# pandas downloads the article and returns every table that matches the class
# filter as a list of DataFrames; the first match is the postcode table.
wiki_tables = pd.read_html(url, attrs={"class": "wikitable sortable"})
df = wiki_tables[0]
print("Shape: ", df.shape)
df.head()


Shape:  (288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Preprocessing Data

### Drop rows with borough = 'Not assigned'

In [4]:
# Inspect the rows whose borough still carries the "Not assigned" placeholder.
borough_missing = df["Borough"] == "Not assigned"
df_NotAssigned = df.loc[borough_missing]
print("Shape: ", df_NotAssigned.shape)
df_NotAssigned.head()

Shape:  (77, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
9,M8A,Not assigned,Not assigned
13,M2B,Not assigned,Not assigned
20,M7B,Not assigned,Not assigned


In [5]:
# Remove every row whose borough is the "Not assigned" placeholder. The result
# is rebound to `df` instead of being dropped with inplace=True, so the cell is
# a plain assignment and no existing frame is mutated.
unassigned_rows = df.index[df["Borough"] == "Not assigned"]
df = df.drop(index=unassigned_rows)
print("Shape: ", df.shape)
df.head()

Shape:  (211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Replace 'Not assigned' neighbourhoods with the borough name

In [6]:
# Inspect the remaining rows whose neighbourhood is the "Not assigned" placeholder.
neighbourhood_missing = df["Neighbourhood"] == "Not assigned"
df_NotAssigned = df.loc[neighbourhood_missing]
print("Shape: ", df_NotAssigned.shape)
df_NotAssigned.head()

Shape:  (1, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
8,M7A,Queen's Park,Not assigned


In [7]:
# Rows whose neighbourhood is the "Not assigned" placeholder.
i = df["Neighbourhood"] == "Not assigned"
# On those rows take the borough name; every other row keeps its neighbourhood.
df["Neighbourhood"] = df["Neighbourhood"].mask(i, df["Borough"])
print("Shape: ", df.shape)
df.head(8)

Shape:  (211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue


### Combine neighbourhoods that share a postal code into a comma-separated list

In [29]:
# One row per (Postcode, Borough) pair: the neighbourhood names belonging to a
# pair are joined into a single ", "-separated string, and the group keys are
# turned back into ordinary columns.
df = (
    df.groupby(["Postcode", "Borough"])["Neighbourhood"]
    .agg(", ".join)
    .reset_index()
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [30]:
# (rows, columns) of df after the neighbourhoods were grouped per postcode.
df.shape

(103, 3)