## Install Beautiful Soup package

In [3]:
pip install bs4

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 2.9MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.8.2 bs4-0

## Import the packages necessary for the project

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Request the website and parse the data

In [165]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of silently parsing an error page.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
res = response.text

# Parse the HTML with BeautifulSoup's built-in 'html.parser' backend.
soup = BeautifulSoup(res, 'html.parser')

# First table whose class is "wikitable sortable" (None when no such table exists).
my_table = soup.find('table', {'class': 'wikitable sortable'})

In [172]:
# Collect the <td> cells of the postal-code table only. Searching the table
# instead of the whole document keeps cells that belong to unrelated page
# elements out of `links`.
if my_table is None:
    raise ValueError("Could not find the 'wikitable sortable' table on the page")
links = my_table.find_all('td')

## Data Manipulation 

In [196]:
# Only the first 287 rows are kept: later rows were found not to contain
# relevant data (presumably cells gathered from outside the postal-code
# table -- TODO confirm against how `links` is collected).
N_RELEVANT_ROWS = 287

# Text of every cell, with surrounding whitespace (e.g. the trailing newline
# of the last cell of a row) removed. strip() is used instead of chopping the
# last character so a cell without a trailing newline is not truncated.
cell_text = [cell.text.strip() for cell in links]

# The cells come in consecutive groups of three:
# postal code, borough, neighborhood. A trailing incomplete group is ignored
# so the three lists always have the same length.
n_rows = len(cell_text) // 3
PostalCode = cell_text[0:3 * n_rows:3]
Borough = cell_text[1:3 * n_rows:3]
Neighborhood = cell_text[2:3 * n_rows:3]

# Put the lists in a DataFrame (built in one go rather than column by column)
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})

# Remove the unnecessary rows that don't contain relevant data
df = df.iloc[:N_RELEVANT_ROWS]

# Only keep the rows that have an assigned borough
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
...,...,...,...
205,M8Z,Etobicoke,Kingsway Park South West
206,M8Z,Etobicoke,Mimico NW
207,M8Z,Etobicoke,The Queensway West
208,M8Z,Etobicoke,Royal York South West


In [197]:
# Collapse the rows to one per postal code: the neighborhoods that share a
# postal code are concatenated into a single comma-separated string.
neighborhoods_by_code = df.groupby('PostalCode')['Neighborhood']
grouped = neighborhoods_by_code.apply(','.join)
df2 = grouped.to_frame().reset_index()
df2

Unnamed: 0,PostalCode,Neighborhood
0,M1B,"Rouge,Malvern"
1,M1C,"Highland Creek,Rouge Hill,Port Union"
2,M1E,"Guildwood,Morningside,West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae
...,...,...
98,M9N,Weston
99,M9P,Westmount
100,M9R,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [198]:
# Build a PostalCode/Borough lookup holding each distinct pair once.
# It is joined with df2 afterwards to attach the corresponding Borough
# to each PostalCode.
code_and_borough = df.loc[:, ['PostalCode', 'Borough']]
df1 = code_and_borough.drop_duplicates().reset_index(drop=True)
df1

Unnamed: 0,PostalCode,Borough
0,M3A,North York
1,M4A,North York
2,M5A,Downtown Toronto
3,M6A,North York
4,M7A,Downtown Toronto
...,...,...
98,M8X,Etobicoke
99,M4Y,Downtown Toronto
100,M7Y,East Toronto
101,M8Y,Etobicoke


In [199]:
# Left-join the borough lookup (df1) onto the grouped neighborhoods (df2),
# then arrange the three required columns in their final order.
merged = df2.merge(df1, how='left', on='PostalCode')
df = merged[['PostalCode', 'Borough', 'Neighborhood']]
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [200]:
# Drop the rows whose neighborhood is 'Not assigned' and renumber the index.
has_neighborhood = df['Neighborhood'] != 'Not assigned'
df = df[has_neighborhood].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [190]:
# Shape of the final DataFrame, as a (rows, columns) tuple.
# NOTE(review): this cell's execution count (190) is lower than those of the
# cells above (196-200); re-run it to confirm the shape matches the current df.
df.shape

(103, 3)