In [1]:
!pip install BeautifulSoup4
!pip install requests



# 一、Get Data 

1. Get HTML from wikipedia  
2. Use BeautifulSoup to parse html data  
3. Store parsed data into Pandas DataFrame.

In [2]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd 
import numpy as np

In [3]:
# get html from wiki-page and create soup object 
# BeautifulSoup: pulling data out of HTML and XML files 
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(source.text, 'lxml')   # lxml’s HTML parser

In [5]:
# get data from html page & store it into a list 
data = []
columns = []
table = soup.find(class_ = 'wikitable')
for index, tr in enumerate (table.find_all('tr')):
    section = []
    for td in tr.find_all(['th','td']):
        section.append(td.text.rstrip())
        
        # First row of data is the header 
        if (index == 0):
            columns = section
        else:
            data.append(section)
        
#convert list into Pandas DataFrame
canada_df = pd.DataFrame(data = data, columns = columns)

In [7]:
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M1A,Not assigned,Not assigned
2,M1A,Not assigned,Not assigned
3,M2A,Not assigned,Not assigned
4,M2A,Not assigned,Not assigned


# 二、Data Cleaning 

1. Remove 'Boroughs' that are 'Not Assigned'  
2. More than one neighborhood can exist in one postal code area, combined these into one row with the neighborhoods separated with a comma.  
3. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [8]:
# Remove 'Boroughs' that are 'Not Assigned'
canada_df = canada_df[canada_df['Borough'] != 'Not assigned' ]
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
6,M3A,North York,Parkwoods
7,M3A,North York,Parkwoods
8,M3A,North York,Parkwoods
9,M4A,North York,Victoria Village
10,M4A,North York,Victoria Village


In [12]:
# More than one neighborhood can exist in one postal code area, combined these into one row  
# with the neighborhoods separated with a comma.
canada_df['Neighborhood'] = canada_df.groupby('Postcode')['Neighborhood'].transform(lambda neigh:','.join(neigh))

# Remove duplicates 
canada_df = canada_df.drop_duplicates()

# Update index to be postcode if it isn't already 
if(canada_df.index.name != 'Postcode'):
    canada_df = canada_df.set_index('Postcode')
canada_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,"Parkwoods,Parkwoods,Parkwoods"
M4A,North York,"Victoria Village,Victoria Village,Victoria Vil..."
M5A,Downtown Toronto,"Harbourfront,Harbourfront,Harbourfront"
M6A,North York,"Lawrence Heights,Lawrence Heights,Lawrence Hei..."
M7A,Downtown Toronto,"Queen's Park,Queen's Park,Queen's Park"


In [13]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
canada_df['Neighborhood'].replace('Not assigned', canada_df['Borough'], inplace = True)
canada_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,"Parkwoods,Parkwoods,Parkwoods"
M4A,North York,"Victoria Village,Victoria Village,Victoria Vil..."
M5A,Downtown Toronto,"Harbourfront,Harbourfront,Harbourfront"
M6A,North York,"Lawrence Heights,Lawrence Heights,Lawrence Hei..."
M7A,Downtown Toronto,"Queen's Park,Queen's Park,Queen's Park"


In [15]:
canada_df.shape

(103, 2)