### Using BeautifulSoup to extract a table from a Wikipedia page and clean the data


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Download the Wikipedia page that holds the postal-code table
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page
r.raise_for_status()
data = r.text
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and to avoid GuessedAtParserWarning)
soup = BeautifulSoup(data, "html.parser")

In [2]:
# Extract the table we want
table = soup.find("table", class_="wikitable sortable")
# find() returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None
if table is None:
    raise ValueError('No <table class="wikitable sortable"> found in the page')

In [3]:
# Extract the table rows
# table.tbody is None when the parsed markup has no explicit <tbody>;
# in that case search for the <tr> elements in the table itself
table_body = table.tbody if table.tbody is not None else table
table_content = table_body.find_all("tr")

In [4]:
# Create DataFrame
columns = ["Postcode", "Borough", "Neighbourhood"]

# Skip the first <tr> (index 0) and turn every remaining row into a list of
# cell texts: get_text() separates the cells with "\n", so split on it and
# drop the empty strings that the split leaves behind.
records = []
for tr in table_content[1:]:
    cells = [cell for cell in tr.get_text().split("\n") if cell]
    # A row that does not match the expected columns is an error, not
    # something to pad or truncate silently
    if len(cells) != len(columns):
        raise ValueError(
            f"Expected {len(columns)} cells per row, got {len(cells)}: {cells!r}"
        )
    records.append(cells)

# Build the frame in one call; appending row by row with df.loc[len(df)]
# re-allocates the frame on every iteration
df = pd.DataFrame(records, columns=columns)


In [5]:
# View the DataFrame built from the scraped table rows
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


# COOL! Now we only need to clean the data a little bit

In [6]:
# Work on an independent copy so that the scraped DataFrame stays untouched
df_clean = df.copy(deep=True)

In [7]:
# Keep only the rows whose Borough is something other than "Not assigned"
has_borough = df_clean["Borough"] != "Not assigned"
df_clean = df_clean[has_borough]
df_clean.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [8]:
# Count the Neighbourhood entries for every (Postcode, Borough) pair
df_clean.groupby(by=["Postcode", "Borough"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,2
M1C,Scarborough,3
M1E,Scarborough,3
M1G,Scarborough,1
M1H,Scarborough,1
...,...,...
M9N,York,1
M9P,Etobicoke,1
M9R,Etobicoke,4
M9V,Etobicoke,8


In [9]:
# Collapse rows that share a Postcode and Borough into a single row whose
# Neighbourhood is the comma-joined list of the original values.
# sort=False keeps the groups in order of first appearance.
df_clean = (
    df_clean
    .groupby(["Postcode", "Borough"], sort=False)
    .agg(",".join)
    .reset_index()
)
df_clean

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."


In [10]:
# If a Neighbourhood is "Not assigned", set it to the name of its Borough.
# Use one boolean-mask .loc assignment on df_clean itself: assigning through
# df_clean.Neighbourhood.iloc[i] is chained indexing, which pandas may apply
# to a temporary object (SettingWithCopyWarning, and a no-op under
# Copy-on-Write), leaving df_clean unchanged.
unassigned = df_clean["Neighbourhood"] == "Not assigned"
df_clean.loc[unassigned, "Neighbourhood"] = df_clean.loc[unassigned, "Borough"]

In [11]:
# Display the final version of the cleaned DataFrame (df_clean)
df_clean

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout..."


In [12]:
# Show the (rows, columns) dimensions of the cleaned DataFrame
df_clean.shape

(103, 3)