### Import the modules

In [1]:
from bs4 import BeautifulSoup
import requests
import urllib.request
import time
import pandas as pd

### get the connection to the url and scrape the web page

Fetch the page, then extract the postal-code table; its CSS class was found by inspecting the site.

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the kernel indefinitely
response = requests.get(url, timeout=30)
# fail loudly on a 4xx/5xx answer instead of parsing an error page
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
# after inspecting the web site: the postal-code table carries the
# class attribute "wikitable sortable"
My_table = soup.find('table', {'class': 'wikitable sortable'})
# soup.find returns None when nothing matches; stop here with a clear message
# rather than letting a later cell fail with an AttributeError on None
if My_table is None:
    raise ValueError("no table with class 'wikitable sortable' found at " + url)

### get the table headers

In [3]:
# Collect the column names from the table's <th> cells:
# newlines become spaces, then surrounding whitespace is trimmed
t_headers = [
    header_cell.text.replace('\n', ' ').strip()
    for header_cell in My_table.find_all("th")
]
print(t_headers)

['Postcode', 'Borough', 'Neighbourhood']


### get the table data and zip each row to the headers

In [4]:
# Build one dict per <tr> of the table body, keyed by column header, e.g.
# {'Postcode': '', 'Borough': '', 'Neighbourhood': ''}
table_data = []
for row_tag in My_table.tbody.find_all("tr"):
    cell_tags = row_tag.find_all("td")
    # zip stops at the shorter sequence, so a row that has no <td> cells
    # (for instance one made only of <th> cells) yields an empty dict
    record = {
        header: cell.text.replace('\n', '').strip()
        for cell, header in zip(cell_tags, t_headers)
    }
    table_data.append(record)

### convert to pandas DF and clean the data

In [5]:
# Convert the scraped rows to a DataFrame.
# Rows without any <td> cell were stored as empty dicts (the header row is one);
# filter them out explicitly instead of relying on the header being at index 0.
postcodes_raw = pd.DataFrame([row for row in table_data if row], columns=t_headers)

# Drop all rows where the Borough column is 'Not assigned'.
# .copy() gives an independent frame, so the assignment below does not write
# into a slice of postcodes_raw (avoids SettingWithCopyWarning).
postcodes_assigned = postcodes_raw.loc[postcodes_raw['Borough'] != 'Not assigned'].copy()

# Fix the 'Not assigned' issue in the Neighbourhood column:
# such rows take the name of their borough instead.
unnamed = postcodes_assigned['Neighbourhood'] == 'Not assigned'
postcodes_assigned.loc[unnamed, 'Neighbourhood'] = postcodes_assigned.loc[unnamed, 'Borough']

# One row per (Postcode, Borough): join the neighbourhood names with a comma,
# turn the group keys back into columns, sort by postcode and renumber the index.
df_data_table = (
    postcodes_assigned
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
    .sort_values(by='Postcode')
    .reset_index(drop=True)
)

df_data_table.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [6]:
# (number of rows, number of columns) of the cleaned table
df_data_table.shape

(103, 3)