# We will extract the table from the wikipedia page and filter the rows with the Not assigned borough

In [4]:
from bs4 import BeautifulSoup

In [5]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#### Getting the response from the wiki page using the requests package

In [6]:
import requests
response = requests.get(wiki_url)

In [8]:
data = response.text

## Initialize the Beautiful Soup instance to query the page

In [10]:
soup = BeautifulSoup(data)

#### Getting the first table body

In [30]:
table = soup.find("tbody")

#### Here we will iterate over the rows and extracting the postal code, borough and neighborhood
#### We will do the following in order to get the desired dataframe:
1. Ignore the rows that are Not assigned to any borough
2. Assign each value of the row to the correct list

In [85]:
postal_codes = []
boroughs = []
neigborhoods = []

for row in table:
    if row in ["\n", None]:
        continue
        
    heads = row.find_all("td")
    if not heads:
        continue
        
    borough = heads[1].text.strip()
    if borough == "Not assigned":
        continue

    postal_code = heads[0].text.strip()
    postal_codes.append(postal_code)
    boroughs.append(borough)
    _neigborhoods = heads[2].text.strip().replace("/ ", ",")
    neigborhoods.append(_neigborhoods)
    
    print("postal_code: {}, borough: {}, neigborhood: {}".format(postal_code, borough, _neigborhoods))
    print("-"*20)

postal_code: M3A, borough: North York, neigborhood: Parkwoods
--------------------
postal_code: M4A, borough: North York, neigborhood: Victoria Village
--------------------
postal_code: M5A, borough: Downtown Toronto, neigborhood: Regent Park ,Harbourfront
--------------------
postal_code: M6A, borough: North York, neigborhood: Lawrence Manor ,Lawrence Heights
--------------------
postal_code: M7A, borough: Downtown Toronto, neigborhood: Queen's Park ,Ontario Provincial Government
--------------------
postal_code: M9A, borough: Etobicoke, neigborhood: Islington Avenue
--------------------
postal_code: M1B, borough: Scarborough, neigborhood: Malvern ,Rouge
--------------------
postal_code: M3B, borough: North York, neigborhood: Don Mills
--------------------
postal_code: M4B, borough: East York, neigborhood: Parkview Hill ,Woodbine Gardens
--------------------
postal_code: M5B, borough: Downtown Toronto, neigborhood: Garden District ,Ryerson
--------------------
postal_code: M6B, boroug

### Construct the dataframe with the lists

In [88]:
import pandas as pd
frame = {
    "PostalCode": postal_codes,
    "Borough": boroughs,
    "Neighborhood": neigborhoods
}

In [89]:
df = pd.DataFrame(frame)

In [90]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park ,Harbourfront"
3,M6A,North York,"Lawrence Manor ,Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park ,Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern ,Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill ,Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District ,Ryerson"


In [91]:
df.shape

(103, 3)