1. Import libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore")

2. Get the data from the URL

In [2]:
# Download the Wikipedia page. A timeout keeps the cell from hanging forever,
# and raise_for_status() fails fast on an HTTP error instead of silently
# parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# Locate the table to scrape; raise a clear error if it cannot be found
# rather than failing later with an AttributeError on None.
table = soup.find('table',{'class':'wikitable sortable'})
if table is None:
    raise ValueError("Table with class 'wikitable sortable' not found on the page")
table_rows = table.find_all('tr')

# Collect the stripped text of every non-empty <td> cell, one list per row.
# Rows that yield no <td> text (e.g. rows made only of <th> cells) are skipped.
res = []
for tr in table_rows:
    cell_texts = [td.text.strip() for td in tr.find_all('td')]
    row = [text for text in cell_texts if text]
    if row:
        res.append(row)

3. Convert the data to a DataFrame

In [3]:
# Build the raw table from the scraped rows (one list per row), then preview it.
column_names = ["Postcode", "Borough", "Neighborhood"]
df = pd.DataFrame(data=res, columns=column_names)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# Dimensions of the raw scraped table as (rows, columns).
df.shape

(287, 3)

4. Create a new DataFrame that excludes rows whose Borough is "Not assigned".

In [5]:
# Keep only the rows that have an assigned borough. The explicit .copy()
# makes `data` an independent frame, so later assignments modify it reliably
# instead of targeting a temporary slice of `df`.
data = df[df['Borough']!="Not assigned"].copy()
data.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [6]:
# Dimensions of the filtered table as (rows, columns).
data.shape

(210, 3)

In [7]:
# Summarize the index range, column dtypes and non-null counts of the filtered table.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 2 to 285
Data columns (total 3 columns):
Postcode        210 non-null object
Borough         210 non-null object
Neighborhood    210 non-null object
dtypes: object(3)
memory usage: 6.6+ KB


5. Check for "Not assigned" values in the Neighborhood column.

In [8]:
# Show the rows that have a borough but whose neighborhood is still "Not assigned".
data.loc[data['Neighborhood'].eq("Not assigned")]

Unnamed: 0,Postcode,Borough,Neighborhood
7,M7A,Queen's Park,Not assigned


In [9]:
# Inspect every row for postcode M7A.
data.loc[data['Postcode'].eq("M7A")]

Unnamed: 0,Postcode,Borough,Neighborhood
7,M7A,Queen's Park,Not assigned


6. Replace "Not assigned" neighborhoods with the borough name and review the result

In [10]:
# Where a row's neighborhood is "Not assigned", fall back to its borough name.
# A single .loc assignment is used instead of chained indexing
# (data['Neighborhood'][mask] = ...), which writes to an intermediate object
# and is not guaranteed to update `data`.
not_assigned = data['Neighborhood'] == "Not assigned"
data.loc[not_assigned, 'Neighborhood'] = data.loc[not_assigned, 'Borough']

In [11]:
# Look up postcode M7A again to review its Neighborhood value.
data.loc[data['Postcode'].eq("M7A")]

Unnamed: 0,Postcode,Borough,Neighborhood
7,M7A,Queen's Park,Queen's Park


In [12]:
# Preview the first rows of the table.
data.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


7. Group neighborhoods that share a postcode and borough into one row

In [13]:
# Collapse rows sharing a postcode and borough into a single row whose
# Neighborhood is the comma-separated list of the group's neighborhoods.
neighborhoods_by_area = data.groupby(["Postcode", "Borough"])['Neighborhood']
data_groupby = neighborhoods_by_area.agg(','.join).reset_index()
data_groupby.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# Spot-check the grouped row for postcode M6A.
data_groupby.loc[data_groupby['Postcode'].eq("M6A")]

Unnamed: 0,Postcode,Borough,Neighborhood
71,M6A,North York,"Lawrence Heights,Lawrence Manor"


In [15]:
# Dimensions of the grouped table as (rows, columns).
data_groupby.shape

(103, 3)