In [1]:
from io import StringIO

import geocoder
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Pre-Processing

Getting the data

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
res.raise_for_status()

# The postal-code table is taken to be the first <table> element on the page.
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]

# read_html wants a path, URL or file-like object; passing a raw HTML string is
# deprecated in recent pandas, so the markup is wrapped in StringIO.
df = pd.read_html(StringIO(str(table)))[0]

 

In [3]:
# Show the first 15 rows of the scraped table.
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [4]:
# (rows, columns) of the table before any grouping or filtering.
df.shape

(287, 3)

Checking if some postcodes do appear multiple times as described

In [5]:
# Number of rows per postcode; a count above 1 means the postcode is listed on several rows.
df['Postcode'].value_counts()

M9V    8
M8Y    8
M5V    7
M9B    5
M4V    5
      ..
M5E    1
M7A    1
M8P    1
M1S    1
M2V    1
Name: Postcode, Length: 180, dtype: int64

We see that it is indeed the case

So let's group the neighbourhoods by Postcode and Borough

In [6]:
# Collapse all rows sharing a (Postcode, Borough) pair into one entry whose
# neighbourhood names are joined into a single comma-separated string.
df_grouped = (
    df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
)
df_grouped.head()

Postcode  Borough     
M1A       Not assigned                              Not assigned
M1B       Scarborough                             Rouge, Malvern
M1C       Scarborough     Highland Creek, Rouge Hill, Port Union
M1E       Scarborough          Guildwood, Morningside, West Hill
M1G       Scarborough                                     Woburn
Name: Neighbourhood, dtype: object

Selecting the single `Neighbourhood` column after `groupby` and aggregating it **returns** a *Pandas Series* indexed by Postcode and Borough, **so let's convert it back** to a *Pandas DataFrame*.

(There is probably a better way to do this transformation)

In [7]:
# Series.reset_index() moves the (Postcode, Borough) index levels back into
# ordinary columns and returns a DataFrame, so neither a DataFrame() wrapper
# nor an in-place mutation is needed.
df = df_grouped.reset_index()
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn


We want to remove the rows that have neither a Borough nor a Neighbourhood assigned. Let's create a concatenation of those two columns to identify those rows more easily.

In [8]:
# Helper column holding "<Borough>, <Neighbourhood>" as one string, so rows where
# both fields are unassigned can be matched with a single comparison.
borough = df['Borough']
neighbourhood = df['Neighbourhood']
df['Borough-Neigh'] = borough + ', ' + neighbourhood
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Borough-Neigh
0,M1A,Not assigned,Not assigned,"Not assigned, Not assigned"
1,M1B,Scarborough,"Rouge, Malvern","Scarborough, Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union","Scarborough, Highland Creek, Rouge Hill, Port ..."
3,M1E,Scarborough,"Guildwood, Morningside, West Hill","Scarborough, Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn,"Scarborough, Woburn"


Now we can remove the Postcodes that have neither a Borough nor a Neighbourhood assigned.

In [9]:
# Keep every row except those whose Borough and Neighbourhood are both unassigned.
is_unassigned = df['Borough-Neigh'].eq('Not assigned, Not assigned')
df = df[~is_unassigned]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Borough-Neigh
1,M1B,Scarborough,"Rouge, Malvern","Scarborough, Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union","Scarborough, Highland Creek, Rouge Hill, Port ..."
3,M1E,Scarborough,"Guildwood, Morningside, West Hill","Scarborough, Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn,"Scarborough, Woburn"
5,M1H,Scarborough,Cedarbrae,"Scarborough, Cedarbrae"
6,M1J,Scarborough,Scarborough Village,"Scarborough, Scarborough Village"
7,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park","Scarborough, East Birchmount Park, Ionview, Ke..."
8,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge","Scarborough, Clairlea, Golden Mile, Oakridge"
9,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West","Scarborough, Cliffcrest, Cliffside, Scarboroug..."
10,M1N,Scarborough,"Birch Cliff, Cliffside West","Scarborough, Birch Cliff, Cliffside West"


Now, let's drop the artificial column

In [10]:
# Reassign the result instead of using drop(inplace=True): `df` comes from a
# boolean filter, and mutating such a frame in place can raise
# SettingWithCopyWarning in pandas versions without copy-on-write.
df = df.drop(columns=['Borough-Neigh'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


In [11]:
# (rows, columns) after grouping by postcode and removing the unassigned rows.
df.shape

(103, 3)