In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import requests
from bs4 import BeautifulSoup

In [12]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fail fast: the timeout keeps the cell from hanging indefinitely, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
result = response.text
soup = BeautifulSoup(result,'lxml')
# NOTE: find() returns None when nothing matches; the cells below re-select
# the table themselves using class_='wikitable'.
table = soup.find('table',{'class':'wikitable plainrowheaders'})

In [14]:
# Preview the table: walk every <tr> of the first 'wikitable' and print three
# values per row. [1:] drops the first <tr> so the header row is not printed.
for items in soup.find('table', class_='wikitable').find_all('tr')[1:]:
    data = items.find_all(['th','td'])
    try:
        country = data[0].a.text
        title = data[1].a.text
        name = data[1].a.find_next_sibling().text
    except (IndexError, AttributeError):
        # The row has fewer than two cells, or a cell lacks the expected <a>
        # link / following sibling. Skip it rather than printing values left
        # over from the previous row (or failing with NameError on the first).
        continue
    print("{}|{}|{}".format(country,title,name))

None


In [109]:
# Select the first table on the page that carries the 'wikitable' class.
table = soup.find('table', class_='wikitable')
# Collect the text of every <td> in the table, in document order, with the
# surrounding whitespace stripped. A single pass over the table is enough;
# repeating the scan once per <tr> would only rebuild the same list.
cells = [cell.text.strip() for cell in table.find_all('td')]
# The flat list is read as consecutive (postal_code, borough, neighbourhood)
# triples, so each column is every third value starting at offset 0, 1 and 2.
# pandas raises ValueError if the three slices end up with different lengths.
df = pd.DataFrame({
    'postal_code': cells[::3],
    'borough': cells[1::3],
    'neighbourhood': cells[2::3],
})
df.head()

Unnamed: 0,postal_code,borough,neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [130]:
# Keep only the rows whose borough is something other than 'Not assigned'.
df = df.loc[df['borough'].ne('Not assigned')]
# Per-column check: True would mean a 'Not assigned' value is still present.
df.eq('Not assigned').any()

postal_code      False
borough          False
neighbourhood    False
dtype: bool

In [178]:
# Build one comma-joined neighbourhood string per postal code.
dfjoin = df.groupby('postal_code')['neighbourhood'].apply(','.join)
# Turn the grouped Series back into a two-column frame and rename the value
# column so it does not clash with df's own 'neighbourhood' column.
df2 = dfjoin.reset_index().rename(columns={'neighbourhood': 'neighbourhood_joined'})
# Attach the joined string to every original row (join key stated explicitly),
# drop the per-row neighbourhood column, then collapse the duplicate rows
# that this leaves behind.
df_merged = (
    pd.merge(df, df2, on='postal_code')
    .drop(columns=['neighbourhood'])
    .drop_duplicates()
)
df_merged.head()

Unnamed: 0,postal_code,borough,neighbourhood_joined
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [179]:
# Distinct values per column, shown next to the (rows, columns) shape of df.
(df.nunique(), df.shape)

(postal_code      103
 borough          10 
 neighbourhood    99 
 dtype: int64,
 (103, 3))

In [181]:
# Distinct values per column of df_merged, shown next to the shape of df2
# (the per-postal-code frame) -- note this is not the shape of df_merged.
(df_merged.nunique(), df2.shape)

(postal_code             103
 borough                 10 
 neighbourhood_joined    99 
 dtype: int64,
 (103, 2))