
Step 1: Scrape the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [203]:

# import packages

from bs4 import BeautifulSoup
import requests
import pandas as pd


In [204]:
# Download the Wikipedia page and keep its HTML source as a string.
url_can = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# requests applies no timeout by default, so an unresponsive server would hang
# this cell indefinitely; cap the wait at 30 seconds.
response_can = requests.get(url_can, timeout=30)

# Fail here on a 4xx/5xx reply instead of parsing an error page further down.
response_can.raise_for_status()

str_can = response_can.text
type(str_can)

str

In [205]:
# Parse the downloaded HTML string into a BeautifulSoup tree using the lxml parser.
html_can = BeautifulSoup(markup=str_can, features='lxml')
type(html_can)

bs4.BeautifulSoup








Step 2: Extract and save targeted data into a dataframe 


In [206]:

# Grab the first <table> element on the page that carries the CSS class 'wikitable'.
neighborhoods = html_can.find(name='table', attrs={'class': 'wikitable'})

In [207]:
# Collect every <tr> element (table row) found inside the selected table.
neigh = neighborhoods.find_all(name='tr')

In [208]:
# The table's first row is its header, made of three <th> cells:
#   Postcode | Borough | Neighbourhood
# The DataFrame keeps the first two names and spells the third 'Neighborhood'.
heads = ['Postcode','Borough','Neighborhood']

# Turn every <tr> into one record. A row's text is expected to look like
# "\n<postcode>\n<borough>\n<neighbourhood>\n", so splitting on '\n' and dropping
# the empty first and last pieces leaves the three cell values.
# Records are gathered in a list and the DataFrame is built once at the end:
# DataFrame.append copied the whole frame on every call (quadratic work) and
# was removed in pandas 2.0.
records = []
for row in neigh:
    info = row.text.split('\n')[1:-1]
    records.append({'Postcode': info[0], 'Borough': info[1], 'Neighborhood': info[2]})

# The header row is still present as row 0 at this point.
df_can = pd.DataFrame(records, columns=heads)

df_can.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [209]:
# Remove the row labelled 0: it is the website's column heading, not data.
# Dropping by label with errors='ignore' (instead of slicing off the first
# position) keeps the cell idempotent: running it a second time no longer
# discards a real data row, because label 0 is simply absent by then.
# This relies on the default 0..n-1 index the frame was built with.
df_can = df_can.drop(index=0, errors='ignore')
df_can.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront





Step 3: Data Cleaning

For this part, I carried out three steps:
first, I deleted all rows with no Borough assigned;
second, for the remaining rows with no Neighborhood assigned, I set the Neighborhood to the Borough;
third, I combined the rows that share the same postcode into a single row.

In [210]:
# Keep only the rows whose Borough was actually assigned.
# A boolean filter returns a new frame, which replaces the
# "look up index labels, then drop(inplace=True)" round trip and avoids
# mutating in place a frame that was itself produced by slicing.
has_borough = df_can['Borough'] != 'Not assigned'
df_can = df_can[has_borough]

df_can.shape

(211, 3)

In [211]:
# Where a row's Neighborhood is 'Not assigned', use its Borough name as the
# Neighborhood instead.
# The rows yielded by DataFrame.iterrows() may be copies, so assigning to them is
# not guaranteed to change df_can; compute the column in one vectorized step and
# bind the result to a new frame.
nbhd_missing = df_can['Neighborhood'] == 'Not assigned'
df_can = df_can.assign(
    Neighborhood=df_can['Neighborhood'].mask(nbhd_missing, df_can['Borough'])
)

df_can.shape

(211, 3)

In [212]:
# Collapse rows sharing the same (Postcode, Borough) pair into a single row whose
# Neighborhood is the comma-separated list of that pair's neighborhoods.
nbhd_by_code = df_can.groupby(['Postcode', 'Borough'])['Neighborhood']
df = nbhd_by_code.agg(', '.join).reset_index()

In [213]:
# Preview the first ten rows of the grouped table.
df.head(n=10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [214]:
# Report the (rows, columns) shape of the cleaned, grouped table.
final_shape = df.shape
print('The shape of the processed dataframe is: ', final_shape)

The shape of the processed dataframe is:  (103, 3)
