## Part 1. Gathering the data from the Wikipedia page and pre-processing it

In [1]:
#import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
#let's download the html code of wiki webpage

# a timeout keeps the cell from hanging forever if the server never answers
wiki_page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# fail loudly on a 4xx/5xx answer instead of parsing an error page as if it were the article
wiki_page.raise_for_status()
soup = BeautifulSoup(wiki_page.content, 'html.parser')

In [42]:
#all the table data are in 'tr' elements, so get them all first

table_rows = soup.find_all('tr')

#create an empty list to fill it with the data: one [postcode, borough, neighbourhood] list per kept row

result = []

#extract the text of all the non-empty 'td' elements of each 'tr'. A row is kept only when:
#  - it has exactly three cells (so it fits the three DataFrame columns and a short row cannot raise IndexError),
#  - its first cell starts with "M" (to be sure that this is a postal code),
#  - its borough is not 'Not assigned'.

for tr in table_rows:
    row = [td.text.strip() for td in tr.find_all('td') if td.text.strip()]
    if len(row) != 3:
        continue

    postcode, borough, neighbourhood = row
    if not postcode.startswith('M') or borough == 'Not assigned':
        continue

    #if a cell has a borough but a 'Not assigned' neighbourhood, the neighbourhood takes the borough name
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    result.append([postcode, borough, neighbourhood])

#creating the dataframe with the data we got so far

toronto_df = pd.DataFrame(result, columns=["Postcode", "Borough", "Neighbourhood"])

In [43]:
#sanity check: the (rows, columns) tuple first, then the first five rows of the DF

print(toronto_df.shape, toronto_df.head(), sep='\n')

(211, 3)
  Postcode           Borough     Neighbourhood
0      M3A        North York         Parkwoods
1      M4A        North York  Victoria Village
2      M5A  Downtown Toronto      Harbourfront
3      M5A  Downtown Toronto       Regent Park
4      M6A        North York  Lawrence Heights


In [44]:
#distinct postcodes of toronto_df, kept in the order in which they first appear

postcodes = pd.unique(toronto_df["Postcode"])

#empty DF with the two target columns, to hold one row per unique postcode

new_df = pd.DataFrame(columns=["Postcode", "Neighbourhood"])

print(postcodes)

['M3A' 'M4A' 'M5A' 'M6A' 'M7A' 'M9A' 'M1B' 'M3B' 'M4B' 'M5B' 'M6B' 'M9B'
 'M1C' 'M3C' 'M4C' 'M5C' 'M6C' 'M9C' 'M1E' 'M4E' 'M5E' 'M6E' 'M1G' 'M4G'
 'M5G' 'M6G' 'M1H' 'M2H' 'M3H' 'M4H' 'M5H' 'M6H' 'M1J' 'M2J' 'M3J' 'M4J'
 'M5J' 'M6J' 'M1K' 'M2K' 'M3K' 'M4K' 'M5K' 'M6K' 'M1L' 'M2L' 'M3L' 'M4L'
 'M5L' 'M6L' 'M9L' 'M1M' 'M2M' 'M3M' 'M4M' 'M5M' 'M6M' 'M9M' 'M1N' 'M2N'
 'M3N' 'M4N' 'M5N' 'M6N' 'M9N' 'M1P' 'M2P' 'M4P' 'M5P' 'M6P' 'M9P' 'M1R'
 'M2R' 'M4R' 'M5R' 'M6R' 'M7R' 'M9R' 'M1S' 'M4S' 'M5S' 'M6S' 'M1T' 'M4T'
 'M5T' 'M1V' 'M4V' 'M5V' 'M8V' 'M9V' 'M1W' 'M4W' 'M5W' 'M8W' 'M9W' 'M1X'
 'M4X' 'M5X' 'M8X' 'M4Y' 'M7Y' 'M8Y' 'M8Z']


In [45]:
#add the postcode and all the Neighbourhoods that belong to it into the "new_df" DataFrame

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and growing a
# frame one row at a time copies it on every iteration. Collect the rows first and build
# the frame once; the resulting rows, order and columns are unchanged.
records = [
    {
        "Postcode": code,
        # every neighbourhood listed under this postcode, joined into one comma-separated string
        "Neighbourhood": ", ".join(toronto_df.loc[toronto_df['Postcode'] == code, 'Neighbourhood']),
    }
    for code in postcodes
]
# passing the columns explicitly keeps both columns even when there are no records
new_df = pd.DataFrame(records, columns=["Postcode", "Neighbourhood"])

print(new_df.head())
print(new_df.shape)

  Postcode                     Neighbourhood
0      M3A                         Parkwoods
1      M4A                  Victoria Village
2      M5A         Harbourfront, Regent Park
3      M6A  Lawrence Heights, Lawrence Manor
4      M7A                      Queen's Park
(103, 2)


In [47]:
#keep only the first row seen for each postcode in the original DataFrame; later rows with the same postcode are dropped

toronto_df = toronto_df.drop_duplicates(subset='Postcode', keep='first')

In [48]:
# finally we merge two df with unique Postcodes and all the related neighbourhoods combined
# The single-neighbourhood column is dropped before merging, so the combined column from
# new_df arrives as plain 'Neighbourhood': no _x/_y suffixes to clean up and no rename step
# (the previous rename produced ' Neighbourhood' with a leading space in the column name).
toronto_df = toronto_df.drop(columns=['Neighbourhood']).merge(new_df, on='Postcode')

In [49]:
# first five rows of the merged frame
print(toronto_df.head(n=5))

  Postcode           Borough                     Neighbourhood
0      M3A        North York                         Parkwoods
1      M4A        North York                  Victoria Village
2      M5A  Downtown Toronto         Harbourfront, Regent Park
3      M6A        North York  Lawrence Heights, Lawrence Manor
4      M7A      Queen's Park                      Queen's Park


In [50]:
# (number of rows, number of columns) of the merged frame
print((len(toronto_df.index), len(toronto_df.columns)))

(103, 3)
