In [36]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

## Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [37]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url,'lxml') 

In [38]:
table_post = soup.find('table')
fields = table_post.find_all('td')

In [39]:
postcode = []
borough = []
neighbourhood = []

## Create Dataframe with Postcode, Borough and Neighbourhood columns

In [40]:
for i in range(0, len(fields), 3):
    postcode.append(fields[i].text.strip())
    borough.append(fields[i+1].text.strip())
    neighbourhood.append(fields[i+2].text.strip())
        
toronto_df = pd.DataFrame(data=[postcode, borough, neighbourhood]).transpose()
toronto_df.columns = ['Postcode', 'Borough', 'Neighbourhood']
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Remove Not Assigned

In [41]:
toronto_df['Borough'].replace('Not assigned', np.nan, inplace=True)
toronto_df.dropna(subset=['Borough'], inplace=True)

toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Combine Neighborhoods corresponding to the same Postcode

In [42]:
toronto_df = toronto_df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
toronto_df.columns = ['Postcode', 'Borough', 'Neighbourhood']
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


## Final Dataframe as seen in the assignment

In [43]:
column_names = ["Postcode", "Borough", "Neighborhood"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(toronto_df[toronto_df["Postcode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,Postcode,Borough,Neighborhood,Neighbourhood
0,M5G,Downtown Toronto,,Central Bay Street
1,M2H,North York,,Hillcrest Village
2,M4B,East York,,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,,Scarborough Village
4,M4G,East York,,Leaside
5,M4M,East Toronto,,Studio District
6,M1R,Scarborough,,"Wexford, Maryvale"
7,M9V,Etobicoke,,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,,Humber Summit
9,M5V,Downtown Toronto,,"CN Tower, King and Spadina, Railway Lands, Har..."
