## PART ONE

### Importing the Libraries

In [1]:
import io
import urllib.request

import bs4 as bs
import lxml.html as lh
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import requests # library to handle requests

In [2]:
# Wikipedia page listing Canadian postal codes that start with "M". The URL
# carries an `oldid` query parameter, so it requests one specific revision.
url   = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=890001695"

In [3]:
# Download the page named by `url` (set in the previous cell) rather than
# repeating the literal. The timeout keeps the cell from hanging indefinitely,
# and raise_for_status() surfaces an HTTP error here instead of letting an
# error page flow into the HTML parsing step.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

#### Preliminary overview of the dataframe
We will look at the first 10 rows.

In [4]:
# Parse the first table carrying the "wikitable sortable" class, using its
# first row as the header. The HTML text is wrapped in StringIO because passing
# a literal HTML string to read_html is deprecated in recent pandas releases.
raw_table = pd.read_html(io.StringIO(source), header=0, attrs={"class":"wikitable sortable"})[0]
raw_table.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


#### Creating the dataframe

First, we will rename the columns.

In [5]:
# Give the postal-code column its final name; "Borough" maps to itself and is
# listed only to keep the mapping explicit.
raw_table = raw_table.rename(
    {"Postcode": "PostalCode", "Borough": "Borough"},
    axis="columns",
)

Then, we drop the rows whose borough is 'Not assigned'.

In [6]:
# Remove, by index label, every row whose Borough is the 'Not assigned' placeholder.
raw_table = raw_table.drop(
    index=raw_table.index[raw_table['Borough'].eq('Not assigned')]
)

We then combine neighborhoods that share the same postal code area.

In [7]:
# Collapse rows that share a postal code into one row, joining the distinct
# values of every other column with ", ".
# dict.fromkeys() removes duplicates while keeping first-appearance order; the
# previous set() made the joined order depend on string hash randomization, so
# the output could change between kernel restarts.
table = (
    raw_table
    .groupby('PostalCode', sort=False, as_index=False)
    .agg(lambda values: ', '.join(dict.fromkeys(values)))
)

If a row has an assigned borough but its neighbourhood is 'Not assigned', the borough's name replaces the 'Not assigned' value.

In [8]:
# Where the neighbourhood is the 'Not assigned' placeholder, use the row's
# borough name instead. Series.mask replaces values wherever the condition is
# True, aligned on the index, so this does not rely on the index being 0..n-1
# the way the positional row-by-row loop did.
table['Neighbourhood'] = table['Neighbourhood'].mask(
    table['Neighbourhood'].eq('Not assigned'),
    table['Borough'],
)

In [9]:
# Regroup on (postal code, borough): groupby sorts its keys by default, which
# orders the rows, and reset_index turns the keys back into ordinary columns.
table = (
    table
    .groupby(by=['PostalCode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)

We view the new and polished dataframe.

In [10]:
# Show the cleaned dataframe as this cell's rich output.
table

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"Thistletown, Humbergate, Jamestown, Silverston..."


In [11]:
# Report the (rows, columns) dimensions of the cleaned dataframe.
table.shape

(103, 3)

In [12]:
# Save the cleaned dataframe to CSV, leaving out the positional index column.
table.to_csv(path_or_buf='Toronto_Dataframe.csv', index=False)