# Let's Scrape The Toronto Postal Code Wikipedia Page

### First we have to import the libraries we will need

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

## Next, let's get the HTML code from Wikipedia

##### Using the Requests library, we download the HTML of the Toronto postal code Wikipedia page as text and store it in 'resource'. We then create a BeautifulSoup object named 'soup' so we can parse the HTML and extract the relevant information for the dataframe.

In [3]:
# Download the page. The timeout keeps this cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from silently parsing
# an HTTP error page as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
resource = response.text

# Parse the HTML with the lxml parser (the lxml package must be installed).
# Tip: soup.prettify() returns an indented view of the markup for inspection.
soup = BeautifulSoup(resource, 'lxml')

## Let's extract the table and its values

In [4]:
# Select by CSS classes instead of the exact class-attribute string, so the
# lookup still works if the classes are reordered or an extra class is added.
table = soup.select_one('table.wikitable.sortable')

# Fail here with a clear message rather than with an AttributeError on None
# in a later cell.
if table is None:
    raise ValueError("Could not find a 'wikitable sortable' table on the page")

##### The data is in HTML, so we iterate over every table row (the `<tr>` tags seen above) and use '.text' to get each row's text content with the HTML tags stripped out.

In [5]:
# Print the text content of every <tr> in the table, one row at a time
for tr in table('tr'):
    print(tr.get_text())


Postcode
Borough
Neighborhood


M1A
Not assigned
Not assigned


M2A
Not assigned
Not assigned


M3A
North York
Parkwoods


M4A
North York
Victoria Village


M5A
Downtown Toronto
Harbourfront


M6A
North York
Lawrence Heights


M6A
North York
Lawrence Manor


M7A
Queen's Park
Not assigned


M8A
Not assigned
Not assigned


M9A
Downtown Toronto
Queen's Park


M1B
Scarborough
Rouge


M1B
Scarborough
Malvern


M2B
Not assigned
Not assigned


M3B
North York
Don Mills North


M4B
East York
Woodbine Gardens


M4B
East York
Parkview Hill


M5B
Downtown Toronto
Ryerson


M5B
Downtown Toronto
Garden District


M6B
North York
Glencairn


M7B
Not assigned
Not assigned


M8B
Not assigned
Not assigned


M9B
Etobicoke
Cloverdale


M9B
Etobicoke
Islington


M9B
Etobicoke
Martin Grove


M9B
Etobicoke
Princess Gardens


M9B
Etobicoke
West Deane Park


M1C
Scarborough
Highland Creek


M1C
Scarborough
Rouge Hill


M1C
Scarborough
Port Union


M2C
Not assigned
Not assigned


M3C
North York
Flemingdon Park


### Let's list the elements of our rows

In [6]:
# Walk the table row by row: for each <tr>, gather its <td> cells and pull
# out their text. Once the loop finishes, `data` and `row1` hold only the
# values for the final row.
for tr in table('tr'):
    data = tr('td')
    row1 = list(map(lambda cell: cell.text, data))

## Now Let's Create the DataFrame

##### We will collect the row lists above into a single list of lists so we can turn it into a dataframe.

##### We will also need to add the column names, which are the table headers labelled `<th>` in the HTML code above.

In [7]:
# One list of stripped cell texts per <tr>. The header row has no <td> cells,
# so it contributes an empty list, which shows up as an empty first row in
# the dataframe below.
row = [
    [cell.get_text().strip() for cell in tr('td')]
    for tr in table('tr')
]

# Column names come from the <th> header cells; strip the surrounding whitespace.
labels = [header.get_text().strip() for header in table('th')]

tor_data = pd.DataFrame(row, columns=labels)
tor_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Now let's clean our dataframe

##### We will start by removing the empty top row and then resetting the index

In [8]:
# Remove the empty row produced by the header <tr>. dropna(how='all') only
# removes rows in which every column is missing, so re-running this cell is a
# no-op instead of deleting another (real) row. The previous in-place drop
# returned None, so its result is no longer stored in a variable.
tor_data = tor_data.dropna(how='all')

In [9]:
# Renumber the rows from 0, discarding the old index instead of keeping it as a column
tor_data1 = tor_data.reset_index(drop=True)

In [10]:
# Preview the first five rows of the re-indexed dataframe
tor_data1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### We need to remove all the 'Not assigned' values from the 'Borough' column

In [11]:
# Keep only the rows whose Borough is something other than 'Not assigned'
has_borough = tor_data1['Borough'].ne('Not assigned')
tor_data1 = tor_data1.loc[has_borough]

In [12]:
# Preview the first five remaining rows; the index keeps its original labels after filtering
tor_data1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### Let's group together 'Neighborhoods' with same 'Postcode'.

#### After getting our column names, we can start grouping Neighborhoods together first by 'Postcode' then by 'Borough' and joining together the string values.

In [13]:
# Show the column labels we will group by and aggregate below
tor_data1.columns

Index(['Postcode', 'Borough', 'Neighborhood'], dtype='object')

In [14]:
# Collapse rows that share a Postcode/Borough pair into a single row whose
# Neighborhood is the comma-joined list of the original neighborhoods.
tor_data1 = (
    tor_data1
    .groupby(['Postcode', 'Borough'])['Neighborhood']
    .apply(lambda names: ','.join(names))
    .reset_index()
)
tor_data1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Now we replace the 'Not assigned' values in the 'Neighborhood' column with the corresponding 'Borough' values.

In [15]:
# Where a neighborhood is still 'Not assigned', fall back to the borough name
unassigned = tor_data1['Neighborhood'] == 'Not assigned'
tor_data1.loc[unassigned, 'Neighborhood'] = tor_data1.loc[unassigned, 'Borough']
tor_data1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Now let's see the shape of the cleaned up DataFrame

In [16]:
# (rows, columns) of the cleaned dataframe
tor_data1.shape

(103, 3)