#### Importing all required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

#### Get the wikipedia link using the 'requests' method and initializing a BeautifulSoup object with html5lib as the parser

In [2]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source, 'html5lib')

#### Used 'Inspect' option in the Google Chrome browser application to find the workable table from the source code of Wikipedia page

In [3]:
initial_table = soup.find('table', {'class':'wikitable sortable'})

#### Initialized three lists for each column. Looped through the html tags 'tr' to get each row and separately filled in each list with elements within the tag 'td'

In [4]:
a,b,c=[],[],[]
for rows in initial_table.find_all('tr'):
    cells = rows.find_all('td')
    if(len(cells)==3):
        a.append(cells[0].find(text=True))
        b.append(cells[1].find(text=True))
        c.append(cells[2].find(text=True))

#### Created a new DataFrame with columns 'Postcode', 'Borough', and 'Neighbourhood' and assigned them their corresponding list values.

In [258]:
df = pd.DataFrame({'Postcode':a, 'Borough':b, 'Neighbourhood':c})
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [260]:
print('Number of rows in the current dataframe: ', df.shape[0])

Number of rows in the current dataframe:  289


#### The number of 'Not assigned' values in the column 'Borough' stands at 77

In [251]:
df['Borough'].value_counts()

Not assigned        77
Etobicoke           45
Scarborough         38
North York          38
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

#### Creating a separate copy of the dataframe to work on. Dropping all 'Not assigned' values of 'Borough' column from the dataframe.

In [261]:
df_toronto = df.copy()
df_toronto = df_toronto[df_toronto.Borough != 'Not assigned']
df_toronto.Borough.value_counts()

Etobicoke           45
Scarborough         38
North York          38
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

#### Converting the elements in the dataframe to type 'str' in order to reveal annoying '\n' values tagged along with the values. Removing '\n' values from the dataframe and reset the index.

In [262]:
df_toronto.reset_index(inplace=True)
df_toronto.drop(['index'], axis=1, inplace=True)
df_toronto = df_toronto.astype(str)
df_toronto = df_toronto.replace('\n','', regex=True)
df_toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### The For loop will search for all 'Not assigned' elements in 'Neighbourhood' column and replace it with the corresponding 'Borough' value.

In [263]:
for i in range(0,df_toronto.shape[0]):
    if df_toronto.Neighbourhood[i] == 'Not assigned':
        df_toronto.Neighbourhood.replace(df_toronto.Neighbourhood[i], df_toronto.Borough[i], inplace=True)

In [264]:
df_toronto.Neighbourhood[6] # previous value was "Not assigned"

"Queen's Park"

In [265]:
df_test.Borough[6]

'North York'

#### Final number of rows in the dataframe

In [266]:
print("The number of rows in the final dataframe :", df_test.shape[0])

The number of rows in the final dataframe : 212
