# Import libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
print("Libraries Imported")

Libraries Imported


# Creating DataFrame

In [3]:
# Download the Wikipedia page and parse it into a soup object.
source = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  # do not let the cell hang forever on a stalled connection
)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
source.raise_for_status()
soup = BeautifulSoup(source.text, 'lxml')

# Only the first element carrying the "wikitable" class is read.
table = soup.find(class_ = 'wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' was found in the downloaded page")

# One list of cell texts per table row; trailing whitespace (e.g. the newline
# that ends each cell's text) is stripped.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th','td'])]
    for tr in table.find_all('tr')
]

# The first row of the table is the header, every following row is data.
columns = rows[0] if rows else []
data = rows[1:]

# Convert the collected rows into a pandas DataFrame.
toronto_df = pd.DataFrame(data = data, columns = columns)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Data Cleaning

## Remove rows whose borough is not assigned

In [5]:

# Keep only the rows whose borough has been assigned.
# .copy() yields an independent frame, so later column assignments do not
# operate on a slice of the unfiltered frame (avoids SettingWithCopyWarning).
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].copy()
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## More than one neighborhood can exist in one postal code area.

In [11]:

# Give every row that shares a postal code the same comma-separated string of
# all neighborhoods found under that postal code.
combined_neighborhoods = toronto_df.groupby("Postal Code")["Neighborhood"].transform(
    lambda names: ', '.join(names)
)

# .assign builds a new frame instead of writing a column into a frame that may
# be a filtered slice (which would raise SettingWithCopyWarning); rows that are
# now fully identical are then collapsed into one.
toronto_df = toronto_df.assign(Neighborhood=combined_neighborhoods).drop_duplicates()

toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [14]:
# Promote the "Postal Code" column to the index. The check on the index name
# skips the step when the index is already named "Postal Code", so the cell can
# be executed more than once.
already_indexed = toronto_df.index.name == 'Postal Code'
if not already_indexed:
    toronto_df = toronto_df.set_index('Postal Code')

toronto_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [15]:
# Rows whose neighborhood is the literal "Not assigned" take the borough name
# from the same row. Series.mask replaces values where the condition is True,
# aligning the replacement Series by index.
# The result is assigned back explicitly: an in-place replace() on a selected
# column is chained mutation and is not guaranteed to reach toronto_df.
neighborhood_missing = toronto_df["Neighborhood"] == "Not assigned"
toronto_df = toronto_df.assign(
    Neighborhood=toronto_df["Neighborhood"].mask(neighborhood_missing, toronto_df["Borough"])
)
toronto_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## In the last cell of your notebook, use the `.shape` attribute to print the number of rows of your dataframe.

In [16]:
# (rows, columns) of the cleaned frame; the first entry is the number of rows.
toronto_df.shape

(103, 2)