# Toronto neighborhoods' data scraping and wrangling

First, we import necessary libraries for scraping and wrangling

In [85]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import numpy as np

Next, we scrape the table from the wikipedia page providing the list of postal codes in Canada

In [93]:
#get url using the requests library
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

#parse html of the web page using html parser
soup = BeautifulSoup(source,'html.parser')

#open a csv file passing in the writing argument
csv_file = open('canada_data.csv','w')

#assign the column headers
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['PostalCode','Borough','Neighborhood'])

#parse table from the web page
table  = soup.find('table')

#iterate over rows to fill the rows of the csv file
rows = table.find_all('tr')
for row in rows:
    csv_row = []
    for cell in row.find_all('td'):
        csv_row.append(cell.get_text())
    csv_writer.writerow(csv_row)
print('data loaded to csv file')

data loaded to csv file


read the file into a pandas dataframe

In [87]:
toronto_df = pd.read_csv('canada_data.csv',delimiter=",")
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n
5,M6A\n,North York\n,Lawrence Manor / Lawrence Heights\n
6,M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government\n
7,M8A\n,Not assigned\n,\n
8,M9A\n,Etobicoke\n,Islington Avenue\n
9,M1B\n,Scarborough\n,Malvern / Rouge\n


Then, we remove rows without assigned Boroughs

In [95]:
#replace 'Not assigned' with nan
toronto_df.replace("Not assigned\n", np.nan, inplace = True)

#drop rows with nan values in Borough column
toronto_df.dropna(subset=["Borough"], axis=0, inplace=True)

#reset index of the rows
toronto_df.reset_index(drop=True, inplace=True)


toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park , Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor , Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park , Ontario Provincial Government\n"
5,M9A\n,Etobicoke\n,Islington Avenue\n
6,M1B\n,Scarborough\n,"Malvern , Rouge\n"
7,M3B\n,North York\n,Don Mills\n
8,M4B\n,East York\n,"Parkview Hill , Woodbine Gardens\n"
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson\n"


Afterwards, we replace cells with 'Not assigned' neighborhood so the neighborhood will be the same as the borough.

In [90]:
toronto_df.replace("\n", "Borough", inplace = True)
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n
3,M6A\n,North York\n,Lawrence Manor / Lawrence Heights\n
4,M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government\n
5,M9A\n,Etobicoke\n,Islington Avenue\n
6,M1B\n,Scarborough\n,Malvern / Rouge\n
7,M3B\n,North York\n,Don Mills\n
8,M4B\n,East York\n,Parkview Hill / Woodbine Gardens\n
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson\n"


Finally, we group neighborhoods with the same postal code

In [91]:
toronto_df.groupby('PostalCode')
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].str.replace('/',',')
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park , Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor , Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park , Ontario Provincial Government\n"
5,M9A\n,Etobicoke\n,Islington Avenue\n
6,M1B\n,Scarborough\n,"Malvern , Rouge\n"
7,M3B\n,North York\n,Don Mills\n
8,M4B\n,East York\n,"Parkview Hill , Woodbine Gardens\n"
9,M5B\n,Downtown Toronto\n,"Garden District, Ryerson\n"
