In [45]:
# The code was removed by Watson Studio for sharing.

# Data Scraping of Toronto Neighborhoods
We'll start by importing the libraries necessary for scraping the Wikipedia data.

In [46]:
from bs4 import BeautifulSoup
import lxml
import requests
import csv

Now we'll load the data from the Wikipedia page:

In [47]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
# Parse the HTML with the lxml parser so the page can be searched by tag.
soup = BeautifulSoup(source, 'lxml')

The "soup" variable contains the entire page HTML, but we only care about the postal-code table, which is the first table on the page.

In [48]:
# Isolate the table of interest: find() returns the first <table> element in the page.
src_table = soup.find('table')
# find() returns None when nothing matches; fail here with a clear message rather than
# with an AttributeError when the table is iterated in a later cell.
if src_table is None:
    raise ValueError('No <table> element found in the downloaded page.')

Now we have successfully pulled out the HTML of the data table, so we need to put it into a format that is easier to work with. I will create a CSV file and write the data rows into it.

In [49]:
# Open the output CSV. newline='' is what the csv module expects for files it writes
# (otherwise an extra '\r' can appear on platforms that translate '\n'), and the explicit
# encoding keeps non-ASCII names intact regardless of the platform default.
# The file is left open on purpose: later cells write the data rows and then close it.
csv_file = open('toronto_neighborhoods.csv', 'w', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)

# Write in the column headers:
csv_writer.writerow(['PostalCode', 'Borough','Neighborhood'])

33

In [50]:
# Walk the source table row by row and copy each data row into the CSV.
for row in src_table.find_all('tr'):
    # Collect the text of every <td> cell in this row. The cell text itself can contain
    # newline characters (e.g. a trailing '\n'); they are removed so each record stays
    # on a single CSV line.
    row_list = [cell.text.replace('\n', '') for cell in row.find_all('td')]
    # Rows without any <td> cells (such as a header row built from <th> cells) would
    # otherwise be written as blank lines, so skip them.
    if row_list:
        csv_writer.writerow(row_list)

In [51]:
# Close the file so every row written above is flushed to disk before the CSV is read back.
csv_file.close()

Let's create a pandas dataframe out of this CSV file:

In [52]:
import pandas as pd
import numpy as np

In [53]:
# Load the CSV written by the scraping cells into a DataFrame and preview the first rows.
csv_path = 'toronto_neighborhoods.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Now we have a working copy of the data table as a Pandas dataframe, so we'll need to process it for the data analysis. We'll start by dropping any rows that have "Not assigned" for the borough.

In [54]:
# Replace the 'Not assigned' boroughs with numpy's NaN for easier row dropping.
# A boolean mask does this in one vectorized assignment and, unlike looping over
# range(len(df)) with df.loc[i, ...], does not require the index labels to be 0..n-1
# (so the cell still works if it is re-run after rows have been dropped).
unassigned_borough = df['Borough'] == 'Not assigned'
df.loc[unassigned_borough, 'Borough'] = np.nan
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,Not assigned
1,M2A,,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [55]:
# Keep only the rows that have a Borough value; rows where it is NaN are discarded.
# The surviving rows keep their original index labels.
df = df.dropna(subset=["Borough"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [56]:
# Collapse rows that share a postal code (and borough) into one row whose remaining
# column values are joined into a single comma-separated string.
# sort=False keeps the groups in first-appearance order; as_index=False keeps the
# grouping keys as ordinary columns.
group_keys = ['PostalCode', 'Borough']
grouped = df.groupby(group_keys, as_index=False, sort=False)
df = grouped.agg(', '.join)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [57]:
# If the neighborhood is "Not assigned", set it to the corresponding Borough name.
# The mask selects those rows once, and the assignment aligns on the index, so this
# works without a Python-level loop and without assuming the index labels are 0..n-1.
unassigned_neighborhood = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned_neighborhood, 'Neighborhood'] = df.loc[unassigned_neighborhood, 'Borough']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


Finally, find out what the shape of the processed dataframe is:

In [58]:
# Display the (rows, columns) dimensions of the processed dataframe.
df.shape

(103, 3)

In [60]:
# Serialize the processed dataframe to CSV text (without the index column) and store it
# in the project assets, replacing any existing file of the same name, so the next part
# of the project can load it directly.
processed_csv = df.to_csv(index=False)
project.save_data(
    data=processed_csv,
    file_name='processed_toronto_neigh.csv',
    overwrite=True,
)

{'asset_id': '6fbe673c-dcb1-4b18-8dca-73e7578a2193',
 'bucket_name': 'machinelearningfinalproject-donotdelete-pr-2woc1o3vy08wzt',
 'file_name': 'processed_toronto_neigh.csv',
 'message': 'File saved to project storage.'}