# Capstone Project: Segmenting and Clustering Toronto

## imports

In [2]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

## Scraping the Wikipedia page

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = urllib.request.urlopen(url)
article = req.read().decode()

We know the page has only one table, we use BeautifulSoup to find it

In [4]:
soup = BeautifulSoup(article, 'html.parser')
table = soup.find('table', class_='sortable')

Now we parse the table into a Pandas dataframe

In [5]:
#headings
ths = table.find_all('th')
headings = [th.text.strip() for th in ths]

#create dataframe
neighbourhoods = pd.DataFrame(columns=headings)
neighbourhoods

# scrape rows into dataframe
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue                            #skips first row with headings
    pc, b, n = [td.text.strip() for td in tds]
    if b!="Not assigned":
        neighbourhoods = neighbourhoods.append({'Postcode': pc, 'Borough': b, 'Neighbourhood': n}, ignore_index=True)

neighbourhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Neighbourhood,Postcode
0,,North York,,Parkwoods,M3A
1,,North York,,Victoria Village,M4A
2,,Downtown Toronto,,"Regent Park, Harbourfront",M5A
3,,North York,,"Lawrence Manor, Lawrence Heights",M6A
4,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government",M7A


## Cleaning up the dataframe
Combine neighbourhoods for the same postcode, by concatenating the names. Then drop all duplicates arising

In [6]:

neighbourhoods['Neighbourhood'] = neighbourhoods.groupby('Postcode')['Neighbourhood'].transform(lambda x: "%s" % ', '.join(x)).values
neighbourhoods = neighbourhoods.drop_duplicates().reset_index(drop=True)
neighbourhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Neighbourhood,Postcode
0,,North York,,Parkwoods,M3A
1,,North York,,Victoria Village,M4A
2,,Downtown Toronto,,"Regent Park, Harbourfront",M5A
3,,North York,,"Lawrence Manor, Lawrence Heights",M6A
4,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government",M7A


Replace Neighbourhoods with 'Not assigned' to their Burough name

In [7]:
neighbourhoods['Neighbourhood'].replace("Not assigned", neighbourhoods['Borough'], inplace=True)
neighbourhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Neighbourhood,Postcode
0,,North York,,Parkwoods,M3A
1,,North York,,Victoria Village,M4A
2,,Downtown Toronto,,"Regent Park, Harbourfront",M5A
3,,North York,,"Lawrence Manor, Lawrence Heights",M6A
4,,Downtown Toronto,,"Queen's Park, Ontario Provincial Government",M7A


This will result in the following amount of rows:

In [8]:
neighbourhoods.shape

(103, 5)