# Segmenting and Clustering Neighborhoods in Toronto

## Part 1 - Extract neighborhoods from the Wikipedia page

#### Import packages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib
import time
import folium
import re

#### Get webpage information and create soup

In [2]:
# Pinned revision (oldid) of the Wikipedia page so the scraped table does not
# change between runs. The stray trailing '.' after the revision id was removed.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.text,'html.parser')

#### Extract table information and create dataframe

In [70]:
# Collect one dict per table row, then build the dataframe in a single step.
table_contents = []
table = soup.find('table')
if table is None:
    # Without this check the loop below would fail with an opaque AttributeError.
    raise ValueError('No <table> element found in the downloaded page')

# loop through table rows to obtain values
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Skip rows that do not provide the three expected <td> cells (e.g. a row
    # made only of header cells). This replaces the former bare `except: pass`,
    # which also hid every unrelated error raised inside the loop.
    if len(cells) < 3:
        continue
    table_contents.append({
        'PostalCode': cells[0].get_text(),
        'Borough': cells[1].get_text(),
        # drop newline characters embedded in the neighborhood text
        'Neighborhood': cells[2].get_text().replace('\n', ''),
    })

# create a dataframe containing extracted text values; the explicit column list
# keeps the schema stable even when no rows were extracted
df = pd.DataFrame(table_contents, columns=['PostalCode', 'Borough', 'Neighborhood'])

#### Clean data and combine rows per postal code

In [70]:
# rename selected borough labels to shorter, readable names
borough_renames = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_renames)

# keep only the rows whose borough is not the 'Not assigned' placeholder
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough]

# collapse to one row per (postal code, borough) pair, joining the
# neighborhoods of each pair with a comma
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
      .agg(', '.join)
      .reset_index()
)

In [72]:
# list any rows whose neighborhood is still the 'Not assigned' placeholder
df.loc[df['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


The result is empty: no remaining row has a neighborhood equal to 'Not assigned'.

#### Check shape of final table

In [74]:
# (rows, columns) of the cleaned frame; after the grouping step above there is
# one row per (PostalCode, Borough) pair
df.shape

(103, 3)