# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

# Download the Wikipedia page and keep its HTML text.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')
# The first <table> element found in the page is the one that gets scraped.
table = soup.find('table')
if table is None:
    raise ValueError('no <table> element found in the downloaded page')
# Header cells and table rows, consumed by the next cell.
ths = table.findAll('th')
trs = table.findAll('tr')

In [2]:
# Column names are the header-cell texts with newlines stripped.
columns = [th.text.replace('\n', '') for th in ths]

# Collect one list of cell texts per table row. trs[0] is skipped, exactly as
# the original index loop starting at 1 did. Rows that contain no <td> cells
# are skipped too, since they carry no data and cannot fill the columns.
rows = []
for tr in trs[1:]:
    cells = [td.text.replace('\n', '') for td in tr.findAll('td')]
    if cells:
        rows.append(cells)

# Build the frame once from the collected rows instead of appending to it
# row by row, which re-allocates the frame on every insertion.
df = pd.DataFrame(rows, columns=columns)
print('shpae of dataframe scrapped from Wikipedia', df.shape)
df.head(10)

shpae of dataframe scrapped from Wikipedia (287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


## Ignore cells with a borough that is Not assigned

In [3]:
# Report how many rows carry the 'Not assigned' placeholder in the Borough column
print('Shape of dataframe with no borogh assigned', df.loc[df['Borough'].eq('Not assigned')].shape)
# Remove those rows from df in place; the remaining rows keep their index labels
df.drop(index=df.loc[df['Borough'].eq('Not assigned')].index, inplace=True)
print('Shape of dataframe have an assigned borough', df.shape)
df.head()

Shape of dataframe with no borogh assigned (77, 3)
Shape of dataframe have an assigned borough (210, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## More than one neighborhood can exist in one postal code area (combining rows)

In [4]:
# Collapse the frame to one row per postcode in a single pass:
#   - sort=False keeps the postcodes in order of first appearance,
#   - the borough is taken from the first row of each postcode group,
#   - the neighbourhood names of the group are joined with ', '.
# reset_index() turns the Postcode group key back into a regular column and
# gives the result a fresh 0..n-1 index.
processed_df = (
    df.groupby('Postcode', sort=False)
      .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
      .reset_index()
)
df = processed_df
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Not assigned
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


## Replace the neighborhood of a cell that has a borough but a Not assigned neighborhood

In [5]:
## index labels of the rows whose neighbourhood needs replacing
indicies = df.loc[df['Neighbourhood'].eq('Not assigned')].index
# preview the affected rows
df.loc[indicies].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M9A,Queen's Park,Not assigned


In [6]:
## replace
# Write the borough name into the Neighbourhood column for the selected rows.
# A single .loc[rows, column] assignment writes into df itself; the chained
# form df.loc[index]['Neighbourhood'] = ... assigns into a temporary row object
# and can leave df unchanged.
df.loc[indicies, 'Neighbourhood'] = df.loc[indicies, 'Borough']
df.loc[indicies].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M9A,Queen's Park,Queen's Park


## Final View

In [7]:
# Print the final (rows, columns) shape, then preview the first ten rows
print('the number of rows of dataframe', (len(df.index), len(df.columns)))
df.head(10)

the number of rows of dataframe (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"
