## Segmenting and Clustering neighbourhoods in Toronto

### Imports

In [47]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Read Html using Beautiful Soup Library

In [53]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M';
html = requests.get(url).text
soup = BeautifulSoup(html,'lxml')

In [54]:
postal_codes_table = soup.find('table',{'class':'wikitable sortable'})

In [55]:
table_cells = postal_codes_table.find_all('td')

In [56]:
N = 3
table_rows = [table_cells[n:n+N] for n in range(0, len(table_cells), N)]
postal_code = []
borough = []
neighborhood = []
for row in table_rows:
    postal_code_text = row[0].text
    borough_text = row[1].text
    neighborhood_text = row[2].text.rstrip(' ')
    if borough_text != 'Not assigned':
        postal_code.append(postal_code_text)
        borough.append(borough_text)
        neighborhood.append(neighborhood_text)

### Create Dataframe

In [57]:
df = pd.DataFrame()
df['PostalCode'] = postal_code
df['Borough'] = borough
df['Neighborhood'] = neighborhood
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [58]:
df = df.groupby(['PostalCode','Borough']).aggregate(lambda x : ', '.join(x)).reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M1B\n,Scarborough\n,"Malvern, Rouge\n"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek\n"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill\n"
4,M1G\n,Scarborough\n,Woburn\n


In [59]:
df.loc[df['PostalCode'] == 'M5A\n']

Unnamed: 0,PostalCode,Borough,Neighborhood
80,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [60]:
df.loc[df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df['Borough']

In [61]:
df.loc[df['PostalCode'] == 'M7A\n']

Unnamed: 0,PostalCode,Borough,Neighborhood
120,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [40]:

# df shape
df.shape

(180, 3)

### Insert Geospatial Coordinates CSV File to the code

In [41]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Perform Left Join to the Data Frames

In [42]:
df_data_1.rename(columns={'Postal Code':'PostalCode'},inplace=True)
df_data_1.head()
df = pd.merge(df, df_data_1, on='PostalCode', how='left')
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1A\n,Not assigned\n,,,
1,M1B\n,Scarborough\n,"Malvern, Rouge",,
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek",,
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill",,
4,M1G\n,Scarborough\n,Woburn,,


In [46]:
df.loc[df['PostalCode'] == 'M5G\n']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
84,M5G\n,Downtown Toronto\n,Central Bay Street,,
