# Segmenting and Clustering Neighborhoods in Toronto


In [3]:
import numpy as np
import pandas as pd
import requests
import json
import matplotlib
from bs4 import BeautifulSoup


### Fetching the page contents from the URL

In [4]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely if the server does not respond.
wiki = requests.get(URL, timeout=30)
# Stop here on an HTTP error instead of handing an error page to the parser later.
wiki.raise_for_status()

In [5]:

soup = BeautifulSoup(wiki.content, 'html.parser')

# Only the first <table> element in the document is used.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Dataframe columns: PostalCode, Borough and Neighborhood
COLUMNS = ['PostalCode', 'Borough', 'Neighborhood']

# Collect one list of stripped cell texts per table row, keeping only rows
# that have exactly one <td> per column (rows with no <td> cells, or with a
# different cell count, are skipped).
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(COLUMNS):
        rows.append(row_data)

# Build the frame once from the collected rows instead of appending row by row.
df = pd.DataFrame(rows, columns=COLUMNS)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Cleaning the Data

In [6]:
# Keep only the rows whose borough has actually been assigned.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [13]:
# Combine rows that share a postal code and borough into a single row,
# joining their neighborhood names with ", ".
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
new_df = neighborhoods_by_code.apply(', '.join).reset_index()

print(new_df.head())
new_df.shape

  PostalCode      Borough                              Neighborhood
0        M1B  Scarborough                           Malvern / Rouge
1        M1C  Scarborough  Rouge Hill / Port Union / Highland Creek
2        M1E  Scarborough       Guildwood / Morningside / West Hill
3        M1G  Scarborough                                    Woburn
4        M1H  Scarborough                                 Cedarbrae


(103, 3)

### Import the latitude and longitude CSV

In [10]:
# Load the postal-code coordinate table directly from its hosted CSV.
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
latilong = pd.read_csv(GEOSPATIAL_CSV_URL)
latilong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Clean the coordinate data and merge it with the neighborhood data

In [17]:
# Align the key column name with new_df. Rebinding the name (instead of
# renaming in place) avoids mutating the frame object an earlier cell displayed.
latilong = latilong.rename(columns={'Postal Code': 'PostalCode'})

# Inner join on the postal code: codes missing from either frame are dropped.
# validate='one_to_one' makes pandas raise MergeError if a postal code appears
# more than once on either side, rather than silently duplicating rows.
df2 = pd.merge(
    new_df,
    latilong,
    on='PostalCode',
    how='inner',
    validate='one_to_one',
)

df2.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park,43.727929,-79.262029
7,M1L,Scarborough,Golden Mile / Clairlea / Oakridge,43.711112,-79.284577
8,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch Cliff / Cliffside West,43.692657,-79.264848


### Select postal codes whose borough name contains "Toronto"

In [18]:
# Literal (non-regex) substring match on the borough name.
in_toronto = df2['Borough'].str.contains('Toronto', regex=False)
df3 = df2[in_toronto]
df3

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
42,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,Moore Park / Summerhill East,43.689574,-79.38316
49,M4V,Central Toronto,Summerhill West / Rathnelly / South Hill / For...,43.686412,-79.400049


### Visualizing the data

In [30]:
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [55]:
# Base map centred on fixed coordinates; one circle marker is added per row of df3.
toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=12)

for lat, lng, name in zip(df3['Latitude'], df3['Longitude'], df3['Neighborhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        # parse_html=True makes folium treat the scraped neighborhood text as
        # plain text, so characters such as apostrophes or '<' are displayed
        # literally instead of being interpreted as markup.
        popup=folium.Popup(name, parse_html=True),
        color='blue').add_to(toronto)

toronto

