# Segmenting and Clustering Neighborhoods in Toronto

### Load the libraries

In [2]:
import numpy as np
import pandas as pd
import json
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import xml
!conda install -c conda-forge folium=0.5.0 --yes 
import folium

Collecting package metadata: done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::anaconda==5.3.1=py37_0
  - defaults/linux-64::astropy==3.0.4=py37h14c3975_0
  - defaults/linux-64::bkcharts==0.2=py37_0
  - defaults/linux-64::blaze==0.11.3=py37_0
  - defaults/linux-64::bokeh==0.13.0=py37_0
  - defaults/linux-64::bottleneck==1.2.1=py37h035aef0_1
  - defaults/linux-64::dask==0.19.1=py37_0
  - defaults/linux-64::datashape==0.5.4=py37_1
  - defaults/linux-64::mkl-service==1.1.2=py37h90e4bf4_5
  - defaults/linux-64::numba==0.39.0=py37h04863e7_0
  - defaults/linux-64::numexpr==2.6.8=py37hd89afb7_0
  - defaults/linux-64::odo==0.5.1=py37_0
  - defaults/linux-64::pytables==3.4.4=py37ha205bf6_0
  - defaults/linux-64::pytest-arraydiff==0.2=py37h39e3cac_0
  - defaults/linux-64::pytest-astropy==0.4.0=py37_0
  - defaults/linux-64::pytest-doctestplus==0.1.3=py37_0
  - defaults

### Scrape wikipedia page

In [4]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(url,'lxml')

### Process the cells with assigned borough using tags to find postal code by Borough and Neighbourhood 

In [5]:
tab = soup.find('table')
f = tab.find_all('td')

post = []
bor = []
nbh = []

for i in range(0, len(f), 3):
    post.append(f[i].text.strip())
    bor.append(f[i+1].text.strip())
    nbh.append(f[i+2].text.strip())
        
dfpc = pd.DataFrame(data=[post, bor, nbh]).transpose()
dfpc.columns = ['Postcode', 'Borough', 'Neighbourhood']
dfpc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove 'Not assigned' and aggregate

In [6]:
dfpc['Borough'].replace('Not assigned', np.nan, inplace=True)
dfpc.dropna(subset=['Borough'], inplace=True)
dfpc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [7]:
dfpc1 = dfpc.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
dfpc1.columns = ['Postcode', 'Borough', 'Neighbourhood']
dfpc1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [8]:
dfpc1['Neighbourhood'].replace('Not assigned', "Queen's Park", inplace=True)
dfpc1

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Print number of rows

In [9]:
dfpc1.shape

(103, 3)

In [10]:
dfgeo = pd.read_csv('http://cocl.us/Geospatial_data')
dfgeo.columns = ['Postcode', 'Latitude', 'Longitude']
df2 = pd.merge(dfpc1, dfgeo, on=['Postcode'], how='inner')
df3 = df2[['Postcode','Borough', 'Neighbourhood', 'Latitude', 'Longitude']].copy()
df3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Find coordinates of City of Toronto

In [11]:
addr = 'Toronto, Canada'
geoloc = Nominatim()
loc = geoloc.geocode(addr)
lat = loc.latitude
long = loc.longitude
print('Coordinates of Toronto are {}, {}.'.format(lat, long))

  


Coordinates of Toronto are 43.653963, -79.387207.


### Generate a map with a marker on the City of Toronoto

In [12]:
mapt = folium.Map(location=[lat, long], zoom_start=10)
for lt, lg, bor, nbh in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighbourhood']):
    lab = '{}, {}'.format(nbh, bor)
    lab = folium.Popup(lab, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=3,
        popup=lab,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.3,
        parse_html=False).add_to(mapt)  
mapt

### Foursqaure Credentials

In [None]:
CLIENT_ID = 'N0U44HKDODI2DSMBXYXMHFEBTHTZ0UBVYNP2JUZ3N4QJGRYV' 
CLIENT_SECRET = 'UXVELCYAJDXA5YRX1PZLSX0KYKG0L40Y4LIITFKZRC1DMPA4' 
VERSION = '20180605'

### K-means

In [13]:
mapt = folium.Map(location=[lat, long], zoom_start=10)
X = mapt['Latitude']
Y = mapt['Longitude']
Z = np.stack((X, Y), axis=1)

kmeans = KMeans(n_clusters=4, random_state=0).fit(Z)

clusters = kmeans.labels_
colors = ['red', 'green', 'blue', 'yellow']
mapt['Cluster'] = clusters

for lt, lg, bor, cluster in zip(mapt['Latitude'], mapt['Longitude'], mapt['Borough'], mapt['Cluster']):
    label = folium.Popup(bor, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(mapt)  

mapt

TypeError: 'Map' object is not subscriptable