# Segmentation and Clustering of Toronto Neighborhoods

### Installing the required packages

In [2]:
pip install bs4

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install folium


Note: you may need to restart the kernel to use updated packages.


## Part I - Preparing data for Toronto by page scraping from Wikipedia

### The following code imports the required libraries and scrapes the Toronto postal-code data from Wikipedia

In [5]:
import numpy as np
import pandas as pd
import json as JSON
import requests
import csv
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import geocoder # import geocoder
import io
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

In [6]:
# Download the Wikipedia page that lists the postal codes starting with "M".
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page_response = requests.get(link, timeout=30)
# Fail loudly on an HTTP error instead of later parsing an error page as if
# it were the postal-code table.
page_response.raise_for_status()
data = page_response.text

In [7]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(data, 'html.parser')

# Empty accumulators for the postal code, borough and neighborhood columns.
pcList, bList, nList = [], [], []

In [8]:
# Start from empty lists in this cell so that re-running it does not append
# the same table rows a second time.
pcList, bList, nList = [], [], []

# The first <table> on the page is taken to be the postal-code table.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without three <td> cells (e.g. a header row) carry no data and
    # would raise IndexError below, so skip them.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace so later comparisons such as
    # == "Not assigned" are not defeated by stray spaces or newlines.
    pcList.append(cells[0].text.strip().split("\n")[0])
    bList.append(cells[1].text.strip().split("\n")[0])
    nList.append(cells[2].text.strip())

# create a new DataFrame from the three lists
df = pd.DataFrame({"PostalCode": pcList,
                   "Borough": bList,
                   "Neighborhood": nList})
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [9]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
df = df.query('Borough != "Not assigned"').reset_index(drop=True)
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the ", "-joined list of the group's neighborhoods.
df = (
    df.groupby(["PostalCode", "Borough"], as_index=False)
      .agg({"Neighborhood": ", ".join})
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
# Where a row has a borough but its neighborhood is "Not assigned", use the
# borough name as the neighborhood.
# This must be a direct assignment on df: the rows yielded by
# DataFrame.iterrows() are copies, so assigning to them leaves df unchanged.
unassigned = df["Neighborhood"] == "Not assigned"
df.loc[unassigned, "Neighborhood"] = df.loc[unassigned, "Borough"]

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Display the (rows, columns) shape of df.
df.shape

(103, 3)

## Part II - Explore the neighborhoods of Toronto

### Get the latitude and the longitude coordinates of each neighborhood.

In [13]:

# Download the CSV that holds the coordinates for each postal code.
url = 'http://cocl.us/Geospatial_data'
geo_response = requests.get(url, timeout=30)
# Stop on an HTTP error rather than handing an error page to read_csv.
geo_response.raise_for_status()
s = geo_response.content
cdf = pd.read_csv(io.StringIO(s.decode('utf-8')))

### Read the Toronto data and find the Latitude, Longitude of the neighborhoods

In [14]:
# Rename the "Postal Code" column to "PostalCode", the key name used by df.
cdf = cdf.rename(columns={"Postal Code": "PostalCode"})
cdf.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the two datasets

In [15]:
# Attach Latitude/Longitude to every row of df by matching on PostalCode.
# A left join keeps all rows of df even when cdf has no match for them.
toronto_df_new = pd.merge(df, cdf, how="left", on="PostalCode")
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part III - Create a map of Toronto with clustered neighborhoods

In [16]:
# One fill colour per cluster; the number of clusters follows from this list
# so the two can never get out of step.
# The list is deliberately NOT called `colors`: that name is the alias of the
# `matplotlib.colors` module imported at the top of the notebook.
cluster_colors = ['red', 'green', 'blue', 'yellow']
n_clusters = len(cluster_colors)

toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

# Feature matrix with one (latitude, longitude) row per row of toronto_df_new.
X = toronto_df_new['Latitude']
Y = toronto_df_new['Longitude']
Z = np.stack((X, Y), axis=1)

# The left merge leaves NaN coordinates for postal codes missing from the
# coordinate file; report that clearly instead of failing inside KMeans.fit.
if np.isnan(Z).any():
    raise ValueError("Some rows have no Latitude/Longitude; cannot cluster them")

# n_init is pinned so the result does not depend on the scikit-learn version's
# default; random_state makes the clustering reproducible.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(Z)

clusters = kmeans.labels_
toronto_df_new['Cluster'] = clusters

# Draw one circle per row: black outline, filled with its cluster's colour,
# and a popup showing the borough name.
for latitude, longitude, borough, cluster in zip(
        toronto_df_new['Latitude'],
        toronto_df_new['Longitude'],
        toronto_df_new['Borough'],
        toronto_df_new['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=cluster_colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)

toronto_map