# Segmenting and Clustering Neighborhoods in the city of Toronto
## First part of the assignment
#### (Creating notebook, scraping data and transforming the dataframe)

The necessary packages are imported below.

In [1]:
import pandas as pd
import numpy as np

***Scraping of the data***  
Pandas is used to scrape data from the Wikipedia page listing the postal codes of Canada. `pd.read_html` returns a list of three DataFrames (one per table on the page). The first element is the table we need.

In [2]:
# Source: Wikipedia's list of Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html parses each HTML table on the page into its own DataFrame.
df_scraped = pd.read_html(io=url)

# The postal-code table is the first one in the returned list.
df_toronto = df_scraped[0]
df_toronto.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
# Inspect the column dtypes of the scraped table before transforming it.
df_toronto.dtypes

Postal Code      object
Borough          object
Neighbourhood    object
dtype: object

***Transforming the data***

In [4]:
# Keep only the postal codes that have a borough assigned, rename the
# "Postal Code" column to "PostalCode", and convert every column from
# object to the pandas string dtype.
# The chain builds a new frame instead of using inplace=True, so the scraped
# table in df_scraped[0] is left untouched and the cell can be re-run safely.
# The original row index is kept.
has_borough = df_toronto["Borough"] != "Not assigned"
df_toronto = (
    df_toronto.loc[has_borough]
    .rename(columns={"Postal Code": "PostalCode"})
    .astype("string")
)

_Note that the Wikipedia table already lists a postal code that spans several neighbourhoods on a single row, with the corresponding neighbourhoods separated by commas._  

Neighbourhoods that are not assigned will receive the name of their borough

In [5]:
# Rows without an assigned neighbourhood take the name of their borough.
unassigned_neighbourhoods = df_toronto[df_toronto["Neighbourhood"] == "Not assigned"].index

# Use one .loc assignment rather than chained indexing (df.col[idx] = ...):
# chained assignment can write to a temporary copy, triggers
# SettingWithCopyWarning, and has no effect under pandas copy-on-write.
df_toronto.loc[unassigned_neighbourhoods, "Neighbourhood"] = df_toronto.loc[
    unassigned_neighbourhoods, "Borough"
]
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


_Please note that in the Wikipedia table every postal code that has a borough assigned also has a neighbourhood assigned._

In [25]:
# Sanity check: number of rows (postal codes) and columns after cleaning.
df_toronto.shape

(103, 3)

## Second part of the assignment
#### (Adding Latitude and Longitude data to the dataframe)

In [6]:
#Obtain geographical coordinates from geocoder

In [7]:
# Install the geocoder package with pip for the Python interpreter that runs
# the current Jupyter kernel (sys.executable is that interpreter's path).
import sys
!{sys.executable} -m pip install geocoder



In [8]:
import geocoder # import geocoder
import os #import os to get workingdirectory

Geocoder is called for each postal code. A while loop is used because the package is not very reliable: the request is retried until it succeeds.

In [9]:
# NOTE(review): this loop is disabled and kept for reference only.
# Before re-enabling it, fix the following:
#   * .iloc takes [row, column], but `i` iterates over the rows and is passed
#     as the column, so the indices below are transposed
#     (e.g. the postal code of row i is df_toronto.iloc[i, 0]).
#   * df_toronto has only three columns, so column positions 3 and 4 for
#     latitude/longitude must be created before they can be written.
#   * the while loop has no retry limit and never ends if geocoding keeps failing.
# initialize your variable to None
# for i in range(0,len(df_toronto.Neighbourhood)):
#     lat_lng_coords = None

#     # loop until you get the coordinates
#     while(lat_lng_coords is None):
#       g = geocoder.google('{}, Toronto, Ontario'.format(df_toronto.iloc[0,i]))
#       lat_lng_coords = g.latlng

#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
#     #Write the coordinates to the dataframe
#     df_toronto.iloc[3,i] = latitude
#     df_toronto.iloc[4,i] = longitude
# df_toronto.head()

In [10]:
#Since the geocoder call takes forever to run the csv will be used for the latlong data

In [12]:
# Directory the notebook runs in, and the directory one level above it.
path = os.getcwd()
parentpath = os.path.split(path)[0]

In [13]:
# Load the pre-computed coordinates per postal code from
# <parentpath>/src/data/Geospatial_Coordinates.csv.
# os.path.join builds the path with the separator of the current platform.
# The previous "\src\data\..." literal relied on invalid escape sequences
# and only resolved on Windows.
latlong_csv = os.path.join(parentpath, "src", "data", "Geospatial_Coordinates.csv")

# Rename the key column so it matches df_toronto for the join that follows.
df_latlong = pd.read_csv(latlong_csv).rename(columns={"Postal Code": "PostalCode"})
df_latlong.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Give the join key the pandas string dtype, the same dtype as in df_toronto.
postal_codes = df_latlong["PostalCode"]
df_latlong["PostalCode"] = postal_codes.astype(pd.StringDtype())
df_latlong.dtypes

PostalCode     string
Latitude      float64
Longitude     float64
dtype: object

In [15]:
# Attach latitude/longitude to every Toronto postal code.
# A left join keeps all rows of df_toronto, even those without coordinates.
df_toronto_latlong = df_toronto.merge(df_latlong, how="left", on="PostalCode")

In [16]:
# Preview the joined table: postal code, borough, neighbourhood and coordinates.
df_toronto_latlong.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Third part of the assignment
#### (Exploring and clustering the neighborhoods in Toronto)

In [17]:
# Check which columns are still non-numeric before clustering.
df_toronto_latlong.dtypes

PostalCode        string
Borough           string
Neighbourhood     string
Latitude         float64
Longitude        float64
dtype: object

In [18]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

Run k-means clustering. Before doing this, however, the categorical Borough values have to be converted to numerical values via one-hot encoding. This could be done for all categorical columns, but the choice has been made to do it only for the boroughs. This will likely be what the clusters are based on!

In [19]:
# One indicator column per borough.
df_toronto_dummied = pd.get_dummies(df_toronto_latlong.Borough)

# Append the indicator columns to the right of the existing columns.
df_toronto_latlong = pd.concat(
    (df_toronto_latlong, df_toronto_dummied),
    axis="columns",
)
df_toronto_latlong.head()


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,0,0,0,0,0,1,0,0,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,0,0,0,0,0,1,0,0,0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,1,0,0,0,0,0,0,0,0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,0,0,0,0,0,1,0,0,0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,1,0,0,0,0,0,0,0,0


In [20]:
# set number of clusters
kclusters = 5

# Keep only the numeric features (coordinates and one-hot borough columns).
# `columns=` replaces the positional axis argument `drop([...], 1)`, which
# pandas >= 2.0 rejects with a TypeError.
toronto_grouped_clustering = df_toronto_latlong.drop(
    columns=['Borough', 'Neighbourhood', 'PostalCode']
)

# run k-means clustering
# n_init is set explicitly: 10 was the library default when the output below
# was produced, and newer scikit-learn versions changed that default.
# random_state makes the result reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]


array([0, 0, 1, 0, 1, 3, 2, 0, 4, 1])

In [21]:
# Store the k-means label of every row as the new first column.
df_toronto_latlong.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)
df_toronto_latlong.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,0,M3A,North York,Parkwoods,43.753259,-79.329656,0,0,0,0,0,0,1,0,0,0
1,0,M4A,North York,Victoria Village,43.725882,-79.315572,0,0,0,0,0,0,1,0,0,0
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,1,0,0,0,0,0,0,0,0
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,0,0,0,0,0,1,0,0,0
4,1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,1,0,0,0,0,0,0,0,0


In [22]:
import folium
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [23]:
# Set latitude and longitude for Toronto to start the map
latitude = 43.651070
longitude = -79.347015

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters:
# one evenly spaced colour of the rainbow colormap per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per postal code, coloured by its cluster
for lat, lon, poi, cluster in zip(
    df_toronto_latlong['Latitude'],
    df_toronto_latlong['Longitude'],
    df_toronto_latlong['Neighbourhood'],
    df_toronto_latlong['Cluster Labels'],
):
    # k-means labels run from 0 to kclusters - 1, so they index the palette
    # directly; `cluster - 1` sent cluster 0 to the last colour via index -1.
    marker_color = rainbow[cluster]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters