# Assignment involving Clustering of Toronto

##### Creating the dataframe for Toronto postal codes from the Wikipedia page
link: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
wiki_link = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [3]:
# Build the postcode table from the first HTML table on the Wikipedia page.
canada = pd.read_html(wiki_link, header=0)[0]

# Drop postcodes that have no borough. The explicit .copy() turns the filtered
# result into an independent frame, so the column assignment below does not
# operate on a slice of the original (which raises SettingWithCopyWarning).
canada = canada[canada['Borough'] != 'Not assigned'].copy()

# A neighbourhood that is 'Not assigned' falls back to the name of its borough.
canada['Neighbourhood'] = np.where(
    canada['Neighbourhood'] == 'Not assigned',
    canada['Borough'],
    canada['Neighbourhood'],
)


# Create 2 dataframes:
#    ca_n which contains postcode and neighbourhoods
#    ca_b which contains postcode and borough
#    This is the step where multiple neighbourhoods are placed in the same row
#    so that each postcode is unique.

# One row per postcode; its neighbourhood names are joined with commas.
ca_n = canada.groupby('Postcode')['Neighbourhood'].agg(','.join).reset_index()

# One row per distinct (postcode, borough) pair, in order of first appearance.
ca_b = canada[['Postcode', 'Borough']].drop_duplicates()


# Merge the two frames on postcode. Final output for Toronto neighbourhoods and
# boroughs by postcode, renumbered from 0.
canada = ca_b.join(ca_n.set_index('Postcode'), on='Postcode')
canada = canada.reset_index(drop=True)
canada

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [156]:
# Show the (rows, columns) dimensions of the postcode table.
canada.shape

(103, 3)




##### Adding latitude and longitude to the dataframe

In [5]:
import geocoder

In [19]:
#lat_lng_coords = None
#
#canada_post = canada.copy()
#canada_post['latitude'] = None
#canada_post['longitude'] = None
#for i in range(103):
#    lat_lng_coords = None
#    while(lat_lng_coords is None):
#        lat_lng_coords = geocoder.google('{}, Toronto, Ontario'.format(canada_post.iloc[i]['Postcode'])).latlng
#    canada_post.iloc[i]['latitude'] = lat_lng_coords[0]
#    canada_post.iloc[i]['longitude'] = lat_lng_coords[1]
#    
#    if(i % 5 == 0):
#        print("at", i, "index")

#This method took far too long to run, so the coordinates are loaded from an already-published dataset instead (next cell).

In [4]:
# Look up each postcode's coordinates in the published geospatial CSV (keyed
# by its 'Postal Code' column) and attach them to the postcode table.
canada_post = (
    canada
    .join(
        pd.read_csv("http://cocl.us/Geospatial_data").set_index('Postal Code'),
        on='Postcode',
    )
    .reset_index(drop=True)
)
canada_post

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


#### Running clustering tests on the data to cluster by latitude, longitude, and number of neighbourhoods

In [5]:
# Work on a copy so the extra feature column is not added to canada_post.
X = canada_post.copy()
# Neighbourhood names are comma-separated, so the number of names in a row is
# the number of commas plus one.
X['num_N'] = X['Neighbourhood'].map(lambda names: names.count(',') + 1)
X.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,num_N
0,M3A,North York,Parkwoods,43.753259,-79.329656,1
1,M4A,North York,Victoria Village,43.725882,-79.315572,1
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,2
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,2
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,1


In [6]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Keep only the clustering features. The explicit .copy() makes X an
# independent frame, so adding a column to it afterwards does not raise
# SettingWithCopyWarning.
X = X[['Latitude', 'Longitude', 'num_N']].copy()

# Standardize the features so they are on a comparable scale before
# clustering. The fitted scaler is kept in `s` so that results can be mapped
# back to the original units with inverse_transform.
s = StandardScaler()
X_norm = s.fit_transform(X)

# Fixed random_state keeps the random centroid initialisation reproducible.
clusterer = KMeans(n_clusters=5, init='random', random_state=42)
clusterer.fit(X_norm)

KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300, n_clusters=5,
       n_init=10, n_jobs=None, precompute_distances='auto', random_state=42,
       tol=0.0001, verbose=0)

In [7]:
# Cluster label assigned by the fitted clusterer to each row of X_norm.
classes = clusterer.predict(X_norm)

In [8]:
# Store each row's cluster label next to its features.
X['predictions'] = classes

In [9]:
import folium

# Centre point ([latitude, longitude]) of the initial map view.
To_coord = [43.7532, -79.3832]

themap = folium.Map(location=To_coord, zoom_start=11)

# Marker colour for each cluster label (0-4).
cluster_colors = {
    0: 'red',
    1: 'blue',
    2: 'green',
    3: 'orange',
    4: 'black',
}

# Walk the three columns in lockstep instead of mixing positional (iloc) and
# label-based lookups, which only agree when X has a default 0..n-1 index.
for lat, lng, label in zip(X['Latitude'], X['Longitude'], X['predictions']):
    color = cluster_colors.get(label)
    if color is None:
        # Labels outside 0-4 get no marker.
        continue
    folium.Marker([lat, lng], icon=folium.Icon(color=color)).add_to(themap)



In [16]:
# Import display explicitly (it is otherwise only available as a notebook
# built-in) and use the public IPython.display module instead of the
# deprecated IPython.core.display import path.
from IPython.display import HTML, display

# Render the folium map in the cell output.
display(themap)

In [79]:
# Undo the scaler's transform on the cluster centres so they can be read in
# the original feature units rather than standardized ones.
s.inverse_transform(clusterer.cluster_centers_)


array([[ 43.76799803, -79.28748517,   1.96      ],
       [ 43.6784762 , -79.37053168,   1.37142857],
       [ 43.64830877, -79.46592861,   2.3125    ],
       [ 43.6660393 , -79.50723083,   5.55555556],
       [ 43.73670472, -79.48505981,   1.5       ]])

### Analysis
  
  The KMeans clustering was run with 5 clusters. The objective is to group the postal codes by latitude and longitude, along with the number of neighbourhoods in each postal code. The 5 resulting classes are largely split by geographic location (the latitude and longitude). One major point to notice is that the 3rd and 4th classes, colored green and orange, have a higher average number of neighbourhoods than the others. For the 4th class (orange), it is much higher than for the other classes (about 5.6 neighbourhoods per postcode versus roughly 1.4–2.3). This suggests that the orange postcodes may be more densely populated than the others.