### Segmenting and Clustering Neighborhoods in Toronto

In [1]:
#reading/scraping the webpage using pandas
import pandas as pd
df=pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]

In [2]:
#intial shape of the dataframe
df.shape

(287, 3)

In [3]:
#indexing the 'Not assigned' Boroughs
indexNames = df[df['Borough']=='Not assigned'].index

In [4]:
#Dropping the Indexed 'Not assigned' Boroughs
df.drop(indexNames, inplace=True)

In [5]:
#shape after dropping 'Not assigned' Boroughs
df.shape

(210, 3)

In [6]:
#Grouping by Postcode
df=df.groupby(['Postcode', 'Borough']) ['Neighbourhood'].apply (list)
df=df.sample(frac=1).reset_index()
df['Neighbourhood']=df['Neighbourhood'].str.join(',')

In [7]:
#finding the 'Not assigned' Neighbourhood 
df.loc[df['Postcode']=='M9A']

Unnamed: 0,Postcode,Borough,Neighbourhood
28,M9A,Queen's Park,Not assigned


In [8]:
#replacing Neighbourhood value 'Not assigned' with Borough value
df.loc[(df.Postcode == 'M9A'),'Neighbourhood']=df['Borough']

### Replacing not assigned Neighbourhood 

In [9]:
#checking if the Neighbourhood value 'Not assigned' has updated with Borough value for Postcode M9A 
df.loc[df['Postcode']=='M9A']

Unnamed: 0,Postcode,Borough,Neighbourhood
28,M9A,Queen's Park,Queen's Park


# Result

In [10]:
#shape of the dataframe
df.shape

(103, 3)

### Renaming the column

In [11]:
df2=df.rename(columns={'Postcode': 'Postal Code'})

In [12]:
df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
1,M1S,Scarborough,Agincourt
2,M1B,Scarborough,"Rouge,Malvern"
3,M5G,Downtown Toronto,Central Bay Street
4,M3K,North York,"CFB Toronto,Downsview East"


# Longitude & Latitude

In [13]:
path="http://cocl.us/Geospatial_data"
df3=pd.read_csv(path)

In [14]:
df3.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging Dataframes

In [15]:
df_merged=pd.merge(df2,df3, on='Postal Code')

In [16]:
df_merged.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
1,M1S,Scarborough,Agincourt,43.7942,-79.262029
2,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
4,M3K,North York,"CFB Toronto,Downsview East",43.737473,-79.464763
5,M6G,Downtown Toronto,Christie,43.669542,-79.422564
6,M3H,North York,"Bathurst Manor,Downsview North,Wilson Heights",43.754328,-79.442259
7,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
8,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.38228
9,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw...",43.628841,-79.520999


In [18]:
import numpy as np
import pandas as pd
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [19]:
address = 'Toronto City, Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [20]:
# create map of Toronto using latitude and longitude values
toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto)  
    
toronto

In [22]:
toronto_data = df_merged[df_merged['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M6G,Downtown Toronto,Christie,43.669542,-79.422564
2,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
3,M5X,Downtown Toronto,"First Canadian Place,Underground city",43.648429,-79.38228
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937
5,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
6,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
7,M4P,Central Toronto,Davisville North,43.712751,-79.390197
8,M5J,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",43.640816,-79.381752
9,M4V,Central Toronto,"Deer Park,Forest Hill SE,Rathnelly,South Hill,...",43.686412,-79.400049


In [23]:
# set number of clusters
kclusters = 4

toronto_clustering = toronto_data.drop(["Neighbourhood"], axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering)

ValueError: could not convert string to float: 'Central Toronto'

In [22]:
kmeans.labels_[0:10] 

array([0, 1, 0, 0, 0, 2, 0, 0, 0, 0], dtype=int32)

In [26]:
# add clustering labels
toronto_clustering.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_clustering = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_data = toronto_data.join(toronto_clustering.set_index('Postal Code'), on='Neighbourhood')

toronto_data.head() # check the last columns!

ValueError: columns overlap but no suffix specified: Index(['Cluster Labels', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'], dtype='object')

In [25]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_clustering['Latitude'], toronto_clustering['Longitude'], toronto_clustering['Neighbourhood'], toronto_clustering['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

KeyError: 'Cluster Labels'