# Segmenting and Clustering Neighborhoods in Toronto Lab

_By Sean Morris_

#### In this assignment, I will create a pandas dataframe of Toronto neighborhood data and then use it to explore neighborhoods in Toronto

## PART ONE: DOWNLOADING DATA INTO A PANDAS DATAFRAME

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
!pip install lxml



In [2]:
wikipedia_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' # URL of Wikipedia for Canada postal codes list

In [3]:
df_list = pd.read_html(wikipedia_url) # To read HTML

In [4]:
canada_df = pd.DataFrame (df_list[0]) # To create dataframe from the HTML file
canada_df.drop(canada_df[canada_df['Borough'] == 'Not assigned'].index, inplace = True) # To remove rows with 'Not Assigned' in 'Borough' there are no neighborhoods that have a neighborhood but a 'Not Assigned' Borough
canada_df_filtered = canada_df.reset_index(drop=True) #To reset dataframe index
canada_df_filtered.head() # To display filtered dataframe

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
canada_neighborhoods = canada_df_filtered.sort_values('Postal Code')

In [6]:
canada_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
6,M1B,Scarborough,"Malvern, Rouge"
12,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,M1G,Scarborough,Woburn
26,M1H,Scarborough,Cedarbrae


In [7]:
canada_neighborhoods.shape

(103, 3)

# PART TWO: GETTING GEOGRAPHIC DATA

In [8]:
GeographicURL = 'http://cocl.us/Geospatial_data'

In [9]:
df_geogs = pd.read_csv(GeographicURL) # To read CSV file

In [10]:
df_geogs2 = df_geogs.sort_values('Postal Code')

In [11]:
df_geogs2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## MERGING THE TWO DATASETS

In [12]:
merged_df = pd.merge(canada_neighborhoods, df_geogs2, 
                     left_on = 'Postal Code', 
                     right_on = 'Postal Code', 
                     how='left')

In [13]:
merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# PART THREE: CREATING A CLUSTERED MAP

### Make a new dataframe that contains only those Boroughs that contain the word "Toronto"

In [14]:
Toronto_Neighborhoods_Only = merged_df[merged_df["Borough"].str.contains('Toronto')]

In [15]:
Toronto_Neighborhoods_Only.shape

(39, 5)

In [16]:
Toronto_Neighborhoods_Only['Borough'].value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [17]:
Toronto_Neighborhoods_Only.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [18]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [19]:
Toronto_Clustering = Toronto_Neighborhoods_Only.drop('Neighbourhood', 1)
Toronto_Clustering = Toronto_Clustering.drop('Postal Code',1)
Toronto_Clustering= Toronto_Clustering.drop('Borough',1)
Toronto_Clustering.head()

Unnamed: 0,Latitude,Longitude
37,43.676357,-79.293031
41,43.679557,-79.352188
42,43.668999,-79.315572
43,43.659526,-79.340923
44,43.72802,-79.38879


In [28]:
# set number of clusters
kclusters = 4

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_Clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 3, 2, 2, 2, 2, 2, 2], dtype=int32)

In [29]:
# add clustering labels
Toronto_Neighborhoods_Only.insert(0, 'Cluster Labels', kmeans.labels_)

ValueError: cannot insert Cluster Labels, already exists

In [23]:
from geopy.geocoders import Nominatim

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [25]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [26]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


In [27]:

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_Neighborhoods_Only['Latitude'], Toronto_Neighborhoods_Only['Longitude'], Toronto_Neighborhoods_Only['Neighbourhood'], Toronto_Neighborhoods_Only['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters