# Segmenting and Clustering Neighborhoods in Toronto #

## Question 1 ##

- **Build a dataframe of the postal code of each neighborhood with borough name and neighborhood name in Toronto.**

**Importing librairies**

In [11]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

**Scraping data from Wikipedia page into a Dataframe**

In [12]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_text = requests.get(url).text
soup = BeautifulSoup(html_text, 'html.parser')

In [13]:
#Creating empty dataframe with columns
column_names = ['PostalCode','Borough','Neighborhood']
toronto_neighbours = pd.DataFrame(columns = column_names)

postal_code = ""
borough = ""
neighborhood = ""

# Get the first table which contains all the postcode, borough, neighborhood information
table = soup.find('table')

# Put the table information in the dataframe
for td in table.findAll('td'):
    postal_code = td.b.text
    text = td.span.text
    #If there is a neighbourhood, then the dataframe is filled
    if(text != 'Not assigned'):
        borough = text.split('(')[0]
        neighborhood = ((((text.split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        toronto_neighbours = toronto_neighbours.append({'PostalCode': postal_code,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

#We replace some borough values with other borough values    
toronto_neighbours['Borough']=toronto_neighbours['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
toronto_neighbours.head()



Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [14]:
print("Dataframe shape: {}".format(toronto_neighbours.shape))

Dataframe shape: (103, 3)


## Question 2 ##

- **Get the latitude and the longitude coordinates of each neighborhood and include them to the previous dataframe**

**Installing geocoder library**

In [15]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


**Importing geocoder library**

In [16]:
import geocoder

**Importing geospatial coordinates from csv file**

In [17]:
geospatial_coords = pd.read_csv('Geospatial_Coordinates.csv')
geospatial_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Joining geospatial coordinates with dataframe composed of postal codes, borough and neighborhood info**

In [18]:
merged_data = toronto_neighbours.join(geospatial_coords.set_index('Postal Code'), on='PostalCode')
merged_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [19]:
merged_data.shape

(103, 5)

**Verification that the dataframe is like the required dataframe**

In [20]:
post_codes_test = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M', 'M1R', 'M9V', 'M9L',
                   'M5V', 'M1B', 'M5A']


column_names_test = ['PostalCode','Borough','Neighborhood', 'Latitude', 'Longitude']
test_df = pd.DataFrame(columns = column_names_test)

for postcode in post_codes_test:
    test_df = test_df.append(merged_data[merged_data["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


## Question 3 ##

- **Explore and cluster the neighborhoods in Toronto**

**Installing geopy and folium libraries**

In [21]:
!pip install geopy
!pip install folium



**Importing librairies**

In [22]:
from geopy.geocoders import Nominatim
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

**Using geopy library to get the latitude and longitude values of Toronto**

In [23]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The coordinates of Toronto are 43.6534817, -79.3839347.


**Creating a map of Toronto with neighbourhoods**

In [24]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# adding markers to map
for latitude, longitude, borough, neighborhood in zip(merged_data['Latitude'], merged_data['Longitude'], merged_data['Borough'], merged_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='red',
        fill=True
        ).add_to(map_toronto)  
    
map_toronto

**Filtering boroughs that contain the word Toronto**

In [25]:
toronto_df = merged_data[merged_data.Borough.str.contains("Toronto")].reset_index(drop=True)
print(toronto_df.shape)
toronto_df.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


**Showing map of Toronto with neighborhood in boroughs that contain the word Toronto**

In [26]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
# adding markers to map
for lat, lng, label in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=label, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7,parse_html=False).add_to(map_toronto)  
map_toronto

**Credentials for the Foursquare API to explore the neighborhoods**

In [27]:
CLIENT_ID = 'ZF1TM4YOQHTNZTPQMRJJOS2VMWLAGYXUNWOGKFYBFLNGJHUI' 
CLIENT_SECRET = '3SILTMMAWVOVQ0P4PTFWXALNFQJ52KHHYPQ2AZZNCTWONJ20'
VERSION = '20180605'
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZF1TM4YOQHTNZTPQMRJJOS2VMWLAGYXUNWOGKFYBFLNGJHUI
CLIENT_SECRET:3SILTMMAWVOVQ0P4PTFWXALNFQJ52KHHYPQ2AZZNCTWONJ20


**Creating a function to get all venues that are within a radius of 500 meters for each neighborhood**

In [28]:
def getNearbyVenues(posts, names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for post, name, lat, lng in zip(posts, names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return relevant information for each venue
        venues_list.append([(
            post,
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode',
                             'Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
venues_toronto = getNearbyVenues(merged_data['PostalCode'], merged_data['Neighborhood'], merged_data['Latitude'], merged_data['Longitude'])

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills North
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview East
The Danforth

In [None]:
venues_toronto.shape

In [None]:
venues_toronto.head()

**Showing the venues based on Neighborhood**

In [None]:
venues_toronto.groupby('Neighborhood').head()

**Showing the number of unique venue categories**

In [None]:
print('There are {} uniques categories.'.format(len(venues_toronto['Venue Category'].unique())))
venues_toronto['Venue Category'].unique()[:20]

**Analyse each neighborhood**

In [None]:
# One-hot encoding
venues_toronto_onehot = pd.get_dummies(venues_toronto[['Venue Category']], prefix="", prefix_sep="")

# Add neighborhood column back to dataframe
venues_toronto_onehot['Neighborhood'] = venues_toronto['Neighborhood'] 

# Move neighborhood column to the first column
fixed_columns = [venues_toronto_onehot.columns[-1]] + list(venues_toronto_onehot.columns[:-1])
venues_toronto_onehot = venues_toronto_onehot[fixed_columns]

venues_toronto_onehot.head()

In [None]:
venues_toronto_onehot.shape

**We group rows by neighborhood and we take the mean of the frequency of occurrence of each category**

In [None]:
venues_toronto_grouped = venues_toronto_onehot.groupby('Neighborhood').mean().reset_index()
venues_toronto_grouped

In [None]:
venues_toronto_grouped.shape

**Let's print each neighborhood along with the top 5 most common venues**

In [None]:
num_top_venues = 5

for venue in venues_toronto_grouped['Neighborhood']:
    print("----"+ venue +"----")
    temp = venues_toronto_grouped[venues_toronto_grouped['Neighborhood'] == venue].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

**We write a function to sort the venues in descending order**

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

**We create a DataFrame that contains the top 10 venues for each neighborhood**

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)

# Create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = venues_toronto_grouped['Neighborhood']

for ind in np.arange(venues_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(venues_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

**Importing librairies for clustering**

In [None]:
from sklearn.cluster import KMeans
import sklearn.cluster.k_means_

**Let's run k-means to cluster the neighborhood into 5 clusters.**

In [None]:
knum_clusters = 5
toronto_grouped_clustering = venues_toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=knum_clusters, random_state=1).fit(toronto_grouped_clustering)
print(kmeans.labels_[0:10])
print(len(kmeans.labels_))

**Create a complete dataframe that includes the cluster, postal code, borough, neighborhood and the top 10 venues for each neighborhood**

In [None]:
# Add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = merged_data
# Join the 2 dataframes so that the latitudes and longitudes are added to the dataframe containing the clusters 
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

**Let's visualize the resulting clusters**

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(knum_clusters)
ys = [i+x+(i*x)**2 for i in range(knum_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
# adding markers to map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'],kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon], radius=5, popup=label, color=rainbow[cluster-1], fill=True, fill_color=rainbow[cluster-1], fill_opacity=0.7).add_to(map_clusters)
map_clusters

**Cluster 1**

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

**Cluster 2**

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

**Cluster 3**

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

**Cluster 4**

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

**Cluster 5**

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]