# Assignment to Explore Neighborhoods in Toronto

## Importing all necessary libraries

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from project_lib import Project

import json # library to handle JSON files

!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

print('Libraries imported.')



Collecting folium
  Downloading folium-0.12.0-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 6.3 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.0
Libraries imported.


## Web scraping to obtain required dataframe
We save the URL of the website that we need to scrape and use pandas for scraping.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url_tables = pd.read_html(url)
df = url_tables[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Cleaning up the dataframe
We need to perform some modifications to clean up this dataframe, such as get rid of entries where the Boroughs are 'Not assigned' or group rows where Postal Codes are the same.

In [3]:
df.columns = ['PostalCode', 'Borough', 'Neighborhood']
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)
df.groupby('PostalCode')['Neighborhood'].agg(','.join)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


We now verify that all neighboorhoods have assigned values. The next script indeed produces a 'False'.

In [4]:
df['Neighborhood'].isin(['Not assigned']).any()

False

Finally, we verify the shape of our dataframe using .shape.

In [5]:
df.shape


(103, 3)

## Finding the latitude and longitude coordinates of the postal codes
We will use the csv file to find the latitude and longitude of the postal codes and add it to our dataframe

In [6]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
df_lat_long.columns = ['PostalCode', 'Borough', 'Neighborhood']
df_final=pd.merge(df,df_lat_long, how='left', left_on="PostalCode", right_on="PostalCode")
df_final.columns = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494





## Exploring Toronto

The dataframe is now in the desired format. We can use this dataframe to explore the different neighborhoods in Toronto.




In [8]:
# First we create a map of Toronto using folium
map_toronto = folium.Map(location=[43.5032, -79.3832], zoom_start=10)

# add Boroughs to the map
for lat, lon, boro, hood in zip(df_final['Latitude'], df_final['Longitude'], df_final['Borough'], df_final['Neighborhood']):
    label = '{}:{}'.format(boro,hood)
    folium.CircleMarker([lat,lon],
                        radius=5,
                        popup=label,
                        color='blue',
                        fill=True,
                        fill_color='#3186cc',
                        fill_opacity=0.7,
                        parse_html=False).add_to(map_toronto)

map_toronto

### Focus on only Downtown Toronto

For the time being, the focus in exploring the city of Toronto will be on its downtown area. Therefore, we can extract all neighborhoods that pertain to Downtown Toronot from our dataframe. Since some postal codes have multiple neighborhoods, as a first assumption we will only look at the first neighborhood that appears on each postal code.


In [66]:
# Splice df_final to obtain a dataframe only for the Downtown Toronto neighborhoods
dtown_hoods = df_final.loc[df['Borough']=='Downtown Toronto'].reset_index(drop=True).drop(['PostalCode','Borough'],axis=1)

# Keep only the first neighborhood for cases where the same latitude and longitude refer to multiple neighborhoods
dtown_hoods['Neighborhood'] = dtown_hoods['Neighborhood'].str.split(', ').str[0]

# latitude and longitude of Downtown Toronto
dtown_latlon = [43.6548, -79.3883]

# Create map of downtown Toronto
map_downtown = folium.Map(location=dtown_latlon, zoom_start=13)

for hood, lat, lon in zip(dtown_hoods['Neighborhood'],dtown_hoods['Latitude'],dtown_hoods['Longitude']):
    label = hood
    folium.CircleMarker([lat,lon],
                        radius=5,
                        popup=label,
                        color='blue',
                        fill=True,
                        fill_color='#3186cc',
                        fill_opacity=0.7,
                        parse_html=False).add_to(map_downtown)

# map_downtown
map_downtown


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Regent Park,43.65426,-79.360636
1,Queen's Park,43.662301,-79.389494
2,Garden District,43.657162,-79.378937
3,St. James Town,43.651494,-79.375418
4,Berczy Park,43.644771,-79.373306
5,Central Bay Street,43.657952,-79.387383
6,Christie,43.669542,-79.422564
7,Richmond,43.650571,-79.384568
8,Harbourfront East,43.640816,-79.381752
9,Toronto Dominion Centre,43.647177,-79.381576


## Define my Foursquare credentials

In [10]:
CLIENT_ID = 'T5V0I1ANOTP2LLASIJL5EY0RTEAWWFFKGC4PECNZM5VUETB2' # your Foursquare ID
CLIENT_SECRET = '5WXDDYM1WLSOAOSGSTQSFQ1NENKKMV14IY4TYZAEIJDGWQYE' # your Foursquare Secret
VERSION = '20200107' # Foursquare API version - yesterday's date: 07 January 2020
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: T5V0I1ANOTP2LLASIJL5EY0RTEAWWFFKGC4PECNZM5VUETB2
CLIENT_SECRET:5WXDDYM1WLSOAOSGSTQSFQ1NENKKMV14IY4TYZAEIJDGWQYE


## Finding the top 100 venues in 500 meters in each downtown neighborhood

To do so we create a for loop to go through each neighborhood from our dataframe to obtain the needed latitude and logitudes to make the proper Foursquare API call. We create a function for sake of simplicity to iterate through all neighborhoods.

In [11]:
def GetVenueInfo(hood_name, hood_lat, hood_lon):

    LIMIT = 100 # limit of number of venues returned by Foursquare API
    radius = 500 # define radius
    
    venues_list=[]
    for name, lat, lon in zip(hood_name, hood_lat, hood_lon):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lon, 
            radius, 
            LIMIT)
        
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lon, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    venue_results = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    venue_results.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']


    print('Done getting all venues in downtown neighborhoods')
    return(venue_results, venues_list)


In [12]:
dtown_venues, dtown_venue_list = GetVenueInfo(dtown_hoods['Neighborhood'], dtown_hoods['Latitude'], dtown_hoods['Longitude'])

Regent Park
Queen's Park
Garden District
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond
Harbourfront East
Toronto Dominion Centre
Commerce Court
University of Toronto
Kensington Market
CN Tower
Rosedale
Stn A PO Boxes
St. James Town
First Canadian Place
Church and Wellesley
Done getting all venues in downtown neighborhoods


In [13]:
dtown_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Regent Park,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Regent Park,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,Regent Park,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
4,Regent Park,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


To further explore the downtown area, we can group the above dataframe in terms of neighborhood to get a sense of the number of venues obtained for each neighborhood.

In [17]:
dtown_venues.groupby('Neighborhood').count()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
CN Tower,15,15,15,15,15,15
Central Bay Street,61,61,61,61,61,61
Christie,16,16,16,16,16,16
Church and Wellesley,79,79,79,79,79,79
Commerce Court,100,100,100,100,100,100
First Canadian Place,100,100,100,100,100,100
Garden District,100,100,100,100,100,100
Harbourfront East,100,100,100,100,100,100
Kensington Market,64,64,64,64,64,64


We can also check the total number of unique category of venues for the entire downtown of Toronto.

In [16]:
print('There are {} uniques categories.'.format(len(dtown_venues['Venue Category'].unique())))

There are 209 uniques categories.


In order to cluster the neighborhood in the downtown area of Toronto by most common venue category, we first need to transform the our dataframe. We perform a one-hot encoding with regards to the category of the venue for each venue obtained.

In [47]:
# one hot encoding
dtown_onehot = pd.get_dummies(dtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column as first column in one hot encoded dataframe
dtown_onehot.drop(['Neighborhood'], axis=1, inplace=True)
dtown_onehot.insert(0,'Neighborhood', dtown_venues['Neighborhood'])

dtown_onehot.head()


Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


To assess the most common venue type in each neighborhood, we can group the dataframe and calculate the mean occurence of each venue type.

In [48]:
dtown_grouped = dtown_onehot.groupby('Neighborhood').mean().reset_index()
dtown_grouped

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017544,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
1,CN Tower,0.066667,0.066667,0.133333,0.2,0.133333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.016393,0.0,0.016393
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.012658,0.0,0.0,0.0,...,0.012658,0.012658,0.0,0.0,0.0,0.0,0.012658,0.0,0.0,0.025316
5,Commerce Court,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
6,First Canadian Place,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01,...,0.01,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0
7,Garden District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.02,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0
8,Harbourfront East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.01,...,0.01,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0
9,Kensington Market,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.046875,0.0,0.046875,0.015625,0.0,0.0


For a more visual representation, we can print out the top 5 venues in each of the neighborhoods.

In [62]:
num_top_venues = 5

for hood in dtown_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = dtown_grouped[dtown_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
            venue  freq
0     Coffee Shop  0.09
1    Cocktail Bar  0.05
2  Farmers Market  0.04
3        Beer Bar  0.04
4      Restaurant  0.04


----CN Tower----
              venue  freq
0   Airport Service  0.20
1    Airport Lounge  0.13
2  Airport Terminal  0.13
3           Airport  0.07
4     Boat or Ferry  0.07


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.20
1                Café  0.05
2  Italian Restaurant  0.05
3      Sandwich Place  0.05
4     Bubble Tea Shop  0.03


----Christie----
                venue  freq
0       Grocery Store  0.25
1                Café  0.19
2                Park  0.12
3  Italian Restaurant  0.06
4          Restaurant  0.06


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.08
1  Japanese Restaurant  0.06
2     Sushi Restaurant  0.06
3           Restaurant  0.04
4              Gay Bar  0.04


----Commerce Court----
                venue  freq
0         Coffee Sh

Create a function to help organize the above output into a pandas dataframe format.

In [63]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [64]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = dtown_grouped['Neighborhood']

for ind in np.arange(dtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Beer Bar,Farmers Market,Bakery,Cheese Shop,Restaurant,Japanese Restaurant,Hotel
1,CN Tower,Airport Service,Airport Lounge,Airport Terminal,Airport,Boutique,Plane,Rental Car Location,Sculpture Garden,Boat or Ferry,Harbor / Marina
2,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Salad Place,Burger Joint,Bubble Tea Shop,Thai Restaurant,Portuguese Restaurant,Poke Place
3,Christie,Grocery Store,Café,Park,Baby Store,Candy Store,Italian Restaurant,Coffee Shop,Nightclub,Restaurant,Athletics & Sports
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Fast Food Restaurant,Hotel,Pub,Café,Men's Store


## Clustering the Neighborhoods with regards to the most common venue category

We do so using k-means clustering from the scikit-learn library.

In [65]:
# set number of clusters
kclusters = 5

dtown_grouped_clustering = dtown_grouped.drop('Neighborhood', 1)


# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 3, 4, 1, 0, 0, 0, 0, 0, 0, 4, 4, 0, 2, 0, 0, 0, 0], dtype=int32)

We store this clustering information in a new dataframe along with the most common venue categories in each neighborhood

In [68]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

dtown_merged = dtown_hoods

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
dtown_merged = dtown_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


ValueError: cannot insert Cluster Labels, already exists

In [69]:
dtown_merged

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Regent Park,43.65426,-79.360636,4,Coffee Shop,Café,Park,Bakery,Breakfast Spot,Pub,Theater,Farmers Market,French Restaurant,Wine Shop
1,Queen's Park,43.662301,-79.389494,4,Coffee Shop,Sushi Restaurant,Yoga Studio,Diner,Beer Bar,Italian Restaurant,Japanese Restaurant,Music Venue,Sandwich Place,Distribution Center
2,Garden District,43.657162,-79.378937,0,Coffee Shop,Clothing Store,Cosmetics Shop,Japanese Restaurant,Café,Middle Eastern Restaurant,Hotel,Bubble Tea Shop,Diner,Theater
3,St. James Town,43.651494,-79.375418,0,Coffee Shop,Café,Restaurant,Gastropub,Park,Italian Restaurant,American Restaurant,Cocktail Bar,Bakery,Farmers Market
4,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Seafood Restaurant,Beer Bar,Farmers Market,Bakery,Cheese Shop,Restaurant,Japanese Restaurant,Hotel
5,Central Bay Street,43.657952,-79.387383,4,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Salad Place,Burger Joint,Bubble Tea Shop,Thai Restaurant,Portuguese Restaurant,Poke Place
6,Christie,43.669542,-79.422564,1,Grocery Store,Café,Park,Baby Store,Candy Store,Italian Restaurant,Coffee Shop,Nightclub,Restaurant,Athletics & Sports
7,Richmond,43.650571,-79.384568,0,Coffee Shop,Café,Restaurant,Clothing Store,Deli / Bodega,Gym,Thai Restaurant,Hotel,American Restaurant,Salad Place
8,Harbourfront East,43.640816,-79.381752,0,Coffee Shop,Aquarium,Café,Hotel,Italian Restaurant,Fried Chicken Joint,Brewery,Restaurant,Scenic Lookout,Baseball Stadium
9,Toronto Dominion Centre,43.647177,-79.381576,0,Coffee Shop,Hotel,Café,Italian Restaurant,Salad Place,Seafood Restaurant,American Restaurant,Restaurant,Japanese Restaurant,Steakhouse


### Visualizing the clustering
We use folium to view these neighborhoods and add colors to match the clustering conducted above.

In [72]:
# create map
map_clusters = folium.Map(location=dtown_latlon, zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dtown_merged['Latitude'], dtown_merged['Longitude'], dtown_merged['Neighborhood'], dtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters