# Capstone project: exploring Toronto neighborhoods
## Importing libraries, data

In [25]:
!pip install beautifulsoup4 lxml html5lib requests
import os
from io import StringIO

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import folium



In [7]:
# Download the Wikipedia page that lists the "M" postal codes.
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(URL, timeout=30)
# Stop here on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
content = response.content
soup = BeautifulSoup(content, 'lxml')

# The first <table> element of the page is the one parsed into a DataFrame.
table = soup.find_all('table')[0]
# read_html is given a file-like object: passing the markup as a literal
# string is deprecated in recent pandas releases.
df_0 = pd.read_html(StringIO(str(table)))
# read_html returns a list of DataFrames; the single parsed table is item 0.
df_1 = df_0[0]

df_1.head()

Unnamed: 0,Postal Code,District,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [10]:
# Keep only the postal codes that have a district assigned. The copy makes
# df_2 independent of df_1, so the fill-in below cannot touch df_1.
df_2 = df_1[df_1['District'] != 'Not assigned'].copy()

# A row that has a district but no neighbourhood takes the district name as
# its neighbourhood. (The earlier loop compared with `==` on a row copy from
# iterrows(), so it never changed the frame.)
unnamed = df_2['Neighbourhood'] == 'Not assigned'
df_2.loc[unnamed, 'Neighbourhood'] = df_2.loc[unnamed, 'District']

In [13]:
# Collapse rows that share a postal code and district into a single row,
# comma-joining the values of every other column.
group_keys = ['Postal Code', 'District']
df_2g = df_2.groupby(group_keys, as_index=False).agg(lambda names: ','.join(names))

# Postal codes as a plain list, in the order they appear in df_2g.
postal_codes = df_2g['Postal Code'].tolist()

df_2g.head()

Unnamed: 0,Postal Code,District,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Getting the coordinates of the postal codes and doing some cleaning

In [14]:
# CSV with one latitude/longitude pair per postal code.
file = 'https://cocl.us/Geospatial_data'
coordinates_df = pd.read_csv(file)

# Arrange the coordinates in the same order as the postal codes of df_2g.
# A single left merge replaces the row-by-row DataFrame.append loop
# (DataFrame.append was removed in pandas 2.0). The left merge keeps the order
# of `postal_codes`, and a code missing from the CSV yields a NaN row instead
# of being skipped, so row positions stay aligned with df_2g.
coord_df = pd.DataFrame({'Postal Code': postal_codes}).merge(
    coordinates_df, on='Postal Code', how='left')

# Same column layout as before: coordinates first, postal code last.
cord_df = coord_df[['Latitude', 'Longitude', 'Postal Code']]

cord_df.head()

Unnamed: 0,Latitude,Longitude,Postal Code
0,43.806686,-79.194353,M1B
1,43.784535,-79.160497,M1C
2,43.763573,-79.188711,M1E
3,43.770992,-79.216917,M1G
4,43.773136,-79.239476,M1H


In [15]:
# Copy the two coordinate columns onto df_2g. The assignment aligns rows on
# the index, so it relies on cord_df being in the same row order as df_2g.
coord_columns = ['Latitude', 'Longitude']
lat_lon_df = cord_df.loc[:, coord_columns]
df_2g[coord_columns] = lat_lon_df
df_2g.head()

Unnamed: 0,Postal Code,District,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [20]:
district_names = list(df_2g['District'].unique())

# Districts whose name contains "toronto" (case-insensitive), in order of
# first appearance in df_2g.
toronto_district = [district for district in district_names
                    if 'toronto' in district.lower()]

# Rows belonging to those districts, renumbered from 0.
tor_df = df_2g[df_2g['District'].isin(toronto_district)].reset_index(drop=True)

# Show the first and last five rows together. pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
pd.concat([tor_df.head(), tor_df.tail()])

Unnamed: 0,Postal Code,District,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
34,M6P,West Toronto,"High Park, The Junction South",43.661608,-79.464763
35,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325
36,M6S,West Toronto,"Runnymede, Swansea",43.651571,-79.48445
37,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
38,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558


In [23]:
# Map centre: the mean latitude and mean longitude of the rows in tor_df.
tor_lat, tor_lon = (tor_df[axis].mean() for axis in ('Latitude', 'Longitude'))
print('Toronto has a latitude:{} and longitude:{}'.format(tor_lat, tor_lon))

Toronto has a latitude:43.66713498717948 and longitude:-79.38987324871795


## Visualising the above data using Folium

In [28]:
# Reference coordinates for Toronto. They are not used by this cell (the map
# is centred on the tor_lat / tor_lon means computed above). The longitude is
# negative, like every longitude in tor_df; the positive value used before
# had the wrong sign.
latitude = 43.653963
longitude = -79.387207

toronto_map = folium.Map(location=[tor_lat, tor_lon], zoom_start=12)

# Add one circle marker per row of tor_df, with a "neighbourhood, district" popup.
for lat, lng, district, neighbourhood in zip(tor_df['Latitude'],
                                             tor_df['Longitude'],
                                             tor_df['District'],
                                             tor_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, district)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)

toronto_map

In [29]:
# Per district, count the rows of tor_df that have a Neighbourhood value
district_counts = tor_df.groupby('District')['Neighbourhood'].count()
print(district_counts)

District
Central Toronto      9
Downtown Toronto    19
East Toronto         5
West Toronto         6
Name: Neighbourhood, dtype: int64


## Accessing the Foursquare API and getting venue data

In [34]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # Foursquare Secret code
if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

VERSION = '20200605' # Foursquare API version
LIMIT = 200 # Max venues requested from the API per call
RADIUS = 500 # Radius

In [37]:
def get_nearby_venues(names, latitudes, longitudes, radius=500):
    """Fetch the venues Foursquare returns around each given location.

    One request is sent to the ``venues/explore`` endpoint per location,
    using the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names : iterable
        Label of each location; printed as progress and copied to every
        venue row found for that location.
    latitudes, longitudes : iterable
        Coordinates of each location, iterated in step with ``names``.
    radius : optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue is
        returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    # Spelled "Neighborhood" to match the column names the later cells use
    # (groupby('Neighborhood'), set_index("Neighborhood"), ...).
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # Keep only the relevant fields of each nearby venue.
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category gets a missing value instead
                # of raising IndexError.
                categories[0]['name'] if categories else None))

    # Passing the column names here also works when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [36]:
# Fetch the venues around every row of tor_df
toronto_venues = get_nearby_venues(
    tor_df['Neighbourhood'],
    tor_df['Latitude'],
    tor_df['Longitude'],
)

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High 

In [38]:
# Size of the venue table as (rows, columns)
toronto_venues.shape

(1640, 7)

In [39]:
# Count the non-null values of every column per neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",18,18,18,18,18,18
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",15,15,15,15,15,15
Central Bay Street,61,61,61,61,61,61
Christie,16,16,16,16,16,16
Church and Wellesley,78,78,78,78,78,78
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,33,33,33,33,33,33
Davisville North,9,9,9,9,9,9


## Preparing data for clustering

In [42]:
# One indicator column per venue category, named after the bare category
onehot_df = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighborhood of each venue row
onehot_df['Neighborhoods'] = toronto_venues['Neighborhood']

# Rotate the column order so that the last column becomes the first one
fixed_columns = onehot_df.columns[-1:].tolist() + onehot_df.columns[:-1].tolist()
onehot_df = onehot_df[fixed_columns]

print(onehot_df.shape)
onehot_df.head()

(1640, 235)


Unnamed: 0,Neighborhoods,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Average the indicator columns per neighborhood: each value is the share of
# that neighborhood's venue rows that fall in the category
grouped_df = onehot_df.groupby("Neighborhoods", as_index=False).mean()

print(grouped_df.shape)
grouped_df

(39, 235)


Unnamed: 0,Neighborhoods,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.066667,0.066667,0.066667,0.133333,0.133333,0.133333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.016393,0.0,0.016393
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.012821,0.0,0.0,...,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025641
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Peek at (up to) the first 100 distinct venue categories
toronto_venues['Venue Category'].unique()[:100]

array(['Trail', 'Health Food Store', 'Pub', 'Neighborhood',
       'Asian Restaurant', 'Cosmetics Shop', 'Ice Cream Shop',
       'Greek Restaurant', 'Italian Restaurant', 'Brewery', 'Juice Bar',
       'Yoga Studio', 'Fruit & Vegetable Store', 'Dessert Shop',
       'Pizza Place', 'Restaurant', 'Bookstore', 'Grocery Store',
       'Furniture / Home Store', 'Spa', 'Bubble Tea Shop',
       'Caribbean Restaurant', 'Coffee Shop', 'Bakery',
       'Indian Restaurant', 'Café', 'Frozen Yogurt Shop', 'Lounge',
       'American Restaurant', 'Liquor Store', 'Sushi Restaurant', 'Gym',
       'Fish & Chips Shop', 'Fast Food Restaurant', 'Park',
       'Burrito Place', 'Pet Store', 'Steakhouse', 'Movie Theater',
       'Sandwich Place', 'Food & Drink Shop', 'Fish Market',
       'Seafood Restaurant', 'Gay Bar', 'Cheese Shop',
       'Middle Eastern Restaurant', 'Comfort Food Restaurant',
       'Stationery Store', 'Thai Restaurant', 'Coworking Space',
       'Wine Bar', 'Latin American Restaurant

## Selecting coffee shops for the clustering analysis

In [58]:
# Check whether at least one fetched venue has the category "Coffee Shop"
"Coffee Shop" in toronto_venues['Venue Category'].unique()

True

In [47]:
# Number of neighborhoods with a non-zero coffee-shop share
int((grouped_df["Coffee Shop"] > 0).sum())

29

In [48]:
# Keep only the neighborhood name and its coffee-shop share
coffee_columns = ["Neighborhoods", "Coffee Shop"]
coffee_df = grouped_df.loc[:, coffee_columns]
coffee_df.head(10)

Unnamed: 0,Neighborhoods,Coffee Shop
0,Berczy Park,0.087719
1,"Brockton, Parkdale Village, Exhibition Place",0.083333
2,"Business reply mail Processing Centre, South C...",0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.066667
4,Central Bay Street,0.180328
5,Christie,0.0625
6,Church and Wellesley,0.089744
7,"Commerce Court, Victoria Hotel",0.13
8,Davisville,0.060606
9,Davisville North,0.0


## Clustering the prepared data

In [49]:
from sklearn.cluster import KMeans

k = 3  # number of clusters

# Cluster on the coffee-shop share only; the neighborhood name is a label,
# not a feature. `columns=` replaces the positional axis argument, which
# pandas 2.0 no longer accepts.
clustering_df = coffee_df.drop(columns=["Neighborhoods"])

# Run k-means clustering. n_init is pinned so the number of restarts does not
# depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=k, random_state=1, n_init=10)
# Only the fitted labels are needed, so fit() replaces fit_transform(),
# whose return value was discarded.
kmeans.fit(clustering_df)

# Check the cluster labels generated for the first 20 rows
kmeans.labels_[0:20]

array([2, 2, 1, 2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2],
      dtype=int32)

In [50]:
# adding cluster labels to the list
# New frame: coffee_df plus the k-means label of each row (coffee_df itself
# is left unchanged)
merged_df = coffee_df.assign(**{"Cluster Labels": kmeans.labels_})

In [51]:
# Use the same key name as toronto_venues ("Neighborhood") for the join below
merged_df = merged_df.rename(columns={"Neighborhoods": "Neighborhood"})
merged_df.head(5)

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels
0,Berczy Park,0.087719,2
1,"Brockton, Parkdale Village, Exhibition Place",0.083333,2
2,"Business reply mail Processing Centre, South C...",0.0,1
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.066667,2
4,Central Bay Street,0.180328,0


In [52]:
# Attach every venue row to its neighborhood's cluster row; the result has
# one row per venue and keeps the index of the neighborhood row
venues_by_neighborhood = toronto_venues.set_index("Neighborhood")
merged_df = merged_df.join(venues_by_neighborhood, on="Neighborhood")

print(merged_df.shape)
merged_df.head()

(1640, 9)


Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Berczy Park,0.087719,2,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
0,Berczy Park,0.087719,2,43.644771,-79.373306,The Keg Steakhouse + Bar - Esplanade,43.646712,-79.374768,Restaurant
0,Berczy Park,0.087719,2,43.644771,-79.373306,Fresh On Front,43.647815,-79.374453,Vegetarian / Vegan Restaurant
0,Berczy Park,0.087719,2,43.644771,-79.373306,Meridian Hall,43.646292,-79.376022,Concert Hall
0,Berczy Park,0.087719,2,43.644771,-79.373306,Biff's Bistro,43.647085,-79.376342,French Restaurant


In [53]:
# Order the rows by cluster label
merged_df = merged_df.sort_values("Cluster Labels")
merged_df.head()

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
23,"Queen's Park, Ontario Provincial Government",0.228571,0,43.662301,-79.389494,Starbucks,43.659456,-79.390411,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Poke Guys,43.654895,-79.385052,Poke Place
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Textile Museum of Canada,43.654396,-79.3865,Art Museum
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Silver Snail Comics,43.657031,-79.381403,Comic Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Vegetarian Haven,43.656016,-79.392758,Vegetarian / Vegan Restaurant


## Visualise our cluster data

In [54]:
map_clusters = folium.Map(location=[tor_lat, tor_lon], zoom_start=12)

# Colour used for each cluster label
markers_colors = {
    0: 'red',
    1: 'blue',
    2: 'green',
    3: 'yellow',
    4: 'cyan',
    5: 'black',
}

# merged_df holds one row per venue, so every neighborhood position is
# repeated many times. Drop the repeats so each distinct position/label
# combination is drawn once instead of as a stack of identical markers.
neighborhood_points = merged_df[
    ['Neighborhood Latitude', 'Neighborhood Longitude', 'Cluster Labels']
].drop_duplicates()

# Add the markers to the map
for lat, lon, cluster in zip(neighborhood_points['Neighborhood Latitude'],
                             neighborhood_points['Neighborhood Longitude'],
                             neighborhood_points['Cluster Labels']):
    folium.features.CircleMarker(
        [lat, lon],
        radius=5,
        color=markers_colors[cluster],
        fill_color=markers_colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [55]:
# Cluster 1 (k-means label 0): its venue rows categorised as Coffee Shop
in_cluster = merged_df['Cluster Labels'] == 0
is_coffee_shop = merged_df['Venue Category'] == 'Coffee Shop'
merged_df[in_cluster & is_coffee_shop]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
23,"Queen's Park, Ontario Provincial Government",0.228571,0,43.662301,-79.389494,Starbucks,43.659456,-79.390411,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Starbucks,43.659456,-79.390411,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,The Library Specialty Coffee,43.654413,-79.390902,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Coffee Public,43.660763,-79.386184,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Starbucks,43.659509,-79.382132,Coffee Shop
23,"Queen's Park, Ontario Provincial Government",0.228571,0,43.662301,-79.389494,Starbucks,43.661527,-79.383411,Coffee Shop
23,"Queen's Park, Ontario Provincial Government",0.228571,0,43.662301,-79.389494,Starbucks,43.658204,-79.388998,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Jimmy's Coffee,43.658421,-79.385613,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Tim Hortons,43.65857,-79.385123,Coffee Shop
4,Central Bay Street,0.180328,0,43.657952,-79.387383,Neo Coffee Bar,43.66014,-79.38587,Coffee Shop


In [56]:
# Cluster 2 (k-means label 1): its venue rows categorised as Coffee Shop
in_cluster = merged_df['Cluster Labels'] == 1
is_coffee_shop = merged_df['Venue Category'] == 'Coffee Shop'
merged_df[in_cluster & is_coffee_shop]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
38,"University of Toronto, Harbord",0.027778,1,43.662696,-79.400049,Elchi Chai Shop,43.662695,-79.404652,Coffee Shop


In [57]:
# Cluster 3 (k-means label 2): its venue rows categorised as Coffee Shop
in_cluster = merged_df['Cluster Labels'] == 2
is_coffee_shop = merged_df['Venue Category'] == 'Coffee Shop'
merged_df[in_cluster & is_coffee_shop]

Unnamed: 0,Neighborhood,Coffee Shop,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
28,"Runnymede, Swansea",0.088235,2,43.651571,-79.484450,Tim Hortons,43.648526,-79.485066,Coffee Shop
29,St. James Town,0.071429,2,43.651494,-79.375418,Fahrenheit Coffee,43.652384,-79.372719,Coffee Shop
28,"Runnymede, Swansea",0.088235,2,43.651571,-79.484450,The Coffee Bouquets,43.648785,-79.485940,Coffee Shop
28,"Runnymede, Swansea",0.088235,2,43.651571,-79.484450,Wibke's Espresso Bar,43.649132,-79.484802,Coffee Shop
29,St. James Town,0.071429,2,43.651494,-79.375418,Versus Coffee,43.651213,-79.375236,Coffee Shop
...,...,...,...,...,...,...,...,...,...
13,"Garden District, Ryerson",0.090000,2,43.657162,-79.378937,Good Earth Coffeehouse,43.656850,-79.374719,Coffee Shop
13,"Garden District, Ryerson",0.090000,2,43.657162,-79.378937,Starbucks,43.654465,-79.378919,Coffee Shop
13,"Garden District, Ryerson",0.090000,2,43.657162,-79.378937,Starbucks,43.655969,-79.382684,Coffee Shop
13,"Garden District, Ryerson",0.090000,2,43.657162,-79.378937,Balzac's Coffee,43.657854,-79.379200,Coffee Shop
