**Retrieving data**

In [198]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import geocoder
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')
w_table = soup.find('table',{'class':'wikitable sortable'})
list_contents = [rl.getText().strip().split('\n') for rl in w_table.findAll('tr')]
data = pd.DataFrame(list_contents)
columns = data.iloc[0]
data = data[1:]
data.columns = columns
data_filtered = data.loc[(data['Borough'] != 'Not assigned')]
toronto_data = data_filtered.loc[data['Borough'].str.contains('.*Toronto', regex=True)]
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
18,M5B,Downtown Toronto,Ryerson
19,M5B,Downtown Toronto,Garden District
35,M5C,Downtown Toronto,St. James Town


In [199]:
def findCoordi(row):
    postcode, name = row['Postcode'], row['Neighbourhood']
    print('----'+name+'----')
    lat_lng = None
    address = '{}, Toronto, {}'.format(postcode, name)
    g = geocoder.arcgis(address)
    lat_lng = g.latlng
    return lat_lng

**Finding latitude nad longitude for each neighbourhood.**

In [200]:
toronto_data_lat_lng = toronto_data.loc[:,['Neighbourhood']]
toronto_data_lat_lng.loc[:,'coordi'] = toronto_data.loc[:,['Postcode','Neighbourhood']].apply(findCoordi, axis=1)
toronto_data_lat_lng.loc[:,'latitude'] = toronto_data_lat_lng.loc[:,'coordi'].str.get(0)
toronto_data_lat_lng.loc[:,'longitude'] = toronto_data_lat_lng.loc[:,'coordi'].str.get(1)
toronto_data_lat_lng.reset_index(inplace=True)
toronto_data_lat_lng.drop('coordi', axis=1, inplace=True)
toronto_data_lat_lng.drop('index', axis=1, inplace=True)
toronto_data_lat_lng.head()

----Harbourfront----
----Harbourfront----
----Regent Park----
----Ryerson----
----Garden District----
----St. James Town----
----The Beaches----
----Berczy Park----
----Central Bay Street----
----Christie----
----Adelaide----
----King----
----Richmond----
----Dovercourt Village----
----Dufferin----
----Harbourfront East----
----Toronto Islands----
----Union Station----
----Little Portugal----
----Trinity----
----The Danforth West----
----Riverdale----
----Design Exchange----
----Toronto Dominion Centre----
----Brockton----
----Exhibition Place----
----Parkdale Village----
----The Beaches West----
----India Bazaar----
----Commerce Court----
----Victoria Hotel----
----Studio District----
----Lawrence Park----
----Roselawn----
----Davisville North----
----Forest Hill North----
----Forest Hill West----
----High Park----
----The Junction South----
----North Toronto West----
----The Annex----
----North Midtown----
----Yorkville----
----Parkdale----
----Roncesvalles----
----Davisville----
---

Unnamed: 0,Neighbourhood,latitude,longitude
0,Harbourfront,43.63951,-79.38316
1,Regent Park,43.659741,-79.361564
2,Ryerson,43.648829,-79.402486
3,Garden District,43.65794,-79.37562
4,St. James Town,43.67081,-79.37348


**Try plotting neighbourhood in Toronto**

In [201]:
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [202]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [203]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, neighborhood in zip(toronto_data_lat_lng['latitude'], toronto_data_lat_lng['longitude'], toronto_data_lat_lng['Neighbourhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [204]:
# @hidden_cell
CLIENT_ID = 'WD4ZCT2AXCER1SL3WPCVPJOC45434AGAPRLWMROEVRXDKDQQ' # your Foursquare ID
CLIENT_SECRET = '4NLO13KRPOWNET5DYDE5K2NFMC3UQCF52RVUQ2YX22XS5DGE' # your Foursquare Secret
VERSION = '20190910' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WD4ZCT2AXCER1SL3WPCVPJOC45434AGAPRLWMROEVRXDKDQQ
CLIENT_SECRET:4NLO13KRPOWNET5DYDE5K2NFMC3UQCF52RVUQ2YX22XS5DGE


In [205]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        try:
            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except:
            print("--- Failed to extract items ---")
            continue
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

**Explore venues around each neighbourhood**

In [206]:
LIMIT='100'
toronto_venues = getNearbyVenues(names=toronto_data_lat_lng['Neighbourhood']
                                 ,latitudes=toronto_data_lat_lng['latitude']
                                ,longitudes=toronto_data_lat_lng['longitude'])
toronto_venues.head()

Harbourfront
Regent Park
Ryerson
Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
The Danforth West
Riverdale
Design Exchange
Toronto Dominion Centre
Brockton
Exhibition Place
Parkdale Village
The Beaches West
India Bazaar
Commerce Court
Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
High Park
The Junction South
North Toronto West
The Annex
North Midtown
Yorkville
Parkdale
Roncesvalles
Davisville
Harbord
University of Toronto
Runnymede
Swansea
Moore Park
Summerhill East
Chinatown
Grange Park
Kensington Market
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.63951,-79.38316,Harbourfront Centre,43.638556,-79.38319,Performing Arts Venue
1,Harbourfront,43.63951,-79.38316,Harbourfront,43.639526,-79.380688,Neighborhood
2,Harbourfront,43.63951,-79.38316,Natrel Pond/Rink,43.638431,-79.382528,Skating Rink
3,Harbourfront,43.63951,-79.38316,Lick It Gelato,43.639256,-79.38465,Ice Cream Shop
4,Harbourfront,43.63951,-79.38316,PawsWay,43.638599,-79.384992,Event Space


**Create dummy variables from Venue Category**

In [207]:
toronto_venue_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_venue_onehot.drop('Neighborhood', axis=1, inplace=True)
tmp_columns = toronto_venue_onehot.columns
toronto_venue_onehot['Neighborhood'] = toronto_venues['Neighborhood']
toronto_venue_onehot.set_index('Neighborhood', inplace=True)
toronto_venue_onehot.reset_index(level=0, inplace=True)
toronto_venue_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Animal Shelter,...,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [208]:
toronto_venue_agg = toronto_venue_onehot.groupby('Neighborhood').mean().reset_index()
toronto_venue_agg.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Animal Shelter,...,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
1,Bathurst Quay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014925
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.088889,0.0,0.022222,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Print each neighborhood along with the top 5 most common venues**

In [209]:
num_top_venues = 5

for hood in toronto_venue_agg['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_venue_agg[toronto_venue_agg['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
             venue  freq
0      Coffee Shop  0.08
1             Café  0.07
2            Hotel  0.05
3       Steakhouse  0.04
4  Thai Restaurant  0.03


----Bathurst Quay----
                venue  freq
0         Coffee Shop  0.09
1  Italian Restaurant  0.07
2                Café  0.04
3                 Bar  0.04
4          Restaurant  0.04


----Berczy Park----
                venue  freq
0         Coffee Shop  0.10
1                Café  0.06
2          Restaurant  0.05
3              Bakery  0.04
4  Italian Restaurant  0.04


----Brockton----
                   venue  freq
0            Coffee Shop  0.13
1                    Bar  0.09
2  Vietnamese Restaurant  0.09
3                   Café  0.07
4          Grocery Store  0.07


----Business Reply Mail Processing Centre 969 Eastern----
         venue  freq
0  Coffee Shop  0.09
1         Café  0.05
2          Bar  0.04
3   Steakhouse  0.04
4        Hotel  0.04


----CN Tower----
                venue  freq
0         Cof

                venue  freq
0                Park  0.18
1   French Restaurant  0.09
2  Mexican Restaurant  0.09
3                Café  0.09
4  Italian Restaurant  0.09


----Regent Park----
             venue  freq
0      Coffee Shop  0.14
1  Thai Restaurant  0.10
2             Pool  0.05
3        Pet Store  0.05
4       Food Truck  0.05


----Richmond----
            venue  freq
0     Coffee Shop  0.08
1            Café  0.06
2           Hotel  0.04
3      Restaurant  0.04
4  Breakfast Spot  0.03


----Riverdale----
                           venue  freq
0          Vietnamese Restaurant  0.23
1                           Café  0.23
2  Vegetarian / Vegan Restaurant  0.08
3           Caribbean Restaurant  0.08
4                  Grocery Store  0.08


----Roncesvalles----
                         venue  freq
0                  Coffee Shop  0.11
1                       Bakery  0.07
2  Eastern European Restaurant  0.07
3              Thai Restaurant  0.07
4             Sushi Restaurant  0.0

**Function for sorting venues**

In [210]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

**Create the new dataframe and display the top 10 venues for each neighborhood.**

In [211]:
import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_venue_agg['Neighborhood']

for ind in np.arange(toronto_venue_agg.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_venue_agg.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Hotel,Steakhouse,Thai Restaurant,American Restaurant,Restaurant,Bar,Sushi Restaurant,Asian Restaurant
1,Bathurst Quay,Coffee Shop,Italian Restaurant,Café,Bar,Restaurant,Speakeasy,Bakery,Gym / Fitness Center,Park,Sandwich Place
2,Berczy Park,Coffee Shop,Café,Restaurant,Bakery,Italian Restaurant,Cocktail Bar,Gym,Beer Bar,Seafood Restaurant,Gastropub
3,Brockton,Coffee Shop,Vietnamese Restaurant,Bar,Grocery Store,Café,Restaurant,Bakery,Pizza Place,Boutique,Sandwich Place
4,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Café,Hotel,Bar,Steakhouse,Sushi Restaurant,Japanese Restaurant,Italian Restaurant,Restaurant,Pizza Place


**Building classification model**

In [212]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_venue_clustering = toronto_venue_agg.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_venue_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

**Create dataframe that includes the cluster as well as the top 10 venues for each neighborhood.**

In [213]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data_lat_lng

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Neighbourhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Harbourfront,43.63951,-79.38316,1,Coffee Shop,Aquarium,Pizza Place,Café,Italian Restaurant,Sporting Goods Shop,Hotel,Park,Brewery,Scenic Lookout
1,Regent Park,43.659741,-79.361564,1,Coffee Shop,Thai Restaurant,Pool,Pub,Sushi Restaurant,Beer Store,Auto Dealership,Restaurant,Park,Indian Restaurant
2,Ryerson,43.648829,-79.402486,1,Coffee Shop,Bar,Furniture / Home Store,Boutique,Dessert Shop,Café,Pizza Place,Restaurant,Record Shop,Nightclub
3,Garden District,43.65794,-79.37562,1,Coffee Shop,Hotel,Café,Sandwich Place,Middle Eastern Restaurant,Pizza Place,Japanese Restaurant,Lounge,Diner,Ramen Restaurant
4,St. James Town,43.67081,-79.37348,1,Coffee Shop,Pizza Place,Grocery Store,Caribbean Restaurant,Market,Japanese Restaurant,Café,Filipino Restaurant,Breakfast Spot,Food & Drink Shop


**Visualize the resulting clusters**

In [214]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters