### import the modouls

In [1]:
from bs4 import BeautifulSoup
import requests
import urllib.request
import time
import pandas as pd

### get the connection to the url and scrape the web page

get the table text after inspecting the site

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url)

soup = BeautifulSoup(response.text, 'html.parser')
# after inspecting the web site, need to find the table class
My_table = soup.find('table',{'class':'wikitable sortable'})

### get the table headers

In [3]:
# Get table headers
t_headers = []
for th in My_table.find_all("th"):
    # remove any newlines and extra spaces from left and right
    t_headers.append(th.text.replace('\n', ' ').strip())
print(t_headers)

['Postcode', 'Borough', 'Neighbourhood']


### get the table data and zip each row to the headers

In [4]:
# get the data of each row with its header
table_data = []
for tr in My_table.tbody.find_all("tr"): # find all tr's from table's tbody
    t_row = {}
    # Each table row is stored in the form of
    # t_row = {'Postcode': '', 'Borough': '', 'Neighbourhood': ''}

    # find all td's(3) in tr and zip it with t_header
    for td, th in zip(tr.find_all("td"), t_headers): 
        t_row[th] = td.text.replace('\n', '').strip()
    table_data.append(t_row)

### convert to pandas DF and clean the data
also fixed inconsistency in the column neighborhood that has spelling mistake in the wikipedia web page

In [5]:
# convert the table to pandas dataframe 
df_data_table = pd.DataFrame(table_data[1:], columns=t_headers)

# drop all rows where Borough column is 'Not assigned'
df_data_table = df_data_table.loc[df_data_table.Borough != 'Not assigned']

# fix the 'Not assinged' issue on the Neighbourhood column
df_data_table.loc[df_data_table.Neighbourhood=='Not assigned', 'Neighbourhood'] = \
df_data_table.loc[df_data_table.Neighbourhood=='Not assigned', 'Borough']

# sort values based on the postcode column
df_data_table = df_data_table.sort_values(by='Postcode')
# rename column with spelling mistake in the wikipedia web page
df_data_table = df_data_table.rename(columns={'Neighbourhood': 'Neighborhood'})

# group the borough and concatenate the neighborhood names with comma
df_data_table = df_data_table.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(','.join)
df_data_table = pd.DataFrame(df_data_table)

# reset the dataframe index
df_data_table.reset_index(inplace=True)

df_data_table.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Golden Mile,Oakridge,Clairlea"
8,M1M,Scarborough,"Cliffcrest,Scarborough Village West,Cliffside"
9,M1N,Scarborough,"Cliffside West,Birch Cliff"


In [6]:
df_data_table.shape

(103, 3)

In [7]:
import geocoder

### geocoding the postal codes
the google geocoding didn't work so... I tried OSM and it gave me partial results, here is my code:

In [8]:
# Pcodes_latlng = []
#Pcodes = []
#for postal in df_data_table.itertuples():
#    # get the geocoding using OSM for each postal code
#    g = geocoder.osm(f'{postal.Postcode}, Toronto, Ontario')
#    # get the coords
#    lat_lng_coords = g.latlng
#    # append the coords to one list
#    Pcodes_latlng.append(lat_lng_coords)
#    # appen the postal codes
#    Pcodes.append(postal.Postcode)
    
## get them together
## list_pcodes_coords = list(zip(Pcodes, Pcodes_latlng))
##check
#Pcodes_latlng[:10]

### mooving on to the csv file

In [9]:
path = r'C:\Study\IBM_DataScience\final_course_Capstone\Geospatial_Coordinates.csv'
df_pcodes = pd.read_csv(path , sep=',')
df_pcodes = df_pcodes.rename(columns={'Postal Code': 'Postcode'})
df_pcodes.head(10)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [10]:
df_data_table = df_data_table.merge(df_pcodes[['Latitude', 'Longitude', 'Postcode']], on="Postcode")
df_data_table.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile,Oakridge,Clairlea",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Scarborough Village West,Cliffside",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West,Birch Cliff",43.692657,-79.264848


In [11]:
toronto_grouped = df_data_table.groupby('Borough').mean().reset_index()
toronto_grouped

Unnamed: 0,Borough,Latitude,Longitude
0,Central Toronto,43.70198,-79.398954
1,Downtown Toronto,43.654597,-79.383972
2,East Toronto,43.669436,-79.324654
3,East York,43.700303,-79.335851
4,Etobicoke,43.659333,-79.542967
5,Mississauga,43.636966,-79.615819
6,North York,43.750727,-79.429338
7,Queen's Park,43.667856,-79.532242
8,Scarborough,43.766229,-79.249085
9,West Toronto,43.652653,-79.44929


In [12]:
import folium
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
from sklearn.cluster import KMeans

In [15]:
# create a map of toronto
geolocator = Nominatim(user_agent="coursera")
address = 'Toronto'
try:
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))
except AttributeError:
    print('Cannot find: {}, will drop index: {}'.format(address, index))

toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_data_table['Latitude'], df_data_table['Longitude'], df_data_table['Postcode']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)  
    
toronto_map

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [16]:
CLIENT_ID = '-----' # Foursquare ID
CLIENT_SECRET = '-----' # Foursquare Secret
VERSION = '20200105' # API version

In [17]:
df_data_table.set_index('Postcode', inplace = True) 
neighborhood_latitude = df_data_table.loc['M6G']['Latitude']
neighborhood_longitude = df_data_table.loc['M6G']['Longitude']

In [18]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=-----&client_secret=-----&v=20200105&ll=43.669542,-79.4225637&radius=500&limit=100'

In [19]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e25725a006dce001bbefdb2'},
 'response': {'headerLocation': 'Christie Pits',
  'headerFullLocation': 'Christie Pits, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 17,
  'suggestedBounds': {'ne': {'lat': 43.6740420045, 'lng': -79.41635411972038},
   'sw': {'lat': 43.6650419955, 'lng': -79.42877328027961}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4adcfd7cf964a5203e6321e3',
       'name': 'Fiesta Farms',
       'location': {'address': '200 Christie St',
        'crossStreet': 'at Essex St',
        'lat': 43.66847077052224,
        'lng': -79.42048512748114,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.66847077052224,
          'lng': -79.42048512748114}],
        'distance': 20

In [20]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [21]:
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON
nearby_venues

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,...,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4adcfd7cf964a5203e6321e3-0,"[{'id': '4bf58dd8d48988d118951735', 'name': 'G...",4adcfd7cf964a5203e6321e3,200 Christie St,CA,Toronto,Canada,at Essex St,...,"[200 Christie St (at Essex St), Toronto ON M6G...","[{'label': 'display', 'lat': 43.66847077052224...",43.668471,-79.420485,M6G 3B6,ON,Fiesta Farms,0,[],56848730.0
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-540c8301498e51456f71ae7e-1,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",540c8301498e51456f71ae7e,1028 Shaw St,CA,Toronto,Canada,Yarmouth,...,"[1028 Shaw St (Yarmouth), Toronto ON, Canada]","[{'label': 'display', 'lat': 43.66910714407297...",43.669107,-79.426105,,ON,Contra Cafe,0,[],
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4dc9d2add4c07b350108956a-2,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",4dc9d2add4c07b350108956a,657 Dupont St,CA,Toronto,Canada,at Christie St.,...,"[657 Dupont St (at Christie St.), Toronto ON M...","[{'label': 'display', 'lat': 43.67153, 'lng': ...",43.67153,-79.4214,M6G 1Z4,ON,Starbucks,0,[],
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4b003cf6f964a520d73b22e3-3,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",4b003cf6f964a520d73b22e3,787 Dupont Street,CA,Toronto,Canada,,...,"[787 Dupont Street, Toronto ON M6G 1Z5, Canada]","[{'label': 'display', 'lat': 43.67067884412717...",43.670679,-79.426148,M6G 1Z5,ON,Vinny’s Panini,0,[],
4,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4adb4d20f964a520b12521e3-4,"[{'id': '4bf58dd8d48988d147941735', 'name': 'D...",4adb4d20f964a520b12521e3,1071 Shaw St.,CA,Toronto,Canada,Dupont,...,"[1071 Shaw St. (Dupont), Toronto ON, Canada]","[{'label': 'display', 'lat': 43.67054982071192...",43.67055,-79.426541,,ON,Universal Grill,0,[],
5,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-536e42af498e8a0880b220d7-5,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",536e42af498e8a0880b220d7,146 Christie Street,CA,Toronto,Canada,Bloor,...,"[146 Christie Street (Bloor), Toronto ON M6G 3...","[{'label': 'display', 'lat': 43.66735987720157...",43.66736,-79.419938,M6G 3B3,ON,Scout and Cash Caffe,0,[],85621950.0
6,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4f737b84e4b060af1b23c5b7-6,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",4f737b84e4b060af1b23c5b7,971 Ossington Ave,CA,Toronto,Canada,,...,"[971 Ossington Ave, Toronto ON M6G 3V5, Canada]","[{'label': 'display', 'lat': 43.66785822046965...",43.667858,-79.428054,M6G 3V5,ON,Actinolite,0,[],
7,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4ade0037f964a520396721e3-7,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",4ade0037f964a520396721e3,672 Dupont Street,CA,Toronto,Canada,Christie,...,"[672 Dupont Street (Christie), Toronto ON M6G ...","[{'label': 'display', 'lat': 43.6710456734398,...",43.671046,-79.419297,M6G 1Z6,ON,Faema Caffe,0,[],
8,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4b76e890f964a52065692ee3-8,"[{'id': '4bf58dd8d48988d117951735', 'name': 'C...",4b76e890f964a52065692ee3,653 Dupont Street,CA,Toronto,Canada,"""at Christie St.""",...,"[653 Dupont Street (""at Christie St.""), Toront...","[{'label': 'display', 'lat': 43.67156577026589...",43.671566,-79.421289,M6G 1Z4,ON,Stubbe Chocolates,0,[],
9,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-4aee0faef964a520b1d121e3-9,"[{'id': '4bf58dd8d48988d118951735', 'name': 'G...",4aee0faef964a520b1d121e3,650 Dupont St,CA,Toronto,Canada,at Christie St.,...,"[650 Dupont St (at Christie St.), Toronto ON M...","[{'label': 'display', 'lat': 43.671657, 'lng':...",43.671657,-79.421364,M6G 4B1,ON,Loblaws,0,[],


In [22]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Fiesta Farms,Grocery Store,43.668471,-79.420485
1,Contra Cafe,Café,43.669107,-79.426105
2,Starbucks,Coffee Shop,43.67153,-79.4214
3,Vinny’s Panini,Italian Restaurant,43.670679,-79.426148
4,Universal Grill,Diner,43.67055,-79.426541


### checking how many venues returned

In [23]:
print(f'{nearby_venues.shape[0]} venues were returned by Foursquare.')

17 venues were returned by Foursquare.


### this function will get the venues in 500 m' radius

In [24]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### appliing the function above

In [25]:
venues = getNearbyVenues(names=df_data_table['Borough'],latitudes=df_data_table['Latitude'],longitudes=df_data_table['Longitude'])

Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
Scarborough
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
North York
East York
East York
East Toronto
East York
East York
East York
East Toronto
East Toronto
East Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
North York
Central Toronto
Central Toronto
Central Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
North York
North York
York
York
Downtown Toronto
Wes

### check the output

In [26]:
print(venues.shape)
venues.head()

(2222, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Scarborough,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Scarborough,43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
2,Scarborough,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,Scarborough,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,Scarborough,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [27]:
venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,105,105,105,105,105,105
Downtown Toronto,1312,1312,1312,1312,1312,1312
East Toronto,126,126,126,126,126,126
East York,79,79,79,79,79,79
Etobicoke,71,71,71,71,71,71
Mississauga,11,11,11,11,11,11
North York,247,247,247,247,247,247
Scarborough,89,89,89,89,89,89
West Toronto,162,162,162,162,162,162
York,20,20,20,20,20,20


### analysing the Neighborhoods

In [28]:
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['Neighborhood'] = venues['Neighborhood'] 

onehot.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
grouped = onehot.groupby('Neighborhood').mean().reset_index()
grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019048,...,0.009524,0.0,0.0,0.009524,0.0,0.0,0.0,0.0,0.0,0.009524
1,Downtown Toronto,0.0,0.000762,0.000762,0.000762,0.000762,0.001524,0.002287,0.001524,0.009909,...,0.011433,0.001524,0.0,0.005335,0.0,0.00686,0.000762,0.0,0.001524,0.003049
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,...,0.0,0.0,0.0,0.0,0.0,0.007937,0.0,0.0,0.0,0.02381
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.012658,0.0,0.012658,0.0,0.0,0.0,0.0,0.012658
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014085,0.0,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.004049,0.0,0.004049,0.0,0.0,0.0,0.0,0.0,0.008097,...,0.0,0.004049,0.004049,0.008097,0.0,0.0,0.0,0.004049,0.008097,0.0
7,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011236,...,0.0,0.0,0.0,0.011236,0.0,0.0,0.0,0.0,0.0,0.0
8,West Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.012346,0.0,0.0,0.012346,0.0,0.006173,0.0,0.0,0.0,0.006173
9,York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0


In [30]:
num_top_venues = 5

for hood in grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = grouped[grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.08
1  Sandwich Place  0.07
2            Café  0.06
3            Park  0.06
4     Pizza Place  0.05


----Downtown Toronto----
                 venue  freq
0          Coffee Shop  0.10
1                 Café  0.05
2           Restaurant  0.03
3                Hotel  0.03
4  Japanese Restaurant  0.02


----East Toronto----
                venue  freq
0    Greek Restaurant  0.07
1         Coffee Shop  0.06
2  Italian Restaurant  0.05
3             Brewery  0.04
4      Ice Cream Shop  0.04


----East York----
                 venue  freq
0          Coffee Shop  0.08
1         Burger Joint  0.05
2                 Park  0.05
3                 Bank  0.04
4  Sporting Goods Shop  0.04


----Etobicoke----
            venue  freq
0     Pizza Place  0.10
1  Sandwich Place  0.07
2     Coffee Shop  0.06
3        Pharmacy  0.06
4             Gym  0.04


----Mississauga----
                       venue  freq
0                Coffee S

In [31]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### now create a table that ranks the top 10 venues in each Neighborhood (Borough)

In [32]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = grouped['Neighborhood']

for ind in np.arange(grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Sushi Restaurant,Restaurant,Pub,Gym
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Bar,Park,Seafood Restaurant
2,East Toronto,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Brewery,Café,Yoga Studio,Park,Sandwich Place,Pub
3,East York,Coffee Shop,Park,Burger Joint,Pizza Place,Pharmacy,Bank,Sporting Goods Shop,Gym / Fitness Center,Fast Food Restaurant,Pet Store
4,Etobicoke,Pizza Place,Sandwich Place,Coffee Shop,Pharmacy,Grocery Store,Fast Food Restaurant,Liquor Store,Café,Gym,Discount Store
5,Mississauga,Hotel,Coffee Shop,Middle Eastern Restaurant,Sandwich Place,Burrito Place,Fried Chicken Joint,Mediterranean Restaurant,American Restaurant,Gym,Drugstore
6,North York,Coffee Shop,Clothing Store,Fast Food Restaurant,Pizza Place,Japanese Restaurant,Sandwich Place,Park,Grocery Store,Restaurant,Sushi Restaurant
7,Scarborough,Fast Food Restaurant,Breakfast Spot,Bakery,Pizza Place,Chinese Restaurant,Coffee Shop,Pharmacy,Skating Rink,Playground,Bank
8,West Toronto,Bar,Café,Coffee Shop,Italian Restaurant,Restaurant,Bakery,Pizza Place,Park,Breakfast Spot,Grocery Store
9,York,Park,Coffee Shop,Convenience Store,Field,Dog Run,Hockey Arena,Fast Food Restaurant,Caribbean Restaurant,Sandwich Place,Trail


In [34]:
# set number of clusters
kclusters = 5

grouped_clustering = grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 2, 4, 3, 2, 2, 1, 0])

### the column name 'Neighborhood'  in the venues equales to the column name 'Borough' in the original dataframe. so it needs to be a left join that consider that mixup

In [35]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

merged = df_data_table

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
merged = merged.merge(neighborhoods_venues_sorted.set_index('Neighborhood'), left_on='Borough',right_on='Neighborhood')

merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,"Rouge,Malvern",43.806686,-79.194353,2,Fast Food Restaurant,Breakfast Spot,Bakery,Pizza Place,Chinese Restaurant,Coffee Shop,Pharmacy,Skating Rink,Playground,Bank
1,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.784535,-79.160497,2,Fast Food Restaurant,Breakfast Spot,Bakery,Pizza Place,Chinese Restaurant,Coffee Shop,Pharmacy,Skating Rink,Playground,Bank
2,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,2,Fast Food Restaurant,Breakfast Spot,Bakery,Pizza Place,Chinese Restaurant,Coffee Shop,Pharmacy,Skating Rink,Playground,Bank
3,Scarborough,Woburn,43.770992,-79.216917,2,Fast Food Restaurant,Breakfast Spot,Bakery,Pizza Place,Chinese Restaurant,Coffee Shop,Pharmacy,Skating Rink,Playground,Bank
4,Scarborough,Cedarbrae,43.773136,-79.239476,2,Fast Food Restaurant,Breakfast Spot,Bakery,Pizza Place,Chinese Restaurant,Coffee Shop,Pharmacy,Skating Rink,Playground,Bank


### finaly - create a map that shows the clusters...

In [36]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Neighborhood'], merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters