In [2]:
import itertools as it
import os
import json
import requests
import bs4
import geocoder
import folium
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
pd.set_option('display.max_columns', None)

# Neighborhood dataframe creation

I decided to cluster neighborhoods instead of boroughs or postal codes, so I went ahead and split postal codes with multiple neighborhoods into multiple rows.

In [3]:
soup = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").content
soup = BeautifulSoup(soup)

In [4]:
data = pd.DataFrame(columns=['Postal Code', 'Borough', 'Neighborhood'])
table = soup.find('table')
for cell in table.find_all('td'):
    row = list(cell.stripped_strings)
    if "Not assigned" in row:
        continue
    neighborhoods = " ".join(row[2:]).strip("() ")
    neighborhoods = " ".join([t.strip() for t in neighborhoods.split(")")])
    neighborhoods = neighborhoods.replace("(", "/")
    neighborhoods = ", ".join([t.strip() for t in neighborhoods.split(",")])
    neighborhoods = ", ".join([t.strip() for t in neighborhoods.split("/")])
    for nbh in neighborhoods.split(','):
        data = data.append(dict(zip(data.columns, row[:2] + [nbh.strip()])), ignore_index=True)
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Manor


# Adding Latitude/Longitude

Latitude, longitude, and radius will be added to the data frame created above. The radius used for FourSquare exploration will be that of a circle inscribed in the bounding box of the postal code area.

In [5]:
def save_locations(data):
    latitude = []
    longitude = []
    radius = []
    for nbh, pc in zip(data['Neighborhood'], data['Postal Code']):
        print("Geocoding", nbh)
        for i in range(10):
            response = None
            response = geocoder.arcgis("{} Toronto Canada {}".format(nbh, pc))
            if response is not None:
                break
        else:
            if response is None:
                print("Too many lookup attempts. The geocoding server may be down...")
        latitude.append(response.lat)
        longitude.append(response.lng)
        NE = response.bbox['northeast']
        SW = response.bbox['southwest']
        NW = [NE[0], SW[1]]
        radius.append(min(geocoder.distance([NE, NW], units='meters'), geocoder.distance([NW, SW], units='meters')) / 2)

    latitude = pd.Series(latitude, name='Latitude')
    longitude = pd.Series(longitude, name='Longitude')
    radius = pd.Series(radius, name="Radius")
    data[["Latitude"]] = latitude
    data[["Longitude"]] = longitude
    data[["Radius"]] = radius
    data.to_csv("toronto_neighborhoods.csv")
    return data

Load from previously saved data frame if we can, otherwise run the function above and save the updated data frame.

In [6]:
if os.access("toronto_neighborhoods.csv", os.R_OK):
    data = pd.read_csv("toronto_neighborhoods.csv", index_col=0)
else:
    data = save_locations(data)

In [7]:
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Radius
0,M3A,North York,Parkwoods,43.75245,-79.32991,2568.862665
1,M4A,North York,Victoria Village,43.73057,-79.31306,1445.852272
2,M5A,Downtown Toronto,Regent Park,43.659743,-79.361561,80.442896
3,M5A,Downtown Toronto,Harbourfront,43.63923,-79.38307,402.325087
4,M6A,North York,Lawrence Manor,43.714925,-79.449739,80.368924


# Neighborhood Exploration

Load API key and secret and create URL template for calls to the FourSquare exploration endpoint.

In [8]:
if os.access('foursquare_api_keys.txt', os.R_OK):
    with open("foursquare_api_keys.txt") as f:
        foursquare_id = f.readline().strip()
        foursquare_secret = f.readline().strip()
else:
    foursquare_id = ""
    foursquare_secret = ""
foursquare_template = "https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v=20210526".format("{}", 
                                                                                                                foursquare_id, 
                                                                                                                foursquare_secret)
foursquare_explore = foursquare_template.format("explore") + "&limit={}".format(100) + "&ll={},{}&radius={}"

Get top 100 venues for each neighborhood using the Foursquare API and parse response to build a data frame.

In [9]:
def get_venues(data):
    all_venues = pd.DataFrame()
    for i in range(data.shape[0]):
        lat, lng, rad = data[['Latitude', 'Longitude', 'Radius']].iloc[i]
        url = foursquare_explore.format(lat, lng, rad)
        response = requests.api.get(url)
        response.raise_for_status()
        venues = response.json()['response']['groups'][0]["items"]
        venues = pd.json_normalize(venues)
        venues['Neighborhood'] = data.loc[i, 'Neighborhood']
        try:
            venues['reasons.items'] = venues['reasons.items'].map(lambda x: [i['reasonName'] for i in x][0])
        except (TypeError, KeyError):
            pass
        try:
            venues['venue.categories.ids'] = venues['venue.categories'].map(lambda x: [i['id'] for i in x][0])
            venues['venue.categories.names'] = venues['venue.categories'].map(lambda x: [i['name'] for i in x][0])
        except KeyError:
            pass
        all_venues = all_venues.append(venues, ignore_index=True)
    all_venues.to_csv('toronto_neighborhood_venues.csv')
    return all_venues

In [103]:
if os.access('toronto_neighborhood_venues.csv', os.R_OK):
    venues = pd.read_csv('toronto_neighborhood_venues.csv', index_col=0)
else:
    venues = get_venues(data)

keep_cols = ['Neighborhood', 'venue.id', 'venue.name', 'venue.location.address', 'venue.location.lat', 'venue.location.lng',  
             'venue.location.distance', 'venue.categories.names', 'venue.categories.ids']
new_col_names = ['Neighborhood', 'ID', 'Name', 'Address', 'Latitude', 'Longitude', 'Distance', 'Category', 'Category ID']
venues = venues[keep_cols]
venues.columns = new_col_names
venues.head()

Unnamed: 0,Neighborhood,ID,Name,Address,Latitude,Longitude,Distance,Category,Category ID
0,Parkwoods,4b8991cbf964a520814232e3,Allwyn's Bakery,81 Underhill drive,43.75984,-79.324719,922.0,Caribbean Restaurant,4bf58dd8d48988d144941735
1,Parkwoods,57e286f2498e43d84d92d34a,Tim Hortons,215 Brookbanks,43.760668,-79.326368,958.0,Café,4bf58dd8d48988d16d941735
2,Parkwoods,4bd4846a6798ef3bd0c5618d,Donalda Golf & Country Club,12 Bushbury Dr,43.752816,-79.342741,1032.0,Golf Course,4bf58dd8d48988d1e6941735
3,Parkwoods,4e8d9dcdd5fbbbb6b3003c7b,Brookbanks Park,Toronto,43.751976,-79.33214,186.0,Park,4bf58dd8d48988d163941735
4,Parkwoods,4b8ec91af964a520053733e3,Graydon Hall Manor,185 Graydon Hall Drive,43.763923,-79.342961,1652.0,Event Space,4bf58dd8d48988d171941735


If a venue is listed in multiple neighborhoods, keep the one with the minimum distance. This ensures each venue is only associated with one neighborhood and offsets any neighborhood overlap when basing the search radius on neighborhood bounding boxes.

In [11]:
venue_id_groups = venues.groupby('ID')
min_dist_idx = venue_id_groups['Distance'].idxmin()
venue_id_groups['Distance'].min()
venues = venues.iloc[min_dist_idx]
venues.set_index('ID', inplace=True)
venues.head()

Unnamed: 0_level_0,Neighborhood,Name,Address,Latitude,Longitude,Distance,Category,Category ID
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4a8d5b48f964a520840f20e3,Kensington Market,The Moonbean Cafe,30 St Andrew St,43.654147,-79.400182,117.0,Café,4bf58dd8d48988d16d941735
4a9d0a53f964a520d23720e3,Business reply mail Processing Centre 969 Eastern,Hilton,145 Richmond St W,43.649946,-79.385479,139.0,Hotel,4bf58dd8d48988d1fa931735
4aada317f964a5201a6120e3,Yorkville,Isabel Bader Theatre,93 Charles St W,43.667329,-79.392604,382.0,College Theater,4bf58dd8d48988d1ac941735
4ab17387f964a520866920e3,St. James Town,Churchmouse & Firkin,485 Church St.,43.664632,-79.380406,729.0,Pub,4bf58dd8d48988d11b941735
4ab81980f964a5202f7c20e3,The Beaches West,The Salty Dog,1982 Queen St E,43.669944,-79.300894,193.0,Restaurant,4bf58dd8d48988d1c4941735


# Convert venue category into indicator variables

In [12]:
categories = venues['Category']
categories = pd.get_dummies(categories, columns=['Category'], prefix='', prefix_sep='')
categories.drop(columns='Neighborhood', inplace=True)
categories = pd.concat([venues['Neighborhood'], categories], axis=1)
categories.head()

Unnamed: 0_level_0,Neighborhood,ATM,Accessories Store,African Restaurant,Airport,Airport Lounge,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,Auto Workshop,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Beach,Bed & Breakfast,Beer Bar,Beer Garden,Beer Store,Belgian Restaurant,Big Box Store,Bike Shop,Bistro,Boat or Ferry,Bookstore,Botanical Garden,Boutique,Bowling Alley,Boxing Gym,Brazilian Restaurant,Breakfast Spot,Brewery,Bridal Shop,Bridge,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,Cafeteria,Café,Camera Store,Campground,Candy Store,Cantonese Restaurant,Caribbean Restaurant,Castle,Cheese Shop,Chinese Restaurant,Chocolate Shop,Church,Circus,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Gym,College Rec Center,College Theater,Colombian Restaurant,Comedy Club,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Costume Shop,Creperie,Cuban Restaurant,Cupcake Shop,Curling Ice,Cycle Studio,Czech Restaurant,Dance Studio,Deli / Bodega,Dentist's Office,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Dry Cleaner,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space,Exhibit,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Truck,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Garden Center,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,Gift Shop,Gluten-free Restaurant,Go Kart Track,Golf Course,Golf Driving Range,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hakka Restaurant,Harbor / Marina,Hardware Store,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hobby Shop,Hockey Arena,Hockey Field,Home Service,Hong Kong Restaurant,Hookah Bar,Hospital,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Housing Development,Hungarian Restaurant,IT Services,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Movie Theater,Indie Theater,Indonesian Restaurant,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Kitchen Supply Store,Korean Restaurant,Lake,Laser Tag,Latin American Restaurant,Laundromat,Leather Goods Store,Library,Light Rail Station,Lighting Store,Liquor Store,Lounge,Mac & Cheese Joint,Malay Restaurant,Market,Martial Arts School,Massage Studio,Medical Center,Medical Supply Store,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Movie Theater,Museum,Music School,Music Store,Music Venue,Nail Salon,National Park,New American Restaurant,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Outdoor Supply Store,Pakistani Restaurant,Paper / Office Supplies Store,Park,Pastry Shop,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Store,Pharmacy,Pie Shop,Pizza Place,Playground,Plaza,Poke Place,Polish Restaurant,Pool,Pool Hall,Portuguese Restaurant,Poutine Place,Pub,Racecourse,Racetrack,Ramen Restaurant,Record Shop,Recreation Center,Rental Car Location,Rental Service,Restaurant,River,Road,Rock Club,Roof Deck,Sake Bar,Salad Place,Salon / Barbershop,Sandwich Place,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shipping Store,Shoe Repair,Shoe Store,Shopping Mall,Shopping Plaza,Skate Park,Skating Rink,Ski Chalet,Smoke Shop,Smoothie Shop,Snack Place,Soccer Field,Soccer Stadium,Soup Place,South American Restaurant,Souvlaki Shop,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Steakhouse,Storage Facility,Street Art,Supermarket,Supplement Shop,Sushi Restaurant,Syrian Restaurant,Szechuan Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tattoo Parlor,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Park,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Tree,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1,Unnamed: 273_level_1,Unnamed: 274_level_1,Unnamed: 275_level_1,Unnamed: 276_level_1,Unnamed: 277_level_1,Unnamed: 278_level_1,Unnamed: 279_level_1,Unnamed: 280_level_1,Unnamed: 281_level_1,Unnamed: 282_level_1,Unnamed: 283_level_1,Unnamed: 284_level_1,Unnamed: 285_level_1,Unnamed: 286_level_1,Unnamed: 287_level_1,Unnamed: 288_level_1,Unnamed: 289_level_1,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1,Unnamed: 299_level_1,Unnamed: 300_level_1,Unnamed: 301_level_1,Unnamed: 302_level_1,Unnamed: 303_level_1,Unnamed: 304_level_1,Unnamed: 305_level_1,Unnamed: 306_level_1,Unnamed: 307_level_1,Unnamed: 308_level_1,Unnamed: 309_level_1,Unnamed: 310_level_1,Unnamed: 311_level_1,Unnamed: 312_level_1,Unnamed: 313_level_1,Unnamed: 314_level_1,Unnamed: 315_level_1,Unnamed: 316_level_1,Unnamed: 317_level_1,Unnamed: 318_level_1,Unnamed: 319_level_1,Unnamed: 320_level_1,Unnamed: 321_level_1,Unnamed: 322_level_1,Unnamed: 323_level_1,Unnamed: 324_level_1,Unnamed: 325_level_1,Unnamed: 326_level_1,Unnamed: 327_level_1,Unnamed: 328_level_1,Unnamed: 329_level_1,Unnamed: 330_level_1,Unnamed: 331_level_1,Unnamed: 332_level_1,Unnamed: 333_level_1,Unnamed: 334_level_1,Unnamed: 335_level_1,Unnamed: 336_level_1,Unnamed: 337_level_1,Unnamed: 338_level_1,Unnamed: 339_level_1,Unnamed: 340_level_1
4a8d5b48f964a520840f20e3,Kensington Market,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4a9d0a53f964a520d23720e3,Business reply mail Processing Centre 969 Eastern,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4aada317f964a5201a6120e3,Yorkville,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4ab17387f964a520866920e3,St. James Town,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4ab81980f964a5202f7c20e3,The Beaches West,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Group venues by neighborhood and get category frequencies

In [73]:
category_counts = categories.groupby('Neighborhood').mean()

# Determine top 5 most popular types of places for each neighborhood

In [240]:
def get_top5(category_counts):
    top5 = pd.DataFrame()
    for nbh_id in category_counts.index:
        row = category_counts.loc[nbh_id].nlargest(5).map(lambda x: x if  x > 0 else np.nan)
        row = pd.Series(row.index, name=nbh_id)
        top5 = pd.concat([top5, row], axis=1)
    top5 = top5.T
    top5.index.name = 'Neighborhood'
    top5.columns = ["{}Most Popular".format(n) for n in ['', '2nd ', '3rd ', '4th ', '5th ']]
    return top5
top5 = get_top5(category_counts)
top5.head()

Unnamed: 0_level_0,Most Popular,2nd Most Popular,3rd Most Popular,4th Most Popular,5th Most Popular
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adelaide,Salad Place,Thai Restaurant,ATM,Accessories Store,African Restaurant
Agincourt North,Chinese Restaurant,Coffee Shop,Bakery,Sandwich Place,Bubble Tea Shop
Alderwood,Coffee Shop,Bank,Bar,Café,Discount Store
Bathurst Manor,Coffee Shop,Deli / Bodega,Middle Eastern Restaurant,Restaurant,Sandwich Place
Bayview Village,Chinese Restaurant,Bank,Café,Clothing Store,Coffee Shop


# Cluster neighborhoods

Cluster neighborhoods based on venue category frequencies

In [136]:
def cluster_label(category_counts):
    km = KMeans(n_clusters=5, n_init=1000)
    km.fit(category_counts)
    labels = pd.Series(km.labels_, index=category_counts.index, name='Cluster')
    neighborhoods = data[['Neighborhood', 'Latitude', 'Longitude']].groupby('Neighborhood').mean()
    labeled_neighborhoods = pd.concat([labels, neighborhoods], axis=1)
    return labeled_neighborhoods

labeled_neighborhoods = cluster_label(category_counts)
labeled_neighborhoods['Cluster'].value_counts()

2.0    105
0.0      5
1.0      3
3.0      2
4.0      2
Name: Cluster, dtype: int64

# Map neighborhoods, color-coded by cluster label
Clicking on a neighborhood marker will display a pop-up with the neighborhood name and the top 5 venue categories

In [139]:
def make_map(labeled_neighborhoods, top5):
    color_list = []
    for r, g, b in it.product(*[['00', 'cc']] * 3):
        color_list.append("#{}{}{}".format(r, g, b))
    color_list = color_list[1:-2]
    map_clusters = folium.Map(location=[labeled_neighborhoods['Latitude'].mean(), labeled_neighborhoods['Longitude'].mean()], zoom_start=11)
    for row in labeled_neighborhoods.iterrows():
        name, row = row
        cluster, lat, lng = row
        try:
            color = color_list[int(cluster)]
        except ValueError:
            color = '#000000'
        try:
            popup_text = "<b>{}</b><br>".format(name) + "<br>".join(["{}. {}".format(i, t) for i, t in zip(range(1, 6), top5.loc[name])])
        except KeyError:
            continue
        popup = folium.Popup(popup_text)
        marker = folium.CircleMarker([lat, lng], radius=6, popup=popup, color=color, fill=True, fillOpacity=0.7, min_width="100%", max_width='100%')
        marker.add_to(map_clusters)
    return map_clusters

In [172]:
map_clusters = make_map(labeled_neighborhoods, top5)
map_clusters

# Cluster neighborhoods based on venue category counts, but this time use broader categories

Foursquare has a category hierarchy. We will use this to translate specific venue categories (eg, Indian Restaurant) to a broader categories (eg, Food). This will decrease the dimensionality of the data used for k-means clustering

In [173]:
if not os.access('category_groups.json', os.R_OK):
    foursquare_categories = foursquare_template.format("categories")
    response = requests.get(foursquare_categories)
    response.raise_for_status()
    category_groups = response.json()
    with open('category_groups.json', 'w') as f:
        json.dump(category_groups, f)
else:
    with open('category_groups.json') as f:
        category_groups = json.load(f)

In [291]:
mapped_categories = {}
children = []
for parent in category_groups['response']['categories']:
    children.extend(parent['categories'])
    while children:
        child = children.pop()
        children.extend(child['categories'])
        mapped_categories[child['id']] = pd.Series({'Category ID': parent['id'], 'Category': parent['name']})

In [292]:
venues_alt = venues.copy()
venues_alt[['Category', 'Category ID']] = venues[['Category', 'Category ID']].apply(lambda x: mapped_categories[x.loc['Category ID']] 
                                                                                              if x.loc['Category ID'] in mapped_categories 
                                                                                              else x, 
                                                                                    axis=1)

Perform the same analysis as before:
1. Map 'Category' to boolean indicator features
2. Determine top 5 venue categories
3. Cluster neighborhoods based on venue frequencies
4. Map neighborhoods color-coded by cluster label

In [293]:
category_alt = venues_alt['Category']
category_alt = pd.get_dummies(category_alt, columns=['Category'], prefix="", prefix_sep="")
category_alt = pd.concat([venues_alt['Neighborhood'], category_alt], axis=1)
category_alt_grouped = category_alt.groupby('Neighborhood').mean()

In [294]:
top5_alt = get_top5(category_alt_grouped)
labeled_alt = cluster_label(category_alt_grouped)
labeled_alt['Cluster'].value_counts()

0.0    77
4.0    23
2.0    13
3.0     9
1.0     5
Name: Cluster, dtype: int64

In [295]:
map_alt = make_map(labeled_alt, top5_alt)
map_alt

# Cluster Summaries

In [296]:
def get_summary(labeled_neighborhoods, category_counts):
    cat_by_cluster = pd.concat([labeled_neighborhoods, category_counts], axis=1)
    cat_by_cluster = cat_by_cluster.drop(columns=['Latitude', 'Longitude']).groupby('Cluster').mean()
    summary = get_top5(cat_by_cluster)
    counts = labeled_neighborhoods['Cluster'].value_counts()
    counts.name = 'Neighborhood Count'
    summary = pd.concat([counts, summary], axis=1)
    summary.index.name = 'Cluster'
    return summary

## Using specific categories

In [297]:
get_summary(labeled_neighborhoods, category_counts).sort_values(by='Neighborhood Count', ascending=False)

Unnamed: 0_level_0,Neighborhood Count,Most Popular,2nd Most Popular,3rd Most Popular,4th Most Popular,5th Most Popular
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2.0,105,Coffee Shop,Park,Restaurant,Café,Bakery
0.0,5,Coffee Shop,Restaurant,Park,Café,Clothing Store
1.0,3,Park,Lighting Store,ATM,Accessories Store,African Restaurant
3.0,2,Pharmacy,Grocery Store,ATM,Accessories Store,African Restaurant
4.0,2,Supermarket,ATM,Accessories Store,African Restaurant,Airport


## Using broad categories

In [298]:
get_summary(labeled_alt, category_alt_grouped).sort_values(by='Neighborhood Count', ascending=False)

Unnamed: 0_level_0,Neighborhood Count,Most Popular,2nd Most Popular,3rd Most Popular,4th Most Popular,5th Most Popular
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,77,Food,Shop & Service,Outdoors & Recreation,Arts & Entertainment,Nightlife Spot
4.0,23,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment
2.0,13,Shop & Service,Food,Outdoors & Recreation,Arts & Entertainment,College & University
3.0,9,Travel & Transport,Arts & Entertainment,Professional & Other Places,Nightlife Spot,Shop & Service
1.0,5,Outdoors & Recreation,Shop & Service,Food,Arts & Entertainment,College & University


# Discussion

It appears that using the broader category definitions helped improve clustering. Using the specific categories, the vast majority of the neighborhoods were assigned to a single cluster, meaning the distinction between neighborhoods was not as good. 

Since k-means clustering the venue category frequency assumes independence between venue categories (eg: frequency differences between 'Indian Restaurant' and 'Italian Restaurant' are treated with the same weight as frequency differences between 'Indian Restaurant' and 'Dog Park'), it makes sense that broadening the categories would work better. Otherwise, more specifically labeled data takes away from the true frequency count of similar venues. For example, restaurants with the style of food they serve in the name reduce the frequency of the simpler 'Restaurant' category and allow the frequency of these types of venues to get washed out when compared to categories for which no sub-sectioning is really possible, like 'Airport'. Additionally, using the broader categories allows us not to worry about whether a venue labeled 'Restaurant' because they serve a wide menu or because whoever tagged the venue as such did not bother to be more specific.