In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
url

'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
html = urlopen(url)

In [5]:
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [6]:
rows = soup.find_all('tr')

In [7]:
import re
list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '', str_cells))
    list_rows.append(clean2)

In [8]:
df = pd.DataFrame(list_rows)

In [9]:
df.head()

Unnamed: 0,0
0,[]
1,"[M1A\n, Not assigned\n, Not assigned\n]"
2,"[M2A\n, Not assigned\n, Not assigned\n]"
3,"[M3A\n, North York\n, Parkwoods\n]"
4,"[M4A\n, North York\n, Victoria Village\n]"


In [10]:
df1 = df[0].str.split(',', expand=True)
df1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,[],,,,,,,,,,...,,,,,,,,,,
1,[M1A\n,Not assigned\n,Not assigned\n],,,,,,,,...,,,,,,,,,,
2,[M2A\n,Not assigned\n,Not assigned\n],,,,,,,,...,,,,,,,,,,
3,[M3A\n,North York\n,Parkwoods\n],,,,,,,,...,,,,,,,,,,
4,[M4A\n,North York\n,Victoria Village\n],,,,,,,,...,,,,,,,,,,


In [11]:
df1[0] = df1[0].str.strip('[')
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,],,,,,,,,,,...,,,,,,,,,,
1,M1A\n,Not assigned\n,Not assigned\n],,,,,,,,...,,,,,,,,,,
2,M2A\n,Not assigned\n,Not assigned\n],,,,,,,,...,,,,,,,,,,
3,M3A\n,North York\n,Parkwoods\n],,,,,,,,...,,,,,,,,,,
4,M4A\n,North York\n,Victoria Village\n],,,,,,,,...,,,,,,,,,,


In [12]:
range(df1.shape[1])

range(0, 31)

In [13]:
for i in range(df1.shape[1]):
    df1[i] = df1[i].str.strip(']')
    df1[i] = df1[i].str.strip('\n')


In [14]:
df1.rename(columns = {0: 'PostalCode', 1: 'Borough'}, inplace = True)
df1.head()

Unnamed: 0,PostalCode,Borough,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,,,,,,,,,,,...,,,,,,,,,,
1,M1A,Not assigned,Not assigned,,,,,,,,...,,,,,,,,,,
2,M2A,Not assigned,Not assigned,,,,,,,,...,,,,,,,,,,
3,M3A,North York,Parkwoods,,,,,,,,...,,,,,,,,,,
4,M4A,North York,Victoria Village,,,,,,,,...,,,,,,,,,,


In [15]:
df2 = df1[df1['Borough'] != ' Not assigned']

In [16]:
df2 = df2[1:]

In [17]:
df2.reset_index(drop = True, inplace = True)

In [18]:
df2['Neighborhood'] = df2[df2.columns[2:]].apply(lambda x: ','.join(x.dropna().astype(str)), axis = 1)
df2.head()

Unnamed: 0,PostalCode,Borough,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,Neighborhood
0,M3A,North York,Parkwoods,,,,,,,,...,,,,,,,,,,Parkwoods
1,M4A,North York,Victoria Village,,,,,,,,...,,,,,,,,,,Victoria Village
2,M5A,Downtown Toronto,Regent Park,Harbourfront,,,,,,,...,,,,,,,,,,"Regent Park, Harbourfront"
3,M6A,North York,Lawrence Manor,Lawrence Heights,,,,,,,...,,,,,,,,,,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,Queen's Park,Ontario Provincial Government,,,,,,,...,,,,,,,,,,"Queen's Park, Ontario Provincial Government"


In [19]:
data = df2[['PostalCode','Borough','Neighborhood']]
data.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor..."
103,,,
104,NL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n\nSK\n...,NL,"NS, PE, NB, QC, ON, MB, SK, AB, BC, NU/NT, YT..."
105,NL,NS,"PE, NB, QC, ON, MB, SK, AB, BC, NU/NT, YT"
106,A,B,"C, E, G, H, J, K, L, M, N, P, R, S, T, V, X, Y"


### When the tail of the "data" dataframe was displayed, the last three rows are unwanted data as they have PostalCodes that don't seem to make sense.

In [20]:
data = data[0:103]
data.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South ..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, H..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor..."


In [21]:
data.shape[0]

103

In [22]:
geodata = pd.read_csv("Geospatial_Coordinates.csv")

In [23]:
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [24]:
geodf = pd.merge(data, geodata, left_on ='PostalCode', right_on = 'Postal Code')

In [25]:
geodf.drop('Postal Code', axis = 1, inplace = True)

In [27]:
geodf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [62]:
trtdf = geodf[geodf['Borough'].str.contains("Toronto")].reset_index(drop = True)

In [63]:
trtdf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [38]:
trtdf.shape

(39, 5)

In [40]:
import json
!conda install -c conda-forge geopy --yes

Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/roxanneli/anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.0.0                |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-2.0.0-pyh9f0ad1d_0

The following packages will be UPDATED:

  conda                       pkgs/main::conda-4.8.3-py37_0 --> conda-forge::conda-4.8.3-py37hc8dfbb8_1

The following packages will be SUPERSEDED by a higher-pr

In [41]:
import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

In [42]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [43]:
from geopy.geocoders import Nominatim 

In [44]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Create map of Toronto

In [45]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(trtdf['Latitude'], trtdf['Longitude'], trtdf['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [50]:
CLIENT_ID = 'OVO4PKGIUFOTL1CFBRWNYD1R1KAK233BSXMEIL2EET0YSF1W' 
CLIENT_SECRET = 'XGGDYM10OPMAEVCIJ5LMIU2YMJMTBCKRF54JFPFH3P3JXF4G' 
VERSION = '20180605' 

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OVO4PKGIUFOTL1CFBRWNYD1R1KAK233BSXMEIL2EET0YSF1W
CLIENT_SECRET:XGGDYM10OPMAEVCIJ5LMIU2YMJMTBCKRF54JFPFH3P3JXF4G


In [65]:
trtdf = trtdf.sort_values(by = 'Borough')
trtdf.reset_index(drop = True, inplace = True)

In [66]:
trtdf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5N,Central Toronto,Roselawn,43.711695,-79.416936
1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049
2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
6,M5P,Central Toronto,"Forest Hill North &amp; West, Forest Hill Roa...",43.696948,-79.411307
7,M4P,Central Toronto,Davisville North,43.712751,-79.390197
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Explore the first neighborhood of Central Toronto: Roselawn

In [67]:
neighborhood_latitude = trtdf.loc[0, 'Latitude'] 
neighborhood_longitude = trtdf.loc[0, 'Longitude'] 
neighborhood_name = trtdf.loc[0, 'Neighborhood'] 

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of  Roselawn are 43.7116948, -79.41693559999999.


### Now, let's get the top 100 venues that are in Roselawn within a radius of 500 meters.

In [68]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=OVO4PKGIUFOTL1CFBRWNYD1R1KAK233BSXMEIL2EET0YSF1W&client_secret=XGGDYM10OPMAEVCIJ5LMIU2YMJMTBCKRF54JFPFH3P3JXF4G&v=20180605&ll=43.7116948,-79.41693559999999&radius=500&limit=100'

In [69]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f1a9c69aa50f96509a0b15a'},
  'headerLocation': 'Lawrence Park South',
  'headerFullLocation': 'Lawrence Park South, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.7161948045, 'lng': -79.41072165393975},
   'sw': {'lat': 43.707194795499994, 'lng': -79.42314954606023}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e6e176c45dd293273b74e3c',
       'name': "Rosalind's Garden Oasis",
       'location': {'lat': 43.71218888050602,
        'lng': -79.41197784736922,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.71218888050602,
          'lng': -79.41197784736922}],
        'distance': 402,
        'cc': 'CA',
        'city': 'Toronto',
        'st

In [70]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [71]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Rosalind's Garden Oasis,Garden,43.712189,-79.411978
1,Aquatics Academy Inc.,Pool,43.709951,-79.412127


## Explore neighborhoods in Toronto

In [72]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [73]:
toronto_venues = getNearbyVenues(names=trtdf['Neighborhood'],
                                   latitudes=trtdf['Latitude'],
                                   longitudes=trtdf['Longitude']
                                  )

 Roselawn
 Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
 Moore Park, Summerhill East
 Davisville
 Lawrence Park
 North Toronto West,  Lawrence Park
 Forest Hill North &amp; West, Forest Hill Road Park
 Davisville North
 The Annex, North Midtown, Yorkville
 Garden District, Ryerson
 Richmond, Adelaide, King
 Christie
 Harbourfront East, Union Station, Toronto Islands
 Central Bay Street
 Berczy Park
 St. James Town
 Toronto Dominion Centre, Design Exchange
 Commerce Court, Victoria Hotel
 Queen's Park, Ontario Provincial Government
 First Canadian Place, Underground city
 St. James Town, Cabbagetown
 Stn A PO Boxes
 Rosedale
 CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
 University of Toronto, Harbord
 Church and Wellesley
 Kensington Market, Chinatown, Grange Park
 Regent Park, Harbourfront
 The Beaches
 Business reply mail Processing Centre, South Central Letter Processing Plant Toronto
 The Danforth We

In [74]:
toronto_venues.head(10)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Roselawn,43.711695,-79.416936,Rosalind's Garden Oasis,43.712189,-79.411978,Garden
1,Roselawn,43.711695,-79.416936,Aquatics Academy Inc.,43.709951,-79.412127,Pool
2,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,The Market By Longo’s,43.686711,-79.399536,Supermarket
3,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,LCBO,43.686991,-79.399238,Liquor Store
4,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,Daeco Sushi,43.687838,-79.395652,Sushi Restaurant
5,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,Mary Be Kitchen,43.687708,-79.395062,Restaurant
6,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,Union Social Eatery,43.687895,-79.394916,American Restaurant
7,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,Starbucks,43.686756,-79.398292,Coffee Shop
8,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,Tim Hortons,43.687682,-79.39684,Coffee Shop
9,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,Fionn MacCool's,43.687921,-79.394783,Pub


In [75]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"Brockton, Parkdale Village, Exhibition Place",22,22,22,22,22,22
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",18,18,18,18,18,18
Central Bay Street,64,64,64,64,64,64
Christie,16,16,16,16,16,16
Church and Wellesley,75,75,75,75,75,75
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,35,35,35,35,35,35
Davisville North,10,10,10,10,10,10


In [76]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 231 uniques categories.


## Analyze each neighborhood

In [107]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Summerhill West, Rathnelly, South Hill, Fores...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Summerhill West, Rathnelly, South Hill, Fores...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Summerhill West, Rathnelly, South Hill, Fores...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Ha...",0.0,0.055556,0.055556,0.055556,0.111111,0.166667,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.0,0.015625
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,...,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026667
7,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Print the top 5 most common venues within each neighbouthood

In [109]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Berczy Park----
         venue  freq
0  Coffee Shop  0.09
1         Café  0.03
2     Beer Bar  0.03
3       Bakery  0.03
4   Restaurant  0.03


---- Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0            Café  0.14
1     Coffee Shop  0.09
2  Breakfast Spot  0.09
3   Grocery Store  0.05
4          Bakery  0.05


---- Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                  venue  freq
0    Light Rail Station  0.12
1           Pizza Place  0.06
2         Auto Workshop  0.06
3            Smoke Shop  0.06
4  Fast Food Restaurant  0.06


---- CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0   Airport Service  0.17
1    Airport Lounge  0.11
2  Airport Terminal  0.11
3             Plane  0.06
4           Airport  0.06


---- Central Bay Street----
                venue  freq
0         Coffee Shop  0.17
1                

In [110]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Create a new dataframe and display the top 5 venues for each neighborhood.

In [117]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Berczy Park,Coffee Shop,Seafood Restaurant,Cheese Shop,Bakery,Cocktail Bar
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Grocery Store,Climbing Gym
2,"Business reply mail Processing Centre, South ...",Light Rail Station,Smoke Shop,Auto Workshop,Pizza Place,Comic Shop
3,"CN Tower, King and Spadina, Railway Lands, Ha...",Airport Service,Airport Lounge,Airport Terminal,Coffee Shop,Harbor / Marina
4,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Salad Place


## Cluster Neighbourhoods

In [112]:
from sklearn.cluster import KMeans

In [118]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [119]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = trtdf

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2,Pool,Garden,Yoga Studio,Dance Studio,Eastern European Restaurant
1,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Fores...",43.686412,-79.400049,0,Pub,Coffee Shop,Bank,Supermarket,Sushi Restaurant
2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,1,Gym,Tennis Court,Yoga Studio,Dance Studio,Electronics Store
3,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Pizza Place,Sandwich Place,Dessert Shop,Gym,Italian Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Photography Studio,Park,Bus Line,Swim School,Dance Studio


### Visualize the resulting clusters

In [120]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters