In [51]:
import pandas as pd
import numpy as np
import json
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')


Libraries imported.


In [52]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

## Reading html file to make a dataframe

In [53]:
html_df = pd.read_html(url)

In [54]:
df_toronto = html_df[0]

### Removing unassigned rows

In [55]:
df_toronto = df_toronto[df_toronto['Borough']!='Not assigned']

In [56]:
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Importing files iwth coordinates

In [57]:
def geo_coordinates(borough):
    address = borough
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    return {'latitude':location.latitude, 'longitude': location.longitude}


In [58]:
df_toronto['Borough'][2]

'North York'

In [59]:
geo_data = 'https://cocl.us/Geospatial_data'

In [60]:
df_coord = pd.read_csv(geo_data)

In [61]:
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Merging the two imported files

- Some files do not have details so we merge with 'right' as the how

In [62]:
df  = pd.merge(df_toronto, df_coord ,on='Postal Code', how='right')

In [63]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Select boroughs with with word <u>Toronto</u> in it

In [64]:
contains_toronto = ['Toronto' in i for i in df['Borough']]

In [65]:
df_has_toronto = df[contains_toronto]

In [66]:
toronto_coords =geo_coordinates('Toronto')

In [67]:
latitude = toronto_coords['latitude']
longitude = toronto_coords['longitude']

In [68]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_has_toronto['Latitude'], df_has_toronto['Longitude'], df_has_toronto['Borough'], df_has_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Define Foursquare credentials

In [69]:
CLIENT_ID = 'YAASYYGZFGUNUKB3CC4S4RODQQI0VUU31AUS4BPIIW4TTYOO' # your Foursquare ID
CLIENT_SECRET = 'CRQQD5SOQF1IZQQLDMVJDVXMAVLWFODSOX5FK2VLZ1BIMFR5' # your Foursquare Secret
VERSION = '20180604' #'20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YAASYYGZFGUNUKB3CC4S4RODQQI0VUU31AUS4BPIIW4TTYOO
CLIENT_SECRET:CRQQD5SOQF1IZQQLDMVJDVXMAVLWFODSOX5FK2VLZ1BIMFR5


In [70]:
df_has_toronto = df_has_toronto.reset_index()[['Postal Code', 'Borough', 'Neighborhood', 'Latitude','Longitude']]

In [71]:
df_has_toronto.head(3)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Selecting neighborhood

In [72]:
neighborhood = df_has_toronto['Neighborhood'][0].split(',')[1]

In [73]:
neighorhood_borough = df_has_toronto['Borough'][0]

In [74]:
neighborhood_latitude = geo_coordinates( neighborhood)['latitude'] # neighborhood latitude value
neighborhood_longitude = geo_coordinates( neighborhood)['longitude'] # neighborhood longitude value

neighborhood_name = neighborhood # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of  Harbourfront are 1.2653951, 103.8224032.


In [75]:
neighborhood

' Harbourfront'

In [76]:
# type your answer here
radius =500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)


In [77]:
results = requests.get(url).json()


In [78]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [79]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,VivoCity,Shopping Mall,1.264713,103.821918
1,Fairprice Xtra,Supermarket,1.264032,103.82268
2,Marks & Spencer,Department Store,1.264012,103.823455
3,GVmax,Movie Theater,1.264315,103.822359
4,Hans im Glück,German Restaurant,1.263173,103.823275


In [80]:
nearby_venues['Neighborhood'] =neighborhood_name

In [81]:
nearby_venues['Neighborhood latitude'] =neighborhood_latitude
nearby_venues['Neighborhood longitude'] =neighborhood_longitude

In [82]:
nearby_venues.columns

Index(['name', 'categories', 'lat', 'lng', 'Neighborhood',
       'Neighborhood latitude', 'Neighborhood longitude'],
      dtype='object')

In [83]:
nearby_venues.rename(columns={'name': 'Venue', 'lat': 'Venue Latitude', 'lng':'Venue Longitude'})

Unnamed: 0,Venue,categories,Venue Latitude,Venue Longitude,Neighborhood,Neighborhood latitude,Neighborhood longitude
0,VivoCity,Shopping Mall,1.264713,103.821918,Harbourfront,1.265395,103.822403
1,Fairprice Xtra,Supermarket,1.264032,103.822680,Harbourfront,1.265395,103.822403
2,Marks & Spencer,Department Store,1.264012,103.823455,Harbourfront,1.265395,103.822403
3,GVmax,Movie Theater,1.264315,103.822359,Harbourfront,1.265395,103.822403
4,Hans im Glück,German Restaurant,1.263173,103.823275,Harbourfront,1.265395,103.822403
...,...,...,...,...,...,...,...
95,True Fitness,Gym,1.263021,103.820281,Harbourfront,1.265395,103.822403
96,HarbourFront Tower 2,Office,1.264671,103.818725,Harbourfront,1.265395,103.822403
97,Starbucks,Coffee Shop,1.264817,103.820166,Harbourfront,1.265395,103.822403
98,TCC – The Connoisseur Concerto,Café,1.263474,103.821449,Harbourfront,1.265395,103.822403


In [84]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### cleaning the data
   - Get the comman separated values in <code>Neighborhood</code> and split them to make more rows
        -  I used the code below <br/>
            <code>reshaped = (df.set_index(df.columns.drop('var1',1).tolist()).var1.str.split(',', expand=True).stack().reset_index().rename(columns={0:'var1'}).loc[:, df.columns])</code>

In [85]:
toronto_neighborhoods = (df.set_index(df.columns.drop('Neighborhood',1).tolist()).Neighborhood.str.split(', ', expand=True).stack()
.reset_index().rename(columns={0: 'Neighborhood'}).loc[:, df.columns])

In [86]:
toronto_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


In [87]:
north_york = toronto_neighborhoods[toronto_neighborhoods['Borough']=='North York']

## Exploring neighborhoods in Toronto

In [88]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [90]:
toronto_venues = getNearbyVenues(names=north_york['Neighborhood'],
                                   latitudes=north_york['Latitude'],
                                   longitudes=north_york['Longitude']
                                  )


Parkwoods
Victoria Village
Lawrence Manor
Lawrence Heights
Don Mills
Glencairn
Don Mills
Hillcrest Village
Bathurst Manor
Wilson Heights
Downsview North
Fairview
Henry Farm
Oriole
Northwood Park
York University
Bayview Village
Downsview
York Mills
Silver Hills
Downsview
North Park
Maple Leaf Park
Upwood Park
Humber Summit
Willowdale
Newtonbrook
Downsview
Bedford Park
Lawrence Manor East
Humberlea
Emery
Willowdale
Willowdale East
Downsview
York Mills West
Willowdale
Willowdale West


In [91]:
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.753259,-79.329656,Brookbanks Pool,43.751389,-79.332184,Pool
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
...,...,...,...,...,...,...,...
520,Willowdale West,43.782736,-79.442259,RBC Royal Bank,43.783894,-79.446603,Bank
521,Willowdale West,43.782736,-79.442259,Dollarama,43.784670,-79.446670,Discount Store
522,Willowdale West,43.782736,-79.442259,Tim Hortons,43.780940,-79.444231,Coffee Shop
523,Willowdale West,43.782736,-79.442259,Price Chopper,43.783237,-79.446339,Grocery Store


In [92]:
toronto_venues.shape

(525, 7)

In [93]:
toronto_venues.groupby('Neighborhood').count().head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bathurst Manor,20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
Bedford Park,26,26,26,26,26,26
Don Mills,26,26,26,26,26,26
Downsview,17,17,17,17,17,17


### Analysing each neighborhood

In [94]:
# one hot encoding
york_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [york_onehot.columns[-1]] + list(york_onehot.columns[:-1])
york_onehot = york_onehot[fixed_columns]

york_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Steakhouse,Supermarket,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
york_grouped = york_onehot.groupby('Neighborhood').mean().reset_index()

In [96]:
num_top_venues = 5

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = york_grouped[york_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor----
                 venue  freq
0          Coffee Shop  0.10
1                 Bank  0.10
2             Pharmacy  0.05
3  Fried Chicken Joint  0.05
4                 Park  0.05


----Bayview Village----
                 venue  freq
0   Chinese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25
3  Japanese Restaurant  0.25
4    Accessories Store  0.00


----Bedford Park----
                venue  freq
0  Italian Restaurant  0.12
1      Sandwich Place  0.08
2         Pizza Place  0.08
3         Coffee Shop  0.08
4          Restaurant  0.08


----Don Mills----
                 venue  freq
0                  Gym  0.12
1           Restaurant  0.08
2          Coffee Shop  0.08
3     Asian Restaurant  0.08
4  Japanese Restaurant  0.08


----Downsview----
            venue  freq
0   Grocery Store  0.18
1            Park  0.12
2  Baseball Field  0.06
3  Discount Store  0.06
4    Liquor Store  0.06


----Downsview North----
                 venue  freq

In [97]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [98]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

for ind in np.arange(york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bathurst Manor,Coffee Shop,Bank,Mobile Phone Shop,Sandwich Place,Gas Station,Diner,Ice Cream Shop,Deli / Bodega,Middle Eastern Restaurant,Park
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store,Dog Run,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
2,Bedford Park,Italian Restaurant,Restaurant,Coffee Shop,Pizza Place,Sandwich Place,Grocery Store,Café,Juice Bar,Comfort Food Restaurant,Indian Restaurant
3,Don Mills,Gym,Japanese Restaurant,Asian Restaurant,Coffee Shop,Restaurant,Beer Store,Sandwich Place,Discount Store,Italian Restaurant,Dim Sum Restaurant
4,Downsview,Grocery Store,Park,Liquor Store,Airport,Food Truck,Gym / Fitness Center,Athletics & Sports,Discount Store,Bank,Korean Restaurant


In [99]:
# set number of clusters
kclusters = 5

york_grouped_clustering = york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 4, 4, 4, 4, 2, 4, 4, 4])

In [104]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

york_merged = north_york

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

york_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Park,Pool,Food & Drink Shop,Diner,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,4.0,Coffee Shop,French Restaurant,Hockey Arena,Pizza Place,Portuguese Restaurant,Intersection,Women's Store,Dim Sum Restaurant,Comfort Food Restaurant,Construction & Landscaping
4,M6A,North York,Lawrence Manor,43.718518,-79.464763,4.0,Clothing Store,Accessories Store,Furniture / Home Store,Boutique,Event Space,Miscellaneous Shop,Coffee Shop,Vietnamese Restaurant,Bakery,Construction & Landscaping
5,M6A,North York,Lawrence Heights,43.718518,-79.464763,4.0,Clothing Store,Accessories Store,Furniture / Home Store,Boutique,Event Space,Miscellaneous Shop,Coffee Shop,Vietnamese Restaurant,Bakery,Construction & Landscaping
12,M3B,North York,Don Mills,43.745906,-79.352188,4.0,Gym,Japanese Restaurant,Asian Restaurant,Coffee Shop,Restaurant,Beer Store,Sandwich Place,Discount Store,Italian Restaurant,Dim Sum Restaurant


In [105]:
york_merged.dropna(inplace=True)

In [106]:
york_merged['Cluster Labels'] = york_merged['Cluster Labels'].astype(int)

In [107]:
york_merged['Cluster Labels'].unique()

array([0, 4, 1, 3, 2])

In [108]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters