<h1>Toronto</h1>

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

import folium # plotting library
from sklearn.cluster import KMeans
from matplotlib import cm
from colormap import rgb2hex

from bs4 import BeautifulSoup
import urllib.request

In [2]:
#Scrap postal codes V1
wikipedia='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
webpage = urllib.request.urlopen(wikipedia)
soup = BeautifulSoup(webpage, 'html.parser')
table = soup.find_all("table", class_="wikitable sortable")
#table

In [3]:
#Scrap postal codes V2
postal_codes=pd.read_html(wikipedia, attrs={"class": "wikitable"})[0]

#Suppress not assigned borough
postal_codes=postal_codes[postal_codes.Borough!='Not assigned']

#Replace "Not assigned" Neighborough py borough
postal_codes['Neighborhood'].loc[postal_codes['Neighborhood'] =='Not assigned'] =postal_codes['Borough']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [4]:
postal_codes[postal_codes['Postal Code']=="M5A"]

Unnamed: 0,Postal Code,Borough,Neighborhood
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
print(postal_codes.shape)

(103, 3)


In [6]:
#Tried to import data with geocoder, but it takes too long.
#import geocoder
#latitudes=[]
#longitudes=[]
#for i in postal_codes['Postal Code']:
    #print('{}, Toronto, Ontario'.format(i))
    #lat_lng_coords = None
    #while(lat_lng_coords is None):
        #g = geocoder.google('{}, Toronto, Ontario'.format(i))
        #lat_lng_coords = g.latlng
    #latitude = lat_lng_coords[0]
    #latitudes.append(latitude)
    #longitude = lat_lng_coords[1]
    #longitudes.append(longitude)
#postal_codes['latitude']=latitudes
#postal_codes['longitude']=longitudes

In [7]:
#Import data from csv
latlong=pd.read_csv('https://cocl.us/Geospatial_data')

#Merge with postal_codes

pc=postal_codes.merge(latlong,on='Postal Code')

In [8]:
if pc.shape[0]==postal_codes.shape[0]:
    print("Same number of rows, merge is ok")
else:
    print("Initial was {} rows, merged is {} rows, not ok".format(postal_codes.shape[0],pc.shape[0]))

pc.head()

Same number of rows, merge is ok


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [9]:
# create map of Toronto to visualize data
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(pc['Latitude'], pc['Longitude'], pc['Borough'], pc['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [10]:
#Connect to foursquare
mylink="C:/Users/legea/Documents/003_IBM/"

foursq=pd.read_csv(mylink+"foursquare.txt",header=None,sep=':')

foursq.columns=['type',"key"]

CLIENT_ID = foursq[foursq.type=='ID']['key'].item() # your Foursquare ID
CLIENT_SECRET = foursq[foursq.type=='SECRET']['key'].item() # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30


In [11]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
#Retrive venues by neightborood in toronto

toronto_venues = getNearbyVenues(names=pc['Neighborhood'],
                                   latitudes=pc['Latitude'],
                                   longitudes=pc['Longitude']
                                  )

In [14]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#Mean by neighborood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)

#Suppress Neighborhood with na - for some neighborhood, I don't have enough data!
toronto_grouped.dropna()

(94, 238)


Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.043478,0.000000,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,"Willowdale, Willowdale East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.033333,0.0,0.0,0.0,0.0
90,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
91,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
92,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.100000,0.000000,0.0,0.0,0.0,0.0


In [16]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [17]:
#Create a dataframe with top 5 venues ordered by neighborood

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Skating Rink,Breakfast Spot,Women's Store
1,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Gym,Pharmacy,Skating Rink
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Fried Chicken Joint,Ice Cream Shop,Supermarket
3,Bayview Village,Café,Japanese Restaurant,Bank,Chinese Restaurant,Department Store
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Restaurant,Sandwich Place,Italian Restaurant,Grocery Store


In [18]:
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#Merge to obtain coordinates for plotting
toronto_merged = pc

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Food & Drink Shop,Women's Store,Department Store,Ethiopian Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,2.0,French Restaurant,Pizza Place,Coffee Shop,Intersection,Hockey Arena
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2.0,Coffee Shop,Park,Breakfast Spot,Theater,Bakery
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,2.0,Clothing Store,Furniture / Home Store,Gift Shop,Arts & Crafts Store,Coffee Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2.0,Coffee Shop,Yoga Studio,Fried Chicken Joint,Burrito Place,Distribution Center


In [19]:
toronto_merged['Cluster Labels'].fillna(kclusters+1, inplace=True)
#toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels']

In [20]:
# create map

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
#colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow=['green','blue','red','yellow','pink','white','purple','black','lightblue']


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    cluster=int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_toronto)
       
map_toronto

In [21]:
#I am a park lover. Which cluster is the best for me?

In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
38,Scarborough,0.0,Department Store,Coffee Shop,Convenience Store,Discount Store,Women's Store
64,York,0.0,Convenience Store,Women's Store,Deli / Bodega,Ethiopian Restaurant,Electronics Store


In [26]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,1.0,Park,Food & Drink Shop,Women's Store,Department Store,Ethiopian Restaurant
21,York,1.0,Park,Women's Store,Pool,Dance Studio,Electronics Store
35,East York,1.0,Park,Convenience Store,Women's Store,Department Store,Ethiopian Restaurant
61,Central Toronto,1.0,Park,Swim School,Bus Line,Event Space,Electronics Store
66,North York,1.0,Park,Convenience Store,Bank,Women's Store,Department Store
68,Central Toronto,1.0,Park,Jewelry Store,Trail,Sushi Restaurant,Deli / Bodega
85,Scarborough,1.0,Park,Playground,Women's Store,Dance Studio,Electronics Store
91,Downtown Toronto,1.0,Park,Playground,Trail,Dance Studio,Electronics Store
98,Etobicoke,1.0,Park,River,Smoke Shop,Pool,Eastern European Restaurant


In [27]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,2.0,French Restaurant,Pizza Place,Coffee Shop,Intersection,Hockey Arena
2,Downtown Toronto,2.0,Coffee Shop,Park,Breakfast Spot,Theater,Bakery
3,North York,2.0,Clothing Store,Furniture / Home Store,Gift Shop,Arts & Crafts Store,Coffee Shop
4,Downtown Toronto,2.0,Coffee Shop,Yoga Studio,Fried Chicken Joint,Burrito Place,Distribution Center
6,Scarborough,2.0,Fast Food Restaurant,Print Shop,Women's Store,Deli / Bodega,Electronics Store
...,...,...,...,...,...,...,...
97,Downtown Toronto,2.0,Café,Coffee Shop,Restaurant,Tea Room,Seafood Restaurant
99,Downtown Toronto,2.0,Park,Gay Bar,Bookstore,Breakfast Spot,Bubble Tea Shop
100,East Toronto,2.0,Yoga Studio,Auto Workshop,Light Rail Station,Smoke Shop,Brewery
101,Etobicoke,2.0,Locksmith,Baseball Field,Business Service,Falafel Restaurant,Event Space


In [None]:
#I should definitly choose cluster 1!