# Scraping data with the use of BeautifulSoup package


In [143]:
# import the library we use to open URLs
import urllib.request
# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)
# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup
# parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(page, "lxml")

In [140]:
#print(soup.prettify())

In [144]:
# use the 'find_all' function to bring back all instances of the 'table' tag in the HTML and store in 'all_tables' variable
all_tables=soup.find_all("table")
#all_tables

In [145]:
right_table=soup.find('table', class_='wikitable sortable')
#right_table

In [146]:
A=[]
B=[]
C=[]


for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [147]:
import pandas as pd
df=pd.DataFrame(A,columns=['Postal Code'])
df['Borough']= B
df['Neighbourhood']= C

In [148]:
df['Postal Code'] = df['Postal Code'].str.replace(r'\n', '')
df['Borough'] = df['Borough'].str.replace(r'\n', '')
df['Neighbourhood'] = df['Neighbourhood'].str.replace(r'\n', '')
df = df[~df.Borough.str.contains("Not assigned")]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## last cell showing shape of the dataframe

In [149]:
df.shape

(103, 3)

In [23]:
postal_code = list(df['Postal Code'])
print(postal_code)

['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B', 'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C', 'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R', 'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S', 'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V', 'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X', 'M4Y', 'M7Y', 'M8Y', 'M8Z']


## Using geospatial data to create dataframe 

In [24]:
import requests
url="http://cocl.us/Geospatial_data"
lat_long=pd.read_csv("http://cocl.us/Geospatial_data")
lat_long.shape

(103, 3)

In [25]:
n = 0
lat = [None] * len(postal_code)
long = [None] * len(postal_code)
for row in lat_long.iterrows():
    c = (lat_long["Postal Code"][n])
    n += 1
    for idx, m in enumerate(postal_code):        
        if c == postal_code[idx]:
            lat[idx] =  lat_long["Latitude"][idx]
            long[idx] =  lat_long["Longitude"][idx]
        else:
            continue


In [30]:
df["Latitude"] = lat
df["Longitude"] = long
df = df.reset_index(drop=True)
df.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476


In [31]:
df.head(20)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.744734,-79.239476
6,M1B,Scarborough,"Malvern, Rouge",43.727929,-79.262029
7,M3B,North York,Don Mills,43.711112,-79.284577
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.716316,-79.239476
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848


In [42]:
CLIENT_ID = 'UY4YC2GHLQ3L2DYEF5FCUYLF1UULMO0ORRFVFB0GXFSB0NFS' # your Foursquare ID
CLIENT_SECRET = 'MDN53XXRKZOLS3ITF14ELCA4HKQENU25B1NODXM5PSLK3JWL' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: UY4YC2GHLQ3L2DYEF5FCUYLF1UULMO0ORRFVFB0GXFSB0NFS
CLIENT_SECRET:MDN53XXRKZOLS3ITF14ELCA4HKQENU25B1NODXM5PSLK3JWL


## clustering data as in the Manhattan analysis

In [32]:
torontohood_data = df[df['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
torontohood_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848
3,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389
4,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714


In [36]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

address = 'Downtown Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


Libraries imported.
The geograpical coordinate of Toronto are 43.6541737, -79.38081164513409.


In [38]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(torontohood_data['Latitude'], torontohood_data['Longitude'], torontohood_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [39]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [53]:
# type your answer here

toronto_venues = getNearbyVenues(names=torontohood_data['Neighbourhood'],
                                   latitudes=torontohood_data['Latitude'],
                                   longitudes=torontohood_data['Longitude']
                                  )



Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [54]:
print(toronto_venues.shape)
toronto_venues.head()

(165, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
1,"Regent Park, Harbourfront",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
2,"Regent Park, Harbourfront",43.763573,-79.188711,Sail Sushi,43.765951,-79.191275,Restaurant
3,"Regent Park, Harbourfront",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
4,"Regent Park, Harbourfront",43.763573,-79.188711,Enterprise Rent-A-Car,43.764076,-79.193406,Rental Car Location


In [55]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Central Bay Street,6,6,6,6,6,6
Christie,2,2,2,2,2,2
Church and Wellesley,8,8,8,8,8,8
"Commerce Court, Victoria Hotel",2,2,2,2,2,2
"First Canadian Place, Underground city",1,1,1,1,1,1
"Garden District, Ryerson",4,4,4,4,4,4
"Harbourfront East, Union Station, Toronto Islands",6,6,6,6,6,6
"Kensington Market, Chinatown, Grange Park",37,37,37,37,37,37
"Queen's Park, Ontario Provincial Government",8,8,8,8,8,8


In [56]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Airport,Athletics & Sports,Auto Workshop,Bakery,Bank,Bar,Baseball Field,Beer Store,Bookstore,Breakfast Spot,Brewery,Burger Joint,Burrito Place,Butcher,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,College Stadium,Comic Shop,Convenience Store,Curling Ice,Dessert Shop,Diner,Discount Store,Electronics Store,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Garden,Garden Center,Gas Station,General Entertainment,Gourmet Shop,Grocery Store,Gym,Gym / Fitness Center,Hakka Restaurant,Hardware Store,Health Food Store,Home Service,Ice Cream Shop,Indie Movie Theater,Intersection,Italian Restaurant,Latin American Restaurant,Light Rail Station,Liquor Store,Medical Center,Mexican Restaurant,Middle Eastern Restaurant,Movie Theater,Noodle House,Park,Pet Store,Pharmacy,Pizza Place,Pub,Recording Studio,Rental Car Location,Restaurant,Sandwich Place,Skate Park,Skating Rink,Smoothie Shop,Snack Place,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Thrift / Vintage Store,Trail,Vegetarian / Vegan Restaurant,Video Store,Wings Joint,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [57]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.shape

(18, 85)

In [59]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0       Yoga Studio  0.06
1     Burrito Place  0.06
2  Recording Studio  0.06
3        Restaurant  0.06
4       Pizza Place  0.06


----Central Bay Street----
            venue  freq
0  Discount Store  0.17
1     Pizza Place  0.17
2            Bank  0.17
3        Pharmacy  0.17
4   Grocery Store  0.17


----Christie----
                venue  freq
0                Park   0.5
1   Food & Drink Shop   0.5
2             Airport   0.0
3  Mexican Restaurant   0.0
4         Pizza Place   0.0


----Church and Wellesley----
                       venue  freq
0                Pizza Place  0.25
1  Middle Eastern Restaurant  0.12
2         Chinese Restaurant  0.12
3             Sandwich Place  0.12
4                Coffee Shop  0.12


----Commerce Court, Victoria Hotel----
            venue  freq
0             Gym   0.5
1           Trail   0.5
2         Airpo

In [60]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [135]:
num_top_venues = 20

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,"CN Tower, King and Spadina, Railway Lands, Har...",Yoga Studio,Skate Park,Auto Workshop,Brewery,Burrito Place,Butcher,Comic Shop,Farmers Market,Fast Food Restaurant,Garden,Garden Center,Light Rail Station,Park,Pizza Place,Recording Studio,Restaurant,Gym / Fitness Center,Sandwich Place,Chinese Restaurant,Food & Drink Shop
1,Central Bay Street,Grocery Store,Bank,Coffee Shop,Discount Store,Pharmacy,Pizza Place,Garden Center,Falafel Restaurant,Curling Ice,Dessert Shop,Diner,Gourmet Shop,General Entertainment,Electronics Store,Farmers Market,Garden,Fast Food Restaurant,Fish & Chips Shop,Gas Station,French Restaurant
2,Christie,Food & Drink Shop,Park,Fish & Chips Shop,Dessert Shop,Diner,Discount Store,Electronics Store,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Yoga Studio,Gym,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Garden,Garden Center,Gas Station,General Entertainment,Gourmet Shop
3,Church and Wellesley,Pizza Place,Discount Store,Coffee Shop,Chinese Restaurant,Sandwich Place,Middle Eastern Restaurant,Intersection,Fast Food Restaurant,Diner,Electronics Store,Falafel Restaurant,Farmers Market,Fish & Chips Shop,Curling Ice,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Garden,Garden Center
4,"Commerce Court, Victoria Hotel",Gym,Trail,Fish & Chips Shop,Curling Ice,Dessert Shop,Diner,Discount Store,Electronics Store,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Yoga Studio,Convenience Store,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Garden,Garden Center,Gas Station,General Entertainment


In [136]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 2, 0, 3, 1, 4, 0, 0, 0])

In [137]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = torontohood_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711,0.0,Electronics Store,Bank,Rental Car Location,Restaurant,Mexican Restaurant,Medical Center,Breakfast Spot,Intersection,Yoga Studio,Farmers Market,Diner,Discount Store,Falafel Restaurant,Fish & Chips Shop,Fast Food Restaurant,Curling Ice,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476,0.0,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Thai Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Yoga Studio,Fish & Chips Shop,Discount Store,Electronics Store,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Furniture / Home Store,Food & Drink Shop,French Restaurant,Dessert Shop,Garden
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848,4.0,College Stadium,General Entertainment,Café,Skating Rink,Yoga Studio,Fast Food Restaurant,Diner,Discount Store,Electronics Store,Falafel Restaurant,Farmers Market,Fish & Chips Shop,Curling Ice,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Garden,Garden Center,Gas Station
3,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389,0.0,Chinese Restaurant,Pharmacy,Supermarket,Discount Store,Pizza Place,Coffee Shop,Sandwich Place,Fast Food Restaurant,Breakfast Spot,Noodle House,Grocery Store,Gym,Bank,Thrift / Vintage Store,French Restaurant,Electronics Store,Dessert Shop,Diner,Garden Center,Falafel Restaurant
4,M5E,Downtown Toronto,Berczy Park,43.75749,-79.374714,,,,,,,,,,,,,,,,,,,,,


In [138]:
toronto_merged = toronto_merged.dropna()
toronto_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711,0.0,Electronics Store,Bank,Rental Car Location,Restaurant,Mexican Restaurant,Medical Center,Breakfast Spot,Intersection,Yoga Studio,Farmers Market,Diner,Discount Store,Falafel Restaurant,Fish & Chips Shop,Fast Food Restaurant,Curling Ice,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476,0.0,Athletics & Sports,Hakka Restaurant,Bakery,Bank,Thai Restaurant,Gas Station,Fried Chicken Joint,Caribbean Restaurant,Yoga Studio,Fish & Chips Shop,Discount Store,Electronics Store,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Furniture / Home Store,Food & Drink Shop,French Restaurant,Dessert Shop,Garden
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848,4.0,College Stadium,General Entertainment,Café,Skating Rink,Yoga Studio,Fast Food Restaurant,Diner,Discount Store,Electronics Store,Falafel Restaurant,Farmers Market,Fish & Chips Shop,Curling Ice,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Garden,Garden Center,Gas Station
3,M5C,Downtown Toronto,St. James Town,43.799525,-79.318389,0.0,Chinese Restaurant,Pharmacy,Supermarket,Discount Store,Pizza Place,Coffee Shop,Sandwich Place,Fast Food Restaurant,Breakfast Spot,Noodle House,Grocery Store,Gym,Bank,Thrift / Vintage Store,French Restaurant,Electronics Store,Dessert Shop,Diner,Garden Center,Falafel Restaurant
5,M5G,Downtown Toronto,Central Bay Street,43.782736,-79.442259,0.0,Grocery Store,Bank,Coffee Shop,Discount Store,Pharmacy,Pizza Place,Garden Center,Falafel Restaurant,Curling Ice,Dessert Shop,Diner,Gourmet Shop,General Entertainment,Electronics Store,Farmers Market,Garden,Fast Food Restaurant,Fish & Chips Shop,Gas Station,French Restaurant


## creating a final map of neighborhoods in Downtown Toronto with similar venues

In [139]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters