# 1. Importing data from a webpage
### Importing libraries

In [2]:
import pandas as pd
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.



### the following lines imports all the tables in the wikipedia webpage and counts them

In [16]:
df_CP = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
print('there are', len(df_CP), ' tables in the wikipedia web page')

there are 3  tables in the wikipedia web page


### After checking on the webpage, the postale code table is the first one.The following lines put in the dataframe df_CP only the first table and display it

In [17]:
df_CP = df_CP[0]
df_CP.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


### Cleaning data
let's see how are organized the columns 'Borough' and 'Neighborhood'

In [18]:
Not_assigned_Neighborhood = len(df_CP.loc[df_CP['Neighborhood'].isin(['Not assigned'])])
NaN_Neighborhood = df_CP["Neighborhood"].isna().sum()
Not_assigned_Borough = len(df_CP.loc[df_CP['Borough'].isin(['Not assigned'])])
NaN_Borough = df_CP["Borough"].isna().sum()

print('There are',Not_assigned_Neighborhood, ' non assigned Neighborhood and', Not_assigned_Borough, 'non assigned Boroughs')
print('There are',NaN_Neighborhood, ' data missing in the column Neighborhood and', NaN_Borough, 'data missing in the column Boroughs')

There are 0  non assigned Neighborhood and 77 non assigned Boroughs
There are 77  data missing in the column Neighborhood and 0 data missing in the column Boroughs


From the precedent results we can make the hypothesis that all the rows where there is a 'Non assigned' Boroughs has a missing data in the column 'Neighborhood'. Let's drop the rows which contain the missing data: 

In [19]:
df_CP = df_CP.dropna()
df_CP.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


### Let's now check if there are some 'Not assigned' values left after processing

In [20]:
Not_assigned_Neighborhood = len(df_CP.loc[df_CP['Neighborhood'].isin(['Not assigned'])])
NaN_Neighborhood = df_CP["Neighborhood"].isna().sum()
Not_assigned_Borough = len(df_CP.loc[df_CP['Borough'].isin(['Not assigned'])])
NaN_Borough = df_CP["Borough"].isna().sum()

print('There are',Not_assigned_Neighborhood, ' non assigned Neighborhood and', Not_assigned_Borough, 'non assigned Boroughs after processing')
print('There are',NaN_Neighborhood, ' data missing in the column Neighborhood and', NaN_Borough, 'data missing in the column Boroughs after processing')

There are 0  non assigned Neighborhood and 0 non assigned Boroughs after processing
There are 0  data missing in the column Neighborhood and 0 data missing in the column Boroughs after processing


### Now the data frame doesn't contain any missing data and 'not assigned' borrows. Let's check the number of rows in the new data frame:

In [21]:
size = df_CP.shape
print('There are', size[0], 'rows in the data frame now')

There are 103 rows in the data frame now


# 2. Getting the coordinates of each postal code

### Starting by installing geocoder

In [22]:
#!conda install -c conda-forge geocoder --yes 

### I wanted to try the geocoder package but it return 'Request Denied'

In [15]:

#import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
  lat_lng_coords = g.latlng
  print(g)

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

ModuleNotFoundError: No module named 'geocoder'

### So I broke the 'while' loop and decided to work with the given csv file

In [23]:
!wget -q -O 'location_data.csv' http://cocl.us/Geospatial_data #Download the data and name it 'location_data.csv'
df_coord = pd.read_csv('location_data.csv')                    #read the CSV file
DF_Located = pd.merge(df_CP, df_coord, on='Postal Code')       #merging the 'location' dataframe with the previous dataframe by matching the column 'postal code'

### Let's check the data

In [24]:
DF_Located.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Let's create a new dataframe which contains only the borough 'Downtown Toronto'

In [25]:
Toronto_data = DF_Located[DF_Located['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
Toronto_data.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
8,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
9,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576


# 3. Exploring Toronto

### Let's start by show Toronto map and ploting only downtown Toronto neighborhoods

In [26]:
TorLat = 43.651070   #Latitude
TorLong = -79.347015 #Longitude
map_dwntn_toronto = folium.Map(location=[TorLat, TorLong], zoom_start=10) #call Folium to creat the map
# adding markers to the map
for lat, lng, borough, neighborhood in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Borough'], Toronto_data['Neighborhood']):
    label = '{}, {}'.format(Toronto_data, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_dwntn_toronto)  
    
map_dwntn_toronto

In [27]:
CLIENT_ID = 'XXXXXX'
CLIENT_SECRET = 'XXXXXXXX'
VERSION = '20180604'
LIMIT = 100

### Creation of a fuction which explore all the neighborhood in downtown Toronto

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Running the function to have all nearby venues in neighborhoods

In [29]:
Toronto_venues = getNearbyVenues(names=Toronto_data['Neighborhood'],
                                   latitudes=Toronto_data['Latitude'],
                                   longitudes=Toronto_data['Longitude']
                                  )

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


### Infos on venues

In [30]:
print('There are a total of',Toronto_venues.shape[0], 'venues in downtown Toronto grouped in' , len(Toronto_venues['Venue Category'].unique()), 'categories')
Toronto_venues.head()

There are a total of 1215 venues in downtown Toronto grouped in 207 categories


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


# 4. Downtown Toronto Neighborhood Analysis
### Creating a dataframe  in which each column correspond to a venue's category

In [31]:
Toronto_spots = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_spots['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_spots.columns[-1]] + list(Toronto_spots.columns[:-1])
Toronto_spots = Toronto_spots[fixed_columns]
Neighborhood_grouped = Toronto_spots.groupby('Neighborhood').mean().reset_index()

### Showing the top 5 venue's categories in each neighborhood

In [32]:
venues = 5
print('#########################################################################################')
print('\n')
print('#####  here are the top 5 venue categories in each neighborhood of downtown Toronto #####')
print('\n')
print('##########################################################################################')
print('\n')
for hood in Neighborhood_grouped['Neighborhood']:
    print("----"+hood+"----")
    tmp = Neighborhood_grouped[Neighborhood_grouped['Neighborhood'] == hood].T.reset_index()
    tmp.columns = ['venue','freq']
    tmp = tmp.iloc[1:]
    tmp['freq'] = tmp['freq'].astype(float)
    tmp = tmp.round({'freq': 2})
    print(tmp.sort_values('freq', ascending=False).reset_index(drop=True).head(venues))
    print('\n')

#########################################################################################


#####  here are the top 5 venue categories in each neighborhood of downtown Toronto #####


##########################################################################################


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2  Seafood Restaurant  0.04
3                Café  0.04
4         Cheese Shop  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
             venue  freq
0  Airport Service  0.19
1   Airport Lounge  0.12
2         Boutique  0.06
3  Harbor / Marina  0.06
4          Airport  0.06


----Central Bay Street----
                venue  freq
0         Coffee Shop  0.16
1                Café  0.06
2  Italian Restaurant  0.06
3        Dessert Shop  0.04
4      Sandwich Place  0.04


----Christie----
                venue  freq
0       Grocery Store  0

# 5. Searching the 10 most comon venues in each neighborhood

### Creating a function which return the <i> n </i>  most comon venues 
#### <i> with n the number of most comon venues choosen by the user </i>

In [33]:
def return_most_common_venues(row, n):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:n]

### Print the 10 most comon venues

In [92]:
venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Neighborhood_grouped['Neighborhood']

for ind in np.arange(Neighborhood_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Neighborhood_grouped.iloc[ind, :], venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Bakery,Restaurant,Cheese Shop,Café,Seafood Restaurant,Park,Beach
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Sculpture Garden,Boat or Ferry,Boutique,Coffee Shop,Airport,Airport Food Court,Airport Gate,Airport Terminal
2,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Dessert Shop,Thai Restaurant,Japanese Restaurant,Ice Cream Shop,Bar,Burger Joint
3,Christie,Grocery Store,Café,Park,Coffee Shop,Baby Store,Restaurant,Candy Store,Italian Restaurant,Diner,Nightclub
4,Church and Wellesley,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Restaurant,Gay Bar,Hotel,Pub,Men's Store,Mediterranean Restaurant,Gastropub


# 5. Clustering the Neighborhood

### kmean calculus

In [93]:
# set number of clusters
kclusters = 5

Toronto_clus = Neighborhood_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_clus)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 4, 2, 0, 0, 0, 0, 4, 0], dtype=int32)

In [94]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_Clustered = Toronto_data
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_Clustered = Toronto_Clustered.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
Toronto_Clustered.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4,Coffee Shop,Bakery,Park,Restaurant,Pub,Theater,Breakfast Spot,Café,Cosmetics Shop,Performing Arts Venue
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4,Coffee Shop,Sushi Restaurant,Yoga Studio,Juice Bar,College Auditorium,Italian Restaurant,Sandwich Place,Japanese Restaurant,Distribution Center,Diner
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Café,Italian Restaurant,Japanese Restaurant,Bubble Tea Shop,Middle Eastern Restaurant,Cosmetics Shop,Restaurant,Diner
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Cocktail Bar,Gastropub,American Restaurant,Cosmetics Shop,Gym,Italian Restaurant,Beer Bar,Creperie
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Beer Bar,Bakery,Restaurant,Cheese Shop,Café,Seafood Restaurant,Park,Beach


# 6. Visualizing results on the map

In [95]:
map_clusters = folium.Map(location=[TorLat, TorLong], zoom_start=13)


# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
Toronto_Clustered['Cluster Labels']
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_Clustered['Latitude'], Toronto_Clustered['Longitude'], Toronto_Clustered['Neighborhood'], Toronto_Clustered['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# 7. Exploring the clusters

In [104]:
Clus0 = Toronto_Clustered.loc[Toronto_Clustered['Cluster Labels'] == 0, Toronto_Clustered.columns[[1] + list(range(6, Toronto_Clustered.shape[1]))]]

In [97]:
Clus1 = Toronto_Clustered.loc[Toronto_Clustered['Cluster Labels'] == 1, Toronto_Clustered.columns[[1] + list(range(6, Toronto_Clustered.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,Park,Trail,Playground,Women's Store,Convention Center,Distribution Center,Discount Store,Diner,Dessert Shop,Department Store


In [98]:
Clus2 = Toronto_Clustered.loc[Toronto_Clustered['Cluster Labels'] == 2, Toronto_Clustered.columns[[1] + list(range(6, Toronto_Clustered.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,Downtown Toronto,Grocery Store,Café,Park,Coffee Shop,Baby Store,Restaurant,Candy Store,Italian Restaurant,Diner,Nightclub


In [99]:
Clus3 = Toronto_Clustered.loc[Toronto_Clustered['Cluster Labels'] == 3, Toronto_Clustered.columns[[1] + list(range(6, Toronto_Clustered.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,Airport Service,Airport Lounge,Sculpture Garden,Boat or Ferry,Boutique,Coffee Shop,Airport,Airport Food Court,Airport Gate,Airport Terminal


In [100]:
Clus4 = Toronto_Clustered.loc[Toronto_Clustered['Cluster Labels'] ==4, Toronto_Clustered.columns[[1] + list(range(6, Toronto_Clustered.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,Coffee Shop,Bakery,Park,Restaurant,Pub,Theater,Breakfast Spot,Café,Cosmetics Shop,Performing Arts Venue
1,Downtown Toronto,Coffee Shop,Sushi Restaurant,Yoga Studio,Juice Bar,College Auditorium,Italian Restaurant,Sandwich Place,Japanese Restaurant,Distribution Center,Diner
5,Downtown Toronto,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Dessert Shop,Thai Restaurant,Japanese Restaurant,Ice Cream Shop,Bar,Burger Joint
8,Downtown Toronto,Coffee Shop,Aquarium,Café,Hotel,Fried Chicken Joint,Italian Restaurant,Restaurant,Scenic Lookout,Sporting Goods Shop,Brewery


In [76]:
Clus5 = Toronto_Clustered.loc[Toronto_Clustered['Cluster Labels'] == 5, Toronto_Clustered.columns[[1] + list(range(6, Toronto_Clustered.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,Clothing Store,Coffee Shop,Café,Italian Restaurant,Japanese Restaurant,Bubble Tea Shop,Middle Eastern Restaurant,Cosmetics Shop,Restaurant,Diner
3,Downtown Toronto,Café,Coffee Shop,Cocktail Bar,Gastropub,American Restaurant,Cosmetics Shop,Gym,Italian Restaurant,Beer Bar,Creperie
4,Downtown Toronto,Coffee Shop,Cocktail Bar,Beer Bar,Bakery,Restaurant,Cheese Shop,Café,Seafood Restaurant,Park,Beach
12,Downtown Toronto,Café,Coffee Shop,Mexican Restaurant,Vietnamese Restaurant,Dessert Shop,Bakery,Bar,Gaming Cafe,Vegetarian / Vegan Restaurant,Burrito Place
15,Downtown Toronto,Coffee Shop,Café,Cocktail Bar,Seafood Restaurant,Japanese Restaurant,Italian Restaurant,Beer Bar,Restaurant,Bakery,Art Gallery
16,Downtown Toronto,Coffee Shop,Restaurant,Pub,Pizza Place,Café,Park,Pharmacy,Market,Italian Restaurant,Bakery
18,Downtown Toronto,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Restaurant,Gay Bar,Hotel,Pub,Men's Store,Mediterranean Restaurant,Gastropub


# 8. Renaming clusters
### Here I gave a name for each cluster number. The name is the most comon category of the 5 first most common venue
### i.e : in Cluster 0 the most comon categories in the 5 first most common venue are: Café and Restaurant, so the cluster will be named 'Café, Restaurant'

In [414]:
Cluster_Name = pd.DataFrame(columns = ['Cluster num', 'Cluster Labels'])
Cluster_Name['Cluster num'] = Toronto_Clustered['Cluster Labels']
test = []
clusName = []
for j in range(5):
    t = []
    Clus = Toronto_Clustered.loc[Toronto_Clustered['Cluster Labels'] == j, Toronto_Clustered.columns[[1] + list(range(6, Toronto_Clustered.shape[1]))]]
    for i in range(1,5):
        t.append(Clus.loc[1:,Clus0.columns[i]].mode()[0]) 
    clusName = list(set(t[1:]))
    #Cluster_Name.loc(Toronto_Clustered['Cluster Labels'],'Cluster name') = str(clusNames)
    print(clusName)
    test.append(clusName)

for k in range(5):
    Cluster_Name.loc[Toronto_Clustered['Cluster Labels'] == k, 'Cluster Labels'] = ", ".join(test[k])
   

['Café', 'Restaurant']
["Women's Store", 'Playground', 'Trail']
['Café', 'Coffee Shop', 'Park']
['Sculpture Garden', 'Airport Lounge', 'Boat or Ferry']
['Café', 'Hotel', 'Aquarium']


### Verifying the new dataframe which contains the new labels

In [415]:
Cluster_Name

Unnamed: 0,Cluster num,Cluster Labels
0,4,"Café, Hotel, Aquarium"
1,4,"Café, Hotel, Aquarium"
2,0,"Café, Restaurant"
3,0,"Café, Restaurant"
4,0,"Café, Restaurant"
5,4,"Café, Hotel, Aquarium"
6,2,"Café, Coffee Shop, Park"
7,0,"Café, Restaurant"
8,4,"Café, Hotel, Aquarium"
9,0,"Café, Restaurant"


# 9. Plotting the new figure
### Now if you clic on a cluster you will see the neighborhood and the venues of the cluster.

In [425]:
map_clusters = folium.Map(location=[TorLat, TorLong], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
Toronto_Clustered['Cluster Labels']
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster, k in zip(Toronto_Clustered['Latitude'], Toronto_Clustered['Longitude'], Toronto_Clustered['Neighborhood'], Cluster_Name['Cluster Labels'], Cluster_Name['Cluster num']):
    label = folium.Popup('Neighborhood: ' + str(poi) + '. Venues: ' + cluster, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[k-1],
        fill=True,
        fill_color=rainbow[k-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters