The objective of this Notebook is to perform the Data Science Capstone Project, week 3 assignment.


# First Part: Dataframe creation

In [1]:
import pandas as pd
import numpy as np


In [2]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 4.1MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [2]:
#first we will read the table from the wikipedia page.

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#pd.read_html returns a list (of dataframes). There is only one and it can be accessed as the first element of the list.
df=pd.read_html(url)
df=df[0]
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


In [3]:
# We will now drop the 'not assigned' values

df=df[df.Borough != 'Not assigned']

df.reset_index(drop=True,inplace=True)

df.isna().sum()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

In [4]:
# We will now replace the 'NaN' values for neighborhood for their corresponding borough


for i, neighborhood in zip(df.index, df.Neighborhood):
    
    if neighborhood == 'NaN':
        df.replace({df['Neighborhood'][i]: df['Borough'][i]}, inplace=True)

In [5]:
# We can verify that no NaN remain

print(sum(df['Neighborhood']=='NaN'))

0


In [6]:
# We will identify postal codes that share a neighborhood, but will clean later

indices = [index for index, element in enumerate(df['Neighborhood'].duplicated()) if element == True]
indices
df.iloc[indices]

Unnamed: 0,Postal code,Borough,Neighborhood
13,M3C,North York,Don Mills
46,M3L,North York,Downsview
53,M3M,North York,Downsview
60,M3N,North York,Downsview
72,M2R,North York,Willowdale


In [7]:
# Lets change the '/' for ',':

for index in range(len(df.Neighborhood)):
    
    df['Neighborhood'][index]=df['Neighborhood'][index].replace('/',',')
    
    
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [8]:
df.shape

(103, 3)

### Important remark!: even though there are no different neighborhoods sharing a postal code (as they are already grouped in the wikipedia table), there are neighborhoods with several postal codes. This will be solved later on as we will group them and calculate the mean coordinates at the same time

In [9]:
#we can see that the number of unique neighborhoods is smaller than the number of postal codes

len(df.Neighborhood.unique())

98

# Part two: Coordinate Retrieval

## Below code should not be run (as it will time out), geocoder does not seem to be working: plan B just after

In [17]:
import geocoder # import geocoder

# The code below timed out so I will go with plan B: retrieving coordinates from the url
latitude=[]
longitude=[]
#Create a bigger loop to move through post codes
j=0
for postal_code in df['Postal code']:
# initialize your variable to None
    lat_lng_coords = None
    i=0
# loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        i=i+1 
        print(i)
        
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])
    j=j+1
    print(j)
    
df['Latitude']=latitude
df['Longitude']=longitude

KeyboardInterrupt: 

In [10]:
url2='http://cocl.us/Geospatial_data'
df2=pd.read_csv(url2)
df2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [11]:
df['Latitude']=df['Postal code']
df['Longitude']=df['Postal code']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
for j,postal_code in zip(df2.index, df2['Postal Code']):
    for index in df.index:
        if df['Postal code'][index]==postal_code:
            df['Latitude'][index]=df2['Latitude'][j]
            df['Longitude'][index]=df2['Longitude'][j]
            
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6543,-79.3606
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.7185,-79.4648
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.6623,-79.3895
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.6537,-79.5069
99,M4Y,Downtown Toronto,Church and Wellesley,43.6659,-79.3832
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.6627,-79.3216
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.6363,-79.4985


### We will now group the postal codes that are shared by multiple neighborhoods and recalculate the coordinates for the neighborhoods as the mean for the different postal codes in them:

In [13]:
df['Latitude']=df['Latitude'].astype('float')
df['Longitude']=df['Longitude'].astype('float')
df_aux=df.drop(['Postal code','Borough'],axis=1)


df_aux=df_aux.groupby('Neighborhood').mean()
df_aux

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Latitude,Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Agincourt,43.794200,-79.262029
"Alderwood , Long Branch",43.602414,-79.543484
"Bathurst Manor , Wilson Heights , Downsview North",43.754328,-79.442259
Bayview Village,43.786947,-79.385975
"Bedford Park , Lawrence Manor East",43.733283,-79.419750
...,...,...
"Willowdale , Newtonbrook",43.789053,-79.408493
Woburn,43.770992,-79.216917
Woodbine Heights,43.695344,-79.318389
"York Mills , Silver Hills",43.757490,-79.374714


In [14]:
df_aux['Postal code']='NaN'
df_aux['Borough']='NaN'
df_aux.reset_index()
for postal_code, neighborhood, borough in zip(df['Postal code'], df['Neighborhood'], df['Borough']):
    if df_aux.loc[neighborhood,'Postal code']=='NaN':
        df_aux.loc[neighborhood,'Postal code']=postal_code
        df_aux.loc[neighborhood,'Borough']=borough
    else:
        df_aux.loc[neighborhood,'Postal code']=df_aux.loc[neighborhood,'Postal code']+', ' + postal_code

In [15]:
df=df_aux

In [16]:
df.reset_index(inplace=True)

# Part Three: Clustering and Reporting

We will now cluster the neighborhoods using K-means using data from the Foursquare API

In [17]:
CLIENT_ID = '4U0ND42NTP142QAL52MLARWTMECJTHOQIWD5ZQFZKV1BUJCZ' # your Foursquare ID
CLIENT_SECRET = '1RVKEFT5TDWCQG320UAQ45I5BV1YG53HU0DWWM3NFSONKR5H' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [18]:
# new dataframe to contain feature informations
df3=df[['Neighborhood']]

df3

Unnamed: 0,Neighborhood
0,Agincourt
1,"Alderwood , Long Branch"
2,"Bathurst Manor , Wilson Heights , Downsview North"
3,Bayview Village
4,"Bedford Park , Lawrence Manor East"
...,...
93,"Willowdale , Newtonbrook"
94,Woburn
95,Woodbine Heights
96,"York Mills , Silver Hills"


## Let us retrieve the venues for all the neighborhoods:

In [19]:
import requests

# a loop that runs through the neighborhoods
LIMIT=100
radius=500

# Note: with the objective of clustering the neighborhoods, I will only be looking at the percentage of venues in each category and therefore, I will only retrieve this information)
venues_info=[]

for neighborhood_name, neighborhood_latitude, neighborhood_longitude in zip(df.Neighborhood, df.Latitude, df.Longitude):
    
    #setup the API request URL

    search_query=''
    url= 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, search_query, radius, LIMIT)

    results = requests.get(url).json()["response"]['venues']
    
    for v in results:
        try:
            venues_info.append([v['categories'][0]['name'], neighborhood_name])
        except:
            pass



In [20]:
df_venues=pd.DataFrame(venues_info)

In [21]:
df_venues.rename({0: 'Category', 1:'Neighborhood'}, axis=1, inplace=True)

## We 'One-hot code' all categories and retrieve the most common venues per neighborhood (in relative terms, that's why we use means):

In [22]:
dfx=pd.get_dummies(df_venues['Category'])


In [23]:
df_features=df_venues.merge(dfx, left_index=True, right_index=True)
df_features.drop('Category',inplace=True, axis=1)

In [24]:
df_weightedNhoods=df_features.groupby('Neighborhood').mean()

In [25]:
df_weightedNhoods

Unnamed: 0_level_0,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,...,Waste Facility,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agincourt,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
"Alderwood , Long Branch",0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
"Bathurst Manor , Wilson Heights , Downsview North",0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.011236,0.0,0.0
Bayview Village,0.0,0.0,0.011364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
"Bedford Park , Lawrence Manor East",0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.011494,0.0,0.011494,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Willowdale , Newtonbrook",0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.011111,0.011111,0.000000,0.0,0.000000,0.0,0.0
Woburn,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
Woodbine Heights,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0
"York Mills , Silver Hills",0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0


## Now we will use the K-Means method to cluster the neighborhoods


In [26]:
from sklearn.cluster import KMeans

We will cluster the neighborhoods using K-means

In [27]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_weightedNhoods)
len(kmeans.labels_)

98

In [28]:
df.insert(0, 'Cluster Labels', kmeans.labels_)

In [29]:
df

Unnamed: 0,Cluster Labels,Neighborhood,Latitude,Longitude,Postal code,Borough
0,4,Agincourt,43.794200,-79.262029,M1S,Scarborough
1,4,"Alderwood , Long Branch",43.602414,-79.543484,M8W,Etobicoke
2,3,"Bathurst Manor , Wilson Heights , Downsview North",43.754328,-79.442259,M3H,North York
3,3,Bayview Village,43.786947,-79.385975,M2K,North York
4,0,"Bedford Park , Lawrence Manor East",43.733283,-79.419750,M5M,North York
...,...,...,...,...,...,...
93,0,"Willowdale , Newtonbrook",43.789053,-79.408493,M2M,North York
94,0,Woburn,43.770992,-79.216917,M1G,Scarborough
95,0,Woodbine Heights,43.695344,-79.318389,M4C,East York
96,0,"York Mills , Silver Hills",43.757490,-79.374714,M2L,North York


## We will now create a map

In [30]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
latitude=df['Latitude'].mean()
longitude=df['Longitude'].mean()
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'],df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Interpreting the clusters

In [39]:
# We will create a dataframe to group the neighborhoods according to their cluster label. Then we will find which types of features are more characteristic for each cluster:

df_clusters=df_weightedNhoods.reset_index()

In [63]:

#df_clusters.drop('Neighborhood', axis=1, inplace=True)
df_clusters=df_clusters.groupby('Cluster Labels').mean()
df_clusters

Unnamed: 0_level_0,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,...,Waste Facility,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.000358,0.002035,0.0,0.0,0.000359,0.000184,0.00116,0.0,0.0,0.0,...,0.000193,0.000216,0.001118,0.000997,0.001532,0.001158,0.003912,0.00251,0.000182,0.003021
1,0.0,0.005376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.005376,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,0.018519,0.222222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000745,0.0,0.001719,0.000654,0.000774,0.001176,0.0,0.0,0.0,0.0,...,0.0,0.001225,0.0,0.000692,0.000668,0.001176,0.000692,0.001329,0.0,0.0
4,0.001266,0.00253,0.0,0.0,0.003198,0.000619,0.001892,0.000619,0.0,0.0,...,0.0,0.000633,0.0,0.000654,0.0,0.0,0.003184,0.002129,0.0,0.000654


In [69]:
# Lets find the main types of venue for each cluster

# Let's create a dataframe to store the results


clusters_main_features=[]
for i in df_clusters.index:

    cluster_sorted=df_clusters.iloc[i].sort_values(ascending=False)
    clusters_main_features.append(cluster_sorted.index[0:5])
    
clusters_main_features=pd.DataFrame(clusters_main_features)
clusters_main_features.rename(columns={0:'First most common venue', 1:'Second most common venue', 2:'Third most common venue', 3:'Fourth most common venue', 4:'Fifht most common venue'},inplace=True)

Based on the most common venues for each cluster, we can easily find names for them

In [71]:
# In my particular case (if we rerun the kernel it might not hold), this is a proposed classification for the clusters:

clusters_main_features.rename(index={0:'Office district', 1:'Commercial area', 2:'Airport area', 3:'Residential area', 4:'Mixed office and residential area'},inplace=True)
clusters_main_features.reset_index(inplace=True)

In [74]:
clusters_main_features.rename(columns={'index':'Area Type'}, inplace=True)
clusters_main_features

Unnamed: 0,Area Type,First most common venue,Second most common venue,Third most common venue,Fourth most common venue,Fifht most common venue
0,Office district,Office,Salon / Barbershop,Park,Church,Building
1,Commercial area,Automotive Shop,Building,Furniture / Home Store,Gas Station,Office
2,Airport area,Airport Gate,Airport Service,Moving Target,Airport Terminal,Boat or Ferry
3,Residential area,Residential Building (Apartment / Condo),Office,Building,Park,Doctor's Office
4,Mixed office and residential area,Office,Building,Automotive Shop,Residential Building (Apartment / Condo),Coffee Shop


In [84]:
df['Area Type']=''
for i in df.index:
   
    df['Area Type'][i]=clusters_main_features.loc[df['Cluster Labels'][i],'Area Type']

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Cluster Labels,Neighborhood,Latitude,Longitude,Postal code,Borough,Area Type
0,4,Agincourt,43.794200,-79.262029,M1S,Scarborough,Mixed office and residential area
1,4,"Alderwood , Long Branch",43.602414,-79.543484,M8W,Etobicoke,Mixed office and residential area
2,3,"Bathurst Manor , Wilson Heights , Downsview North",43.754328,-79.442259,M3H,North York,Residential area
3,3,Bayview Village,43.786947,-79.385975,M2K,North York,Residential area
4,0,"Bedford Park , Lawrence Manor East",43.733283,-79.419750,M5M,North York,Office district
...,...,...,...,...,...,...,...
93,0,"Willowdale , Newtonbrook",43.789053,-79.408493,M2M,North York,Office district
94,0,Woburn,43.770992,-79.216917,M1G,Scarborough,Office district
95,0,Woodbine Heights,43.695344,-79.318389,M4C,East York,Office district
96,0,"York Mills , Silver Hills",43.757490,-79.374714,M2L,North York,Office district


### Final map with legend

In [96]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
latitude=df['Latitude'].mean()
longitude=df['Longitude'].mean()
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
feature_group=[]
for i in range(kclusters):
    feature_group.append( folium.FeatureGroup(name=clusters_main_features.loc[i,'Area Type']))

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster, cluster_name in zip(df['Latitude'], df['Longitude'], df['Neighborhood'],df['Cluster Labels'], df['Area Type']):
    label = folium.Popup(str(cluster_name) + ': ' + str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(feature_group[cluster]),

for i in range(kclusters):
    map_clusters.add_child(feature_group[i])
    
map_clusters.add_child(folium.map.LayerControl())
map_clusters

Please note on the map above the layer control, which allows to select and de-select certain clusters