# Applied Data Science Capstone Project - Week 3, part 3
Explore and cluster the dataframe in Toronto.

Just make sure:
1. To add enough Markdown cells to explain what you decided to do and to report any observations you make.
2. To generate maps to visualize your dataframe and how they cluster together.

In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Loading data
fname = 'https://raw.githubusercontent.com/RiccardoGiro/IBM_ADSC_Project/master/Data/Data_final.csv'
df = pd.read_csv(fname, index_col=0)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [3]:
print(f"The loaded dataframe consists in {df['Borough'].unique().shape[0]} different boroughs and a total of"
      f" {df.shape[0]} neighborhoods.")

The loaded dataframe consists in 10 different boroughs and a total of 103 neighborhoods.


#### Generation of a Toronto map, in order to visualize the neighborhoods

In [4]:
# Extraction of Toronto's coordinates
pos = 'Toronto, Canada'
geo = Nominatim(user_agent="can_explorer")
loc = geo.geocode(pos)
lat = loc.latitude
lon = loc.longitude
print(f"The geographical coordinates of Toronto are:\nLatitude:   {np.around(lat, 3)}° N;\nLongitude: "
      f"{np.around(lon, 3)}° O.")
# Map generation
map_T = folium.Map(location=[lat, lon], zoom_start=10)
# Adding markers corresponding to the neighborhoods in the map
for lat, lon, bor, nei in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(nei, bor)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lon], radius=3, popup=label, color='green', fill=True, fill_color='#3186cc',
        fill_opacity=0.7, parse_html=False).add_to(map_T)  
# Displaying the map
map_T

The geographical coordinates of Toronto are:
Latitude:   43.653° N;
Longitude: -79.384° O.


### Exploration of Toronto neighborhoods
#### Foursquare API access and data extraction

In [5]:
limit = 100
ID = '1B2QEZLDCQCQUAXR325SRGV0T1YT4FPSQLIJLYBBGTBEHIKE'
secret = 'CUVST2LHRHS1LHKBXH4SPMH1C3H3P5FBVIFLNFSSH4GZFDF2' 
version = '20200401'
print(f"Credentials\nCLIENT ID:     {ID}\nCLIENT SECRET: {secret}")

Credentials
CLIENT ID:     1B2QEZLDCQCQUAXR325SRGV0T1YT4FPSQLIJLYBBGTBEHIKE
CLIENT SECRET: CUVST2LHRHS1LHKBXH4SPMH1C3H3P5FBVIFLNFSSH4GZFDF2


In [6]:
# This function retrieves the nearby venues for a given neighborhood in Toronto
def getNearbyVenues(names, LAT, LON, radius=500):
    venues_list=[]
    for i, j, k in zip(names, LAT, LON):
        #  API request URL
        url = f"https://api.foursquare.com/v2/venues/explore?&client_id={ID}&client_secret={secret}&v={version}" \
              f"&ll={j},{k}&radius={radius}&limit={limit}"
        # GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # Return only relevant information for each nearby venue
        venues_list.append([(i, j, k, v['venue']['name'], v['venue']['location']['lat'], 
            v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
                             'Venue Latitude', 'Venue Longitude', 'Venue Category']
    return nearby_venues
# Extraction of Toronto venues for each neighborhood in the database
venues = getNearbyVenues(names=df['Neighborhood'], LAT=df['Latitude'], LON=df['Longitude'])
print("All venues successfully extracted.")

All venues successfully extracted.


#### General information about the extracted dataframe and number of different venue categories

In [7]:
print(f"Shape of the dataframe: {venues.shape}")
print(f"There are {len(venues['Venue Category'].unique())} different types of venues.")
venues.groupby('Neighborhood').count().head()

Shape of the dataframe: (2206, 7)
There are 267 different types of venues.


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Wilson Heights, Downsview North",20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23


#### The presence/absence of the individual venues in each neighborhood needs to be encoded for later use

In [8]:
df2 = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
df2['Neighborhood'] = venues['Neighborhood'] 
pos = [df2.columns[-1]] + list(df2.columns[:-1])
df2 = df2[pos]
# Rows are then grouped by neighborhood and the values corresponding to each category are obtained by computing the
# mean of the relative frequency of occurrence of each category
df2 = df2.groupby('Neighborhood').mean().reset_index()
print(f"Original dataframe shape: {venues.shape}.")
print(f"Transformed dataframe shape: {df2.shape}.")

Original dataframe shape: (2206, 7).
Transformed dataframe shape: (93, 267).


#### Sorting venues in descending order

In [9]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]


N = 10
ordinality = ['st', 'nd', 'rd']
for _ in range(N-len(ordinality)):
    ordinality.append('th')

# Add N columns
cols = ['Neighborhood']
for _ in np.arange(N):
    cols.append(f"{_+1}{ordinality[_]} Most Common Venue")
# Sorted dataframe
df3 = pd.DataFrame(columns=cols)
df3['Neighborhood'] = df2['Neighborhood']
for _ in np.arange(df2.shape[0]):
    df3.iloc[_, 1:] = return_most_common_venues(df2.iloc[_, :], N)
df3.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Breakfast Spot,Clothing Store,Lounge,Eastern European Restaurant,Electronics Store,Dumpling Restaurant,Empanada Restaurant,Dessert Shop,Drugstore
1,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pharmacy,Sandwich Place,Skating Rink,Athletics & Sports,Pub,Gym,Comic Shop,Deli / Bodega
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Convenience Store,Ice Cream Shop,Supermarket,Deli / Bodega,Sushi Restaurant,Restaurant,Middle Eastern Restaurant,Diner
3,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Dog Run,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Coffee Shop,Italian Restaurant,Restaurant,Grocery Store,Thai Restaurant,Pub,Café,Sushi Restaurant,Indian Restaurant


#### K-Means clustering of the neighborhoods

In [10]:
# Number of clusters
K = 5
df_clust = df2.drop('Neighborhood', 1)
# Clustering using K-Means
clust_result = KMeans(n_clusters=K, random_state=0).fit(df_clust)
# Adding the labels obtained from clustering
try:
    df3.insert(0, 'Cluster Labels', clust_result.labels_)
except ValueError:
    pass
temp = df
# Adding latitude/longitude of each neighborhood
temp = temp.join(df3.set_index('Neighborhood'), on='Neighborhood')
temp.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,3.0,Fast Food Restaurant,Women's Store,Department Store,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,2.0,Bar,Women's Store,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Donut Shop,Falafel Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.0,Mexican Restaurant,Rental Car Location,Breakfast Spot,Intersection,Bank,Medical Center,Electronics Store,Cosmetics Shop,Costume Shop,Eastern European Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1.0,Coffee Shop,Korean Restaurant,Insurance Office,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1.0,Gas Station,Thai Restaurant,Fried Chicken Joint,Bank,Athletics & Sports,Caribbean Restaurant,Bakery,Hakka Restaurant,Drugstore,Donut Shop


##### Visualization of the results

In [11]:
# Map
map_clust = folium.Map(location=[loc.latitude, loc.longitude], zoom_start=10)
# Color scheme for the clusters
x = np.arange(K)
ys = [i + x + (i*x)**2 for i in range(K)]
colors_array = cm.jet(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Adding markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(temp['Latitude'], temp['Longitude'], temp['Neighborhood'], temp['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon], radius=5, popup=label, fill=True, fill_opacity=0.7).add_to(map_clust)
map_clust