# Segmenting and Clustering Neighborhoods in Toronto

### A project assignment for Applied Data Science for IBM/Coursera

#### Done By: Shravan Bharadwaj

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
import pgeocode
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Scraping Webpage by using BeautifulSoup
    
     Scraping the webpage by using BeautifulSoup and storing the tables data into a variable.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'    # Creating URL
toronto_source = requests.get(url).text                                    # URL to Text
soup = BeautifulSoup(toronto_source,'html.parser')                         # soup object
tables=soup.find('table')                                                  # extracting table

### Extracting the Information from the tables Data
    
    The Data of CODE, BOROUGHS and NEIGHBORHOODS are extracted into 3 lists respectively.

In [3]:
li_code,li_borough=[],[]                              # List to store Code, Borough and Neighborhoods
li_neighborhood=[]
for row in tables.find_all('tr'):                     # Looping to find the tag 'tr'
    cols=row.find_all('td')                           # Finding the tag 'td'
    for info in cols:                                 # Looping through the 'td' tag
        info=info.get_text(separator='',strip=True)   # Converting it to a String with get_text Method
        li_code.append(info[0:3])                     # First 3 Characters are CODES.
        try:                                          # Try block to get the index of '(' and ')'
            a=info.index('(')
            b=info.index(')')
        except:                                       # Since some do not have any Information it will be 
            li_borough.append('Not Assigned')         # "Not Assigned"
            li_neighborhood.append('Not Assigned')  
        else:                                         # Some Info which has will be sliced and appended to
            li_borough.append(info[3:a])              # the respective lists.
            li_neighborhood.append(info[a+1:b])
 

### Obtaining a Pandas Data Frame:
    
    1. The Postal Codes which have multiple Neighborhoods which are seprated by '/' are separated by ', '.
    2. A Dictionary is created with the help of lists.
    3. The Dictionary is converted to a Pandas DATAFRAME.

In [4]:
# The Postal Codes which have multiple Neighborhoods which are seprated by '/' are separated by ', '
li_neigh=[]
for i,j in enumerate(li_neighborhood):
    if '/' in j:
        j=j.replace('/',', ')
        li_neigh.append(j)
    else:
        li_neigh.append(j)

# Creating the Pandas Dataframe by creating a dictionary of columns required:
    
di={'Postal Code':li_code,'Borough':li_borough,'Neighborhood':li_neigh}                 # Dictionary
toronto_df =pd.DataFrame(di)                                                            # Data Frame
toronto_df.head(15) 

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not Assigned,Not Assigned
1,M2A,Not Assigned,Not Assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
7,M8A,Not Assigned,Not Assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


In [5]:
# Description of the Data Frame:
print('The Shape of DF: ',toronto_df.shape)
print('******************************************************************************************************')
print('Description: ',toronto_df.describe())
print('******************************************************************************************************')
print('Info: ',toronto_df.info())

The Shape of DF:  (180, 3)
******************************************************************************************************
Description:         Postal Code       Borough  Neighborhood
count          180           180           180
unique         180            16            99
top            M7J  Not Assigned  Not Assigned
freq             1            77            77
******************************************************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal Code   180 non-null    object
 1   Borough       180 non-null    object
 2   Neighborhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB
Info:  None


### Cleaning the data frame:
    
    Deleting rows which have 'Not Assigned' Values.
    

In [6]:
toronto_df1= toronto_df[toronto_df.Borough!="Not Assigned"]
toronto_df1.reset_index(drop=True,inplace=True)
toronto_df1.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


### Data Frame shape:

In [7]:
print('The Shape of New Data Frame is: ',toronto_df1.shape)

The Shape of New Data Frame is:  (103, 3)


### Adding new columns of Latitude and Longitude:

    A list of none values will be added to the Data Frame.

In [8]:
lati_list = [None] * toronto_df1.shape[0]
longi_list = [None] * toronto_df1.shape[0]
toronto_df1['Latitude'] = lati_list
toronto_df1['Longitude'] = longi_list
toronto_df1.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toronto_df1['Latitude'] = lati_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toronto_df1['Longitude'] = longi_list


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Queen's Park,Ontario Provincial Government,,


### Obtaining the Latitudes and Longitudes:
    
    Using Geocoder package we will get the information regarding Latitude and Longitude.
    
    Note: Due to the result of getting 'None' values for certain adresses  in geopy Nominatim Library,
          pgeocode library is used to get latitudes and longitudes using the Postal Codes.
    
    

In [9]:
for i, code in enumerate(toronto_df1['Postal Code']):
    nomi = pgeocode.Nominatim('ca')                           # Canada code is 'ca'
    postal_code = code                                        # Postal Code
    location = nomi.query_postal_code(postal_code)            # getting location details by postal code
    latitude = location.latitude                              # Latitude
    longitude = location.longitude                            # Longitude
    toronto_df1['Latitude'][i] = latitude
    toronto_df1['Longitude'][i] = longitude

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
print('The Data Frame has {} boroughs '.format(len(toronto_df1['Borough'].unique())))

The Data Frame has 15 boroughs 


### Cleaning the Data Frame:
     
    It seems that the pgeocode library could not find the latitude and longitude of row 76. So this row will be
    dropped.

In [11]:
toronto_df1.iloc[76]  # Show the details of row 76

Postal Code                                                  M7R
Borough         MississaugaCanada Post Gateway Processing Centre
Neighborhood                                      Enclave of L4W
Latitude                                                     NaN
Longitude                                                    NaN
Name: 76, dtype: object

In [12]:
toronto_df1.drop(labels=76,axis=0,inplace=True)  # Dropping Row 76 as it has Nan Value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [13]:
toronto_df1.reset_index(drop=True, inplace=True) # Resting Index

### Vizualizing using Folium Map

##### Get the address of Toronto:

In [14]:
address = 'Toronto, Ontario, CA'
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
lati_toronto = location.latitude
longi_toronto = location.longitude
print(f'The Cordinates of Toronto is {lati_toronto} and {longi_toronto}.')

The Cordinates of Toronto is 43.6534817 and -79.3839347.


##### Map with all Neighborhoods:

In [15]:
toronto_map= folium.Map(location=[lati_toronto,longi_toronto],zoom_start=10)

for lat,lng,borough,neighborhood in zip(toronto_df1['Latitude'],toronto_df1['Longitude'],toronto_df1['Borough'],toronto_df1['Neighborhood']):
    
    label = '{}, {}'.format(neighborhood,borough)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker( [lat,lng],radius = 5, popup = label, color ='red', fill =True, fill_color = 'red', fill_capacity=0.7, parse_html=False).add_to(toronto_map)
        

toronto_map

#### Display th DataFrame with coordinates

In [16]:
toronto_df1.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889
5,M9A,Etobicoke,Islington Avenue,43.6662,-79.5282
6,M1B,Scarborough,"Malvern, Rouge",43.8113,-79.193
7,M3B,North York,Don Mills,43.745,-79.359
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7063,-79.3094
9,M5B,Downtown Toronto,"Garden District,Ryerson",43.6572,-79.3783


### Define Foursquare Credintials:
    

In [1]:
CLIENT_ID = ''
CLIENT_SECRET = ''
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


### Lets explore all the Postal Codes addresses:
     
     1. Define a function to extract the info from the API
     2. Call the Function with all the names, latitudes and longitudes.

##### Defining the Function:

In [18]:
def get_nearby_venues(names, latitude, longitude, radius=500):
    LIMIT = 100
    venues_list =[]
    for name, lati, longi in zip(names,latitude,longitude):
        print(name)
        
        # API Request:
        url = '''https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},
                    {}&radius={}&limit={}'''.format(CLIENT_ID, CLIENT_SECRET, VERSION, lati, longi, radius, LIMIT)
        
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues_list.append([(name,lati,longi, v['venue']['name'], v['venue']['location']['lat'],
                            v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])
    
        
    # Update to a Data Frame: 
    near_by_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    near_by_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 
                              'Venue Latitude', 'Venue Longitude', 'Venue Category']
    return(near_by_venues)

##### Calling the function:

In [19]:
toronto_codes_venues = get_nearby_venues(names = toronto_df1['Neighborhood'],
                                         latitude = toronto_df1['Latitude'], longitude = toronto_df1['Longitude'])

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District,Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington,  Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate ,  Bloordale Gardens ,  Old Burnhamthorpe , Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
CentralBay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, DownsviewNorth
Thorncliffe Park
Richmond,  Adelaide , King
Dufferin, Dovercourt Village
Scarborough Village
Fairview , Henry Farm,  Oriole
Northwood Park, York University
The DanforthEast
HarbourfrontEast , Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview,  EastBirchmount Park
Bayview Village
Downsview
The DanforthWest , River

In [20]:
toronto_codes_venues.head(20)   # Display first 20 rows

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.7276,-79.3148,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,Victoria Village,43.7276,-79.3148,Tim Hortons,43.725517,-79.313103,Coffee Shop
5,Victoria Village,43.7276,-79.3148,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
6,Victoria Village,43.7276,-79.3148,Pizza Nova,43.725824,-79.31286,Pizza Place
7,Victoria Village,43.7276,-79.3148,Wigmore Park,43.731023,-79.310771,Park
8,"Regent Park, Harbourfront",43.6555,-79.3626,Tandem Coffee,43.653559,-79.361809,Coffee Shop
9,"Regent Park, Harbourfront",43.6555,-79.3626,Roselle Desserts,43.653447,-79.362017,Bakery


In [21]:
toronto_codes_venues.groupby('Neighborhood').count()         # Count the venues for each neighborhoods

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",6,6,6,6,6,6
"Bathurst Manor, Wilson Heights, DownsviewNorth",6,6,6,6,6,6
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence ManorEast",26,26,26,26,26,26
...,...,...,...,...,...,...
"Willowdale , Newtonbrook",2,2,2,2,2,2
Woburn,2,2,2,2,2,2
Woodbine Heights,5,5,5,5,5,5
"York Mills, Silver Hills",2,2,2,2,2,2


##### Print the unique categories

In [22]:
print('There are {} uniques categories.'.format(len(toronto_codes_venues['Venue Category'].unique())))

There are 263 uniques categories.


### Analyzing each Neighborhood:

##### Onehot Encoding

In [23]:
toronto_onehot = pd.get_dummies(toronto_codes_venues[['Venue Category']], prefix='', prefix_sep='')
toronto_onehot['Neighborhood'] = toronto_codes_venues['Neighborhood']
fixed_columns= [toronto_onehot.columns[-1]] +list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head(20)

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Shape of toronto_onehot
toronto_onehot.shape

(2147, 263)

##### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [25]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, DownsviewNorth",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence ManorEast",0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,"Willowdale , Newtonbrook",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,"York Mills, Silver Hills",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Shape of grouped data:
toronto_grouped.shape

(95, 263)

##### Print each neighborhood along with the top 5 most common venues

In [27]:
num_top_venues = 10
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freqency']
    temp = temp.iloc[1:]
    temp['freqency'] = temp['freqency'].astype(float)
    temp = temp.round({'freqency': 2})
    print(temp.sort_values('freqency', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freqency
0                  Newsagent       0.2
1            Badminton Court       0.2
2               Skating Rink       0.2
3  Latin American Restaurant       0.2
4             Breakfast Spot       0.2
5                  Pet Store       0.0
6      Performing Arts Venue       0.0
7         Mexican Restaurant       0.0
8  Middle Eastern Restaurant       0.0
9         Miscellaneous Shop       0.0


----Alderwood, Long Branch----
                             venue  freqency
0                Convenience Store      0.17
1                              Gym      0.17
2                              Pub      0.17
3                      Coffee Shop      0.17
4                   Sandwich Place      0.17
5                      Pizza Place      0.17
6               Italian Restaurant      0.00
7                Mobile Phone Shop      0.00
8  Molecular Gastronomy Restaurant      0.00
9              Monument / Landmark      0.00


----Bathurst Manor, Wil

9  Salon / Barbershop      0.04


----DavisvilleNorth----
                  venue  freqency
0               Dog Run      0.29
1      Department Store      0.14
2     Food & Drink Shop      0.14
3        Breakfast Spot      0.14
4                  Park      0.14
5  Gym / Fitness Center      0.14
6         Movie Theater      0.00
7           Music Store      0.00
8                Museum      0.00
9         Moving Target      0.00


----Del Ray , Mount Dennis, Keelsdale and Silverthorn----
                  venue  freqency
0  Fast Food Restaurant      0.67
1           Coffee Shop      0.33
2           Yoga Studio      0.00
3   Moroccan Restaurant      0.00
4           Music Venue      0.00
5           Music Store      0.00
6                Museum      0.00
7         Moving Target      0.00
8         Movie Theater      0.00
9   Monument / Landmark      0.00


----Don Mills----
                        venue  freqency
0                         Gym      0.25
1                       River     

9           Molecular Gastronomy Restaurant       0.0


----Humber Summit----
                             venue  freqency
0           Furniture / Home Store      0.67
1       Construction & Landscaping      0.33
2                        Newsagent      0.00
3        Middle Eastern Restaurant      0.00
4               Miscellaneous Shop      0.00
5                Mobile Phone Shop      0.00
6       Modern European Restaurant      0.00
7  Molecular Gastronomy Restaurant      0.00
8              Monument / Landmark      0.00
9              Moroccan Restaurant      0.00


----Humberlea, Emery----
                       venue  freqency
0                       Café      0.14
1             Discount Store      0.14
2  Latin American Restaurant      0.14
3              Grocery Store      0.14
4                Coffee Shop      0.14
5                  Nightclub      0.14
6         African Restaurant      0.14
7        Moroccan Restaurant      0.00
8              Movie Theater      0.00
9         

                             venue  freqency
0                Convenience Store      0.25
1                   Baseball Field      0.25
2                             Park      0.25
3                      Flower Shop      0.25
4               Miscellaneous Shop      0.00
5                Mobile Phone Shop      0.00
6       Modern European Restaurant      0.00
7  Molecular Gastronomy Restaurant      0.00
8              Monument / Landmark      0.00
9                        Newsagent      0.00


----Ontario Provincial Government----
                   venue  freqency
0       Sushi Restaurant      0.07
1               Beer Bar      0.04
2        Bubble Tea Shop      0.04
3          Burrito Place      0.04
4            Coffee Shop      0.04
5      College Cafeteria      0.04
6        College Theater      0.04
7             Restaurant      0.04
8       Ramen Restaurant      0.04
9  Portuguese Restaurant      0.04


----Parkdale, Roncesvalles----
                         venue  freqency
0     


----The DanforthWest , Riverdale----
                venue  freqency
0    Greek Restaurant      0.20
1         Coffee Shop      0.06
2                Café      0.06
3      Ice Cream Shop      0.06
4  Italian Restaurant      0.06
5         Yoga Studio      0.03
6          Restaurant      0.03
7       Grocery Store      0.03
8              Lounge      0.03
9     Bubble Tea Shop      0.03


----The Kingsway,  Montgomery Road , Old MillNorth----
              venue  freqency
0  Business Service      0.12
1              Bank      0.12
2  Sushi Restaurant      0.12
3    Breakfast Spot      0.12
4               Bar      0.06
5      Liquor Store      0.06
6       Coffee Shop      0.06
7            Lounge      0.06
8            Bakery      0.06
9        Restaurant      0.06


----Thorncliffe Park----
                venue  freqency
0   Indian Restaurant      0.12
1   Afghan Restaurant      0.08
2         Coffee Shop      0.08
3  Turkish Restaurant      0.08
4        Intersection      0.04
5   

##### Storing the above data into a Dataframe:
      
      First, let's write a function to sort the venues in descending order.

##### Function to sort

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [29]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']

for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Badminton Court,Skating Rink,Newsagent,Latin American Restaurant,Breakfast Spot
1,"Alderwood, Long Branch",Pizza Place,Pub,Sandwich Place,Coffee Shop,Gym
2,"Bathurst Manor, Wilson Heights, DownsviewNorth",Pizza Place,Deli / Bodega,Mediterranean Restaurant,Middle Eastern Restaurant,Fried Chicken Joint
3,Bayview Village,Flower Shop,Park,Locksmith,Trail,Ethiopian Restaurant
4,"Bedford Park, Lawrence ManorEast",Italian Restaurant,Coffee Shop,Sandwich Place,Restaurant,Fast Food Restaurant


### Clustering Neighborhoods:
        
     Clustering will be done by K-Means Clustering

In [30]:
kcluster = 5
toronto_clustering = toronto_grouped.drop('Neighborhood',1)
toronto_clustering.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# run k-means clustering
kmeans = KMeans(n_clusters =kcluster, random_state =0).fit(toronto_clustering)
kmeans.labels_[0:10]

array([0, 0, 0, 3, 0, 0, 0, 0, 0, 3])

##### Adding cluster to the data frame:

In [32]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df1

toronto_merged =toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.33,4.0,Food & Drink Shop,Park,Women's Store,Ethiopian Restaurant,Donut Shop
1,M4A,North York,Victoria Village,43.7276,-79.3148,3.0,Pizza Place,Park,Coffee Shop,Portuguese Restaurant,Intersection
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,0.0,Coffee Shop,Breakfast Spot,Restaurant,Yoga Studio,Italian Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504,0.0,Clothing Store,Coffee Shop,Women's Store,Restaurant,Furniture / Home Store
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889,0.0,Sushi Restaurant,Creperie,Theater,Martial Arts School,Bubble Tea Shop


##### Some rows have NaN Values, so drop them:

In [33]:
toronto_merged.dropna(inplace=True)
toronto_merged.reset_index(drop=True, inplace=True)

##### Visualizing Clusters:

In [34]:
map_clusters = folium.Map(location= [lati_toronto,longi_toronto], zoom_start=10)
x=np.arange(kcluster)
ys = [i + x + (i*x)**2 for i in range(kcluster)]
colors_array =  cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []

for lat, lng, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Examine Clusters:

##### Cluster 1:

In [35]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,Downtown Toronto,0.0,Coffee Shop,Breakfast Spot,Restaurant,Yoga Studio,Italian Restaurant
3,North York,0.0,Clothing Store,Coffee Shop,Women's Store,Restaurant,Furniture / Home Store
4,Queen's Park,0.0,Sushi Restaurant,Creperie,Theater,Martial Arts School,Bubble Tea Shop
8,East York,0.0,Pizza Place,Gym / Fitness Center,Pet Store,Café,Flea Market
9,Downtown Toronto,0.0,Coffee Shop,Clothing Store,Café,Italian Restaurant,Cosmetics Shop
...,...,...,...,...,...,...,...
94,Downtown Toronto,0.0,Coffee Shop,Hotel,Café,Restaurant,Gym
95,Etobicoke,0.0,Breakfast Spot,Business Service,Sushi Restaurant,Bank,Bakery
96,Downtown Toronto,0.0,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Gay Bar,Restaurant
97,East TorontoBusiness reply mailProcessing Cent...,0.0,Restaurant,Coffee Shop,Yoga Studio,Deli / Bodega,Bank


##### Cluster 2:

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Scarborough,1.0,Home Service,Flower Shop,Fish Market,Fish & Chips Shop,Field
62,Central Toronto,1.0,Home Service,Fast Food Restaurant,Flower Shop,Fish Market,Fish & Chips Shop
68,Central Toronto,1.0,Home Service,Trail,Escape Room,Doner Restaurant,Donut Shop


##### Cluster 3:

In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
45,North York,2.0,Cafeteria,Pool,Women's Store,Ethiopian Restaurant,Donut Shop


##### Cluster 4:

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,3.0,Pizza Place,Park,Coffee Shop,Portuguese Restaurant,Intersection
5,Etobicoke,3.0,Pharmacy,Grocery Store,Park,Skating Rink,Bank
7,North York,3.0,Gym,Home Service,Trail,River,Park
13,North York,3.0,Gym,Home Service,Trail,River,Park
16,York,3.0,Field,Park,Tennis Court,Deli / Bodega,Trail
21,York,3.0,Park,Women's Store,Mexican Restaurant,Beer Store,Bakery
25,Downtown Toronto,3.0,Café,Grocery Store,Athletics & Sports,Park,Playground
31,West Toronto,3.0,Grocery Store,Bakery,Park,Bank,Furniture / Home Store
36,Downtown Toronto,3.0,Music Venue,Café,Park,Harbor / Marina,Escape Room
39,North York,3.0,Flower Shop,Park,Locksmith,Trail,Ethiopian Restaurant


##### Cluster 5:

In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,4.0,Food & Drink Shop,Park,Women's Store,Ethiopian Restaurant,Donut Shop
27,North York,4.0,Park,Residential Building (Apartment / Condo),Women's Store,Ethiopian Restaurant,Donut Shop
32,Scarborough,4.0,Park,Grocery Store,Women's Store,Ethiopian Restaurant,Donut Shop
35,East YorkEast Toronto,4.0,Convenience Store,Park,Dog Run,Fish & Chips Shop,Field
61,Central Toronto,4.0,Photography Studio,Park,Women's Store,Ethiopian Restaurant,Donut Shop
64,York,4.0,Park,Convenience Store,Dog Run,Fish & Chips Shop,Field
66,North York,4.0,Convenience Store,Park,Daycare,Ethiopian Restaurant,Donut Shop
69,West Toronto,4.0,Bowling Alley,Residential Building (Apartment / Condo),Park,Ethiopian Restaurant,Donut Shop
