# Web scraping Toronto neighborhood data

### Importing required libraries

In [21]:
import requests
import pandas as pd
import csv

### Installing BeautifulSoup if not already installed

In [2]:
#!conda install beautifulsoup4
from bs4 import BeautifulSoup

### Web scraping to obtain required neighbourhood data from wikipedia

In [36]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text 
soup = BeautifulSoup(source, 'lxml')
file = open('postal_codes_toronto.csv', 'w')
csv_wr = csv.writer(file)
csv_wr.writerow(['PostalCode', 'Borough', 'Neighbourhood'])

table = soup.find('table', class_ = 'wikitable')
table_rows = table.find_all('tr')
postal_codes = []
boroughs = [] 
neighbourhoods = [] 

for row in table_rows:    
    columns = row.find_all('td')
    try :
        if columns[1].text != 'Not assigned':  
            
            postalcode = columns[0].text.split('\n')[0]
            postal_codes.append(postalcode)
            borough = columns[1].text.split('\n')[0]
            boroughs.append(borough)
            neighbourhood = columns[2].text.split('\n')[0]    
            neighbourhoods.append(neighbourhood)
            
    except Exception as e : 
        pass    
    
    
postal_present = []
for index_i, postal_i in enumerate(postal_codes) :   
    if postal_i not in postal_present :
        nbds = neighbourhoods[index_i]
        for index_f, postal_f in enumerate(postal_codes) :
            if postal_i == postal_f and index_i != index_f:
                nbds = nbds + ', ' + neighbourhoods[index_f] # Concatenating the neighbourhood names
        csv_wr.writerow([postal_i, boroughs[index_i], nbds]) # Writing the rows in the csv file
        postal_present.append(postal_i)
file.close()

### Creating a dataframe

In [96]:
df = pd.read_csv('postal_codes_toronto.csv')

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
PostalCode       180 non-null object
Borough          180 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [98]:
df.shape

(180, 3)

In [99]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Drpping NaN values

In [100]:
df2 = df.dropna()
df2.reset_index(drop=True, inplace=True)

In [101]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode       103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [102]:
df2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### Replacing / in neighbourhood with ,

In [103]:
df2['Neighbourhood'] = df2.Neighbourhood.str.replace('/',',')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [113]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode       103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [108]:
df2 = df2.astype(str)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode       103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


### storing the final cleaned data into a csv file

In [109]:
df2.to_csv('Toronto_neighbourhoods.csv')

In [115]:
df2.shape

(103, 3)

### Obtaining latitude and longitude information from the csv file

In [25]:
df_loc = pd.read_csv("Geospatial_Coordinates.csv")

In [26]:
df_loc.rename(columns={'Postal Code':'PostalCode'},inplace=True)

In [27]:
df_loc.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [24]:
df_nbd = pd.read_csv('Toronto_neighbourhoods.csv',index_col=[0])
df_nbd.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [21]:
df_loc.rename(columns={'Postal Code':'PostalCode'},inplace=True)

In [28]:
df_final = pd.merge(df_nbd,df_loc,how="inner")

In [29]:
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [30]:
df_final.to_csv('Latitude_longitude_data.csv')

# Neighborhood clustering

## Creating a map of Toronto

### Importing necessary libraries

In [2]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes 
import folium 

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\kt5702\AppData\Local\Continuum\anaconda3

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         184 KB  conda-forge
    certifi-2020.4.5.1         |   py37hc8dfbb8_0         150 KB  conda-forge
    conda-4.8.3                |   py37hc8dfbb8_1         3.1 MB  conda-forge
    folium-0.5.0               |             py_0



  current version: 4.8.2
  latest version: 4.8.3

Please update conda by running

    $ conda update -n base -c defaults conda




In [3]:
!conda install -c conda-forge geopy --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\kt5702\AppData\Local\Continuum\anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          92 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.21.0-py_0



Downloading and Extracting Packages

geopy-1.21.0         | 58 KB     |            |   0% 
geopy-1.21.0         | 58 KB     | ##7        |  27% 
geo

### Co-ordinates of Toronto

In [4]:
from geopy.geocoders import Nominatim
address = 'Toronto'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [6]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
df_nbd = pd.read_csv('Latitude_longitude_data.csv')
# add markers to map
for lat, lng, borough, neighbourhood in zip(df_nbd['Latitude'], df_nbd['Longitude'], df_nbd['Borough'], df_nbd['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Lets explore Central Toronto for the sake of simplicity

In [7]:
central_toronto_data = df_nbd[df_nbd['Borough'] == 'Central Toronto'].reset_index(drop=True)
central_toronto_data.head()

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,61,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,62,M5N,Central Toronto,Roselawn,43.711695,-79.416936
2,67,M4P,Central Toronto,Davisville North,43.712751,-79.390197
3,68,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
4,73,M4R,Central Toronto,North Toronto West,43.715383,-79.405678


### Obtaining geographical co-ordinates of Central Toronto

In [9]:
address = 'Central Toronto'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Central Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Central Toronto are 43.6534817, -79.3839347.


### Creating a map of Central Toronto

In [11]:
map_central_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(central_toronto_data['Latitude'], central_toronto_data['Longitude'], central_toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_central_toronto)  
    
map_central_toronto

### Defining foursquare credentials

In [63]:
CLIENT_ID = 'ID' # your Foursquare ID
CLIENT_SECRET = 'SECRET' # your Foursquare Secret
VERSION = '20200405' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ID
CLIENT_SECRET:SECRET


### Exploring the first neighbourhood

In [13]:
central_toronto_data.loc[0, 'Neighbourhood']

'Lawrence Park'

In [14]:
neighbourhood_latitude = central_toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = central_toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = central_toronto_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Lawrence Park are 43.7280205, -79.3887901.


### Top 100 venues around Lawrence park

In [16]:
LIMIT = 100 
radius = 500 
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=FFUEFXAWWQLZDW03A5BEPUSJAWP3XFEHBQVESTI3CGROWUM4&client_secret=QATOK0O05BB4KZG00TGT3NCAC5B0X4I423S4P1T5WF043ATX&v=20200405&ll=43.7280205,-79.3887901&radius=500&limit=100'

### Repeating for all neighbourhoods

In [18]:
import json
from pandas.io.json import json_normalize

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
central_toronto_venues = getNearbyVenues(names=central_toronto_data['Neighbourhood'],
                                   latitudes=central_toronto_data['Latitude'],
                                   longitudes=central_toronto_data['Longitude']
                                  )

Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
North Toronto West
The Annex , North Midtown , Yorkville
Davisville
Moore Park , Summerhill East
Summerhill West , Rathnelly , South Hill , Forest Hill SE , Deer Park


In [23]:
print(central_toronto_venues.shape)
central_toronto_venues.head()

(111, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Roselawn,43.711695,-79.416936,Rosalind's Garden Oasis,43.712189,-79.411978,Garden
4,Davisville North,43.712751,-79.390197,Summerhill Market North,43.715499,-79.392881,Food & Drink Shop


In [25]:
central_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Davisville,34,34,34,34,34,34
Davisville North,7,7,7,7,7,7
Forest Hill North & West,5,5,5,5,5,5
Lawrence Park,3,3,3,3,3,3
"Moore Park , Summerhill East",2,2,2,2,2,2
North Toronto West,22,22,22,22,22,22
Roselawn,1,1,1,1,1,1
"Summerhill West , Rathnelly , South Hill , Forest Hill SE , Deer Park",15,15,15,15,15,15
"The Annex , North Midtown , Yorkville",22,22,22,22,22,22


In [27]:
print('There are {} unique categories.'.format(len(central_toronto_venues['Venue Category'].unique())))

There are 61 unique categories.


### Analysis of all neighbourhoods

In [29]:
# one hot encoding
c_toronto_onehot = pd.get_dummies(central_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
c_toronto_onehot['Neighborhood'] = central_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [c_toronto_onehot.columns[-1]] + list(c_toronto_onehot.columns[:-1])
c_toronto_onehot = c_toronto_onehot[fixed_columns]

c_toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Dance Studio,Department Store,Dessert Shop,Diner,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,Health & Beauty Service,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Park,Pharmacy,Pizza Place,Pub,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Summer Camp,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Roselawn,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Davisville North,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
c_toronto_grouped = c_toronto_onehot.groupby('Neighborhood').mean().reset_index()
c_toronto_grouped

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bank,Breakfast Spot,Brewery,Burger Joint,Bus Line,Café,Chinese Restaurant,Clothing Store,Coffee Shop,Cosmetics Shop,Dance Studio,Department Store,Dessert Shop,Diner,Discount Store,Donut Shop,Farmers Market,Fast Food Restaurant,Food & Drink Shop,Fried Chicken Joint,Garden,Gas Station,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,Health & Beauty Service,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Jewelry Store,Light Rail Station,Liquor Store,Mexican Restaurant,Middle Eastern Restaurant,Park,Pharmacy,Pizza Place,Pub,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Spa,Sporting Goods Shop,Sports Bar,Summer Camp,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.029412,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.029412,0.0,0.088235,0.029412,0.029412,0.0,0.029412,0.0,0.0,0.0,0.0,0.029412,0.029412,0.029412,0.058824,0.0,0.0,0.0,0.0,0.029412,0.058824,0.0,0.0,0.0,0.0,0.0,0.029412,0.029412,0.058824,0.0,0.029412,0.0,0.088235,0.029412,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.029412,0.029412,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Forest Hill North & West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
4,"Moore Park , Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,North Toronto West,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.181818,0.090909,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.045455,0.045455,0.0,0.045455,0.045455,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455
6,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West , Rathnelly , South Hill , For...",0.066667,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,0.066667,0.133333,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.066667,0.0
8,"The Annex , North Midtown , Yorkville",0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.045455,0.0,0.136364,0.0,0.0,0.090909,0.045455,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.045455,0.0,0.045455,0.045455,0.045455,0.045455,0.045455,0.0,0.0,0.136364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0


### Printing top 5 most common venues

In [32]:
num_top_venues = 5

for hood in c_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = c_toronto_grouped[c_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Davisville----
            venue  freq
0  Sandwich Place  0.09
1    Dessert Shop  0.09
2            Café  0.06
3             Gym  0.06
4     Pizza Place  0.06


----Davisville North----
               venue  freq
0              Hotel  0.14
1                Gym  0.14
2   Department Store  0.14
3  Food & Drink Shop  0.14
4               Park  0.14


----Forest Hill North & West----
              venue  freq
0          Bus Line   0.2
1             Trail   0.2
2     Jewelry Store   0.2
3              Park   0.2
4  Sushi Restaurant   0.2


----Lawrence Park----
                 venue  freq
0          Swim School  0.33
1             Bus Line  0.33
2                 Park  0.33
3  American Restaurant  0.00
4   Salon / Barbershop  0.00


----Moore Park , Summerhill East----
                 venue  freq
0                 Park   0.5
1          Summer Camp   0.5
2  American Restaurant   0.0
3   Salon / Barbershop   0.0
4    Indian Restaurant   0.0


----North Toronto West----
                 

In [33]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [35]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = c_toronto_grouped['Neighborhood']

for ind in np.arange(c_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(c_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Sandwich Place,Dessert Shop,Coffee Shop,Sushi Restaurant,Pizza Place,Gym,Italian Restaurant,Café,Pharmacy,Indian Restaurant
1,Davisville North,Department Store,Gym,Hotel,Sandwich Place,Breakfast Spot,Park,Food & Drink Shop,Donut Shop,Diner,Discount Store
2,Forest Hill North & West,Park,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Yoga Studio,Farmers Market,Diner,Discount Store,Donut Shop
3,Lawrence Park,Swim School,Bus Line,Park,Yoga Studio,Department Store,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop
4,"Moore Park , Summerhill East",Summer Camp,Park,Yoga Studio,Dance Studio,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop,Fast Food Restaurant


### K-Means clustering

In [59]:
from sklearn.cluster import KMeans
kclusters = 5

c_toronto_grouped_clustering = c_toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(c_toronto_grouped_clustering)

kmeans.labels_[0:10] 

array([0, 0, 4, 3, 2, 0, 1, 0, 0])

In [60]:
central_toronto_data.rename(columns={'Neighbourhood':'Neighborhood'},inplace=True)

In [61]:
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

c_toronto_merged = central_toronto_data
c_toronto_merged = c_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

c_toronto_merged.head()

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,61,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Swim School,Bus Line,Park,Yoga Studio,Department Store,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop
1,62,M5N,Central Toronto,Roselawn,43.711695,-79.416936,1,Garden,Yoga Studio,Gym / Fitness Center,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop,Fast Food Restaurant,Farmers Market
2,67,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Department Store,Gym,Hotel,Sandwich Place,Breakfast Spot,Park,Food & Drink Shop,Donut Shop,Diner,Discount Store
3,68,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307,4,Park,Trail,Sushi Restaurant,Bus Line,Jewelry Store,Yoga Studio,Farmers Market,Diner,Discount Store,Donut Shop
4,73,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Clothing Store,Coffee Shop,Yoga Studio,Salon / Barbershop,Bagel Shop,Café,Chinese Restaurant,Dessert Shop,Diner,Fast Food Restaurant


In [62]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(c_toronto_merged['Latitude'], c_toronto_merged['Longitude'], c_toronto_merged['Neighborhood'], c_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [49]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 0, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M4P,-79.390197,0,Department Store,Gym,Hotel,Sandwich Place,Breakfast Spot,Park,Food & Drink Shop,Donut Shop,Diner,Discount Store
4,M4R,-79.405678,0,Clothing Store,Coffee Shop,Yoga Studio,Salon / Barbershop,Bagel Shop,Café,Chinese Restaurant,Dessert Shop,Diner,Fast Food Restaurant
5,M5R,-79.405678,0,Sandwich Place,Café,Coffee Shop,American Restaurant,Donut Shop,History Museum,Indian Restaurant,Liquor Store,Middle Eastern Restaurant,Park
6,M4S,-79.38879,0,Sandwich Place,Dessert Shop,Coffee Shop,Sushi Restaurant,Pizza Place,Gym,Italian Restaurant,Café,Pharmacy,Indian Restaurant
8,M4V,-79.400049,0,Pub,Coffee Shop,Sports Bar,Fried Chicken Joint,Vietnamese Restaurant,Light Rail Station,Liquor Store,Pizza Place,Restaurant,American Restaurant


Cluster zero consists predominantly of restaraunts.

In [50]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 1, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M5N,-79.416936,1,Garden,Yoga Studio,Gym / Fitness Center,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop,Fast Food Restaurant,Farmers Market


Cluster 1 seems to have only one neighbourhood consiting of fitness related establishments.

In [51]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 2, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,M4T,-79.38316,2,Summer Camp,Park,Yoga Studio,Dance Studio,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop,Fast Food Restaurant


Cluster 2 seems to be similar to cluster 1

In [52]:
c_toronto_merged.loc[c_toronto_merged['Cluster Labels'] == 3, c_toronto_merged.columns[[1] + list(range(5, c_toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,-79.38879,3,Swim School,Bus Line,Park,Yoga Studio,Department Store,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop


Cluster 1 , 2 and 3 consist of similar establishments in the top.

# Observations

Cluster 0 can be a very interesting and lucrative spot for realtors who are planning on starting a cafe or a restaraunt.
Clusters 1, 2 ans 3 are spots to initiate fitness related establishments.