# ACNiT: Analysing and Clustering Neighbourhoods in Toronto
### Part 1: Scraping and cleaning data

In [108]:
import urllib.request
import pandas as pd
import numpy as np

In [109]:
pip install BeautifulSoup4


Note: you may need to restart the kernel to use updated packages.


In [110]:
from bs4 import BeautifulSoup

## 1) Scraping the data from  wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [111]:
#specify the url
wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [112]:
# Query the website and store the raw HTML in the variable 'page'.
# The context manager closes the HTTP connection as soon as the body is read.
with urllib.request.urlopen(wiki) as response:
    page = response.read()

In [113]:
# Parse the html in the 'page' variable, and store it in Beautiful Soup format.
# The parser is named explicitly so the parse tree does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(page, 'html.parser')

In [114]:
# test the requested data
#print(soup.prettify())
#right_table=soup.find('table', class_= "wikitable sortable")
#right_table

### 1-1) Extracting the right information from soup and form it as an array

In [115]:
# Extract the postal-code table from soup.
# Produces `columns` (the header names) and `data` (one row per table line).

# find the right table in the webpage
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the page")

# names of the table's columns, taken from the header cells
columns = [th.get_text(strip=True) for th in right_table.find_all('th')]

# One tuple per row that has exactly 3 data cells. Header rows (no <td>) and
# mis-formatted rows are skipped by the length check, so no blanket
# exception handler is needed here.
data = []
for row in right_table.find_all('tr'):
    cols = row.find_all('td')
    if len(cols) == 3:
        data.append(tuple(col.text.strip() for col in cols))

# convert output to an array, check length
data = np.asarray(data)
print(len(data))

288


### 1-2) Creating the dataframe from the array and renaming the columns and the index:

In [116]:
# build the dataframe with the scraped header names as column labels
df_data = pd.DataFrame(data, columns=columns)

# give the row index the name 'index'
df_data.index.name = 'index'

# preview the first rows
df_data.head(10)


Unnamed: 0_level_0,Postcode,Borough,Neighbourhood
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### 1-3) Cleaning the data in the dataframe

In [117]:
# Keep only the rows whose Borough is assigned and renumber them from 0
has_borough = df_data['Borough'] != 'Not assigned'
df_data = df_data[has_borough].reset_index(drop=True)
df_data.head(10)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [118]:
# Collapse all rows sharing a postcode into a single row, concatenating
# their Borough and Neighbourhood values with " , " as separator

def combine_data_neigh(neigh):
    """Join the neighbourhood names of one postcode into a single string."""
    return " , ".join(neigh)


def combine_data_Boro(boro):
    """Join the borough names of one postcode into a single string."""
    return " , ".join(boro)


df_combinedData = (
    df_data
    .groupby('Postcode')
    .agg({'Borough': combine_data_Boro, 'Neighbourhood': combine_data_neigh})
    .reset_index()
)
df_combinedData.head(20)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,"Scarborough , Scarborough","Rouge , Malvern"
1,M1C,"Scarborough , Scarborough , Scarborough","Highland Creek , Rouge Hill , Port Union"
2,M1E,"Scarborough , Scarborough , Scarborough","Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,"Scarborough , Scarborough , Scarborough","East Birchmount Park , Ionview , Kennedy Park"
7,M1L,"Scarborough , Scarborough , Scarborough","Clairlea , Golden Mile , Oakridge"
8,M1M,"Scarborough , Scarborough , Scarborough","Cliffcrest , Cliffside , Scarborough Village West"
9,M1N,"Scarborough , Scarborough","Birch Cliff , Cliffside West"


In [119]:
# Clean the Borough column.
# After the aggregation a postcode with several neighbourhoods holds its
# borough repeated (e.g. "Scarborough , Scarborough"); keep the first entry.
# The strip removes the space that sits in front of the " , " separator,
# which would otherwise make "Scarborough " differ from "Scarborough".
df_combinedData['Borough'] = (
    df_combinedData['Borough'].str.split(',').str[0].str.strip()
)
df_combinedData.tail(30)

Unnamed: 0,Postcode,Borough,Neighbourhood
73,M6C,York,Humewood-Cedarvale
74,M6E,York,Caledonia-Fairbanks
75,M6G,Downtown Toronto,Christie
76,M6H,West Toronto,"Dovercourt Village , Dufferin"
77,M6J,West Toronto,"Little Portugal , Trinity"
78,M6K,West Toronto,"Brockton , Exhibition Place , Parkdale Village"
79,M6L,North York,"Downsview , North Park , Upwood Park"
80,M6M,York,"Del Ray , Keelesdale , Mount Dennis , Silverthorn"
81,M6N,York,"The Junction North , Runnymede"
82,M6P,West Toronto,"High Park , The Junction South"


In [120]:
# A postcode that has a borough but a 'Not assigned' neighbourhood
# takes the borough name as its neighbourhood.
not_assigned = df_combinedData['Neighbourhood'] == 'Not assigned'
df_combinedData.loc[not_assigned, 'Neighbourhood'] = df_combinedData.loc[not_assigned, 'Borough']
df_combinedData.tail(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
83,M6R,West Toronto,"Parkdale , Roncesvalles"
84,M6S,West Toronto,"Runnymede , Swansea"
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores , Mimico South , New Toronto"
89,M8W,Etobicoke,"Alderwood , Long Branch"
90,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
91,M8Y,Etobicoke,"Humber Bay , King's Mill Park , Kingsway Park ..."
92,M8Z,Etobicoke,"Kingsway Park South West , Mimico NW , The Que..."


In [121]:
df_combinedData.shape

(103, 3)

# Part 2) Add coordinates to the dataframe

### 2-1) Retrieve the geographical coordinates of each postal code from the provided link

In [122]:

# Load the table of postal codes with their latitude and longitude
# from the provided CSV link.
filename = "http://cocl.us/Geospatial_data"
geographical_cordination = pd.read_csv(filename)
geographical_cordination.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 2-2) Add coordinates to the dataframe

In [123]:
# Add coordinates to the dataframe, matched on the postal code.
# Looking each postcode up (instead of copying the columns row-by-row
# position) keeps every coordinate on the right row even if the two tables
# differ in order or length. Postal codes must be unique in the lookup table.
coords_by_postcode = geographical_cordination.set_index('Postal Code')
df_combinedData['Latitude'] = df_combinedData['Postcode'].map(coords_by_postcode['Latitude'])
df_combinedData['Longitude'] = df_combinedData['Postcode'].map(coords_by_postcode['Longitude'])
df_combinedData.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [124]:
df_combinedData.shape

(103, 5)

# Part 3: Explore and cluster the neighborhoods in Toronto

### 3-1) Create a map of Toronto with neighborhoods superimposed on top

In [125]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Libraries imported.


In [126]:
def get_location(address):
    """Geocode ``address`` with Nominatim and return ``(latitude, longitude)``.

    Both values are NaN when the geocoder returns no match for the address.
    """
    match = Nominatim(user_agent="t_explorer").geocode(address)
    if not match:
        return float('nan'), float('nan')
    return match.latitude, match.longitude

In [127]:
# create map of Toronto centred on the geocoded location of the city
address = 'Toronto'
center_lat, center_lon = get_location(address)
map_toronto = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# add one marker per postcode; the popup shows "Neighbourhood,Borough"
# (the centre uses its own names so the loop variables do not overwrite it)
for lat, lon, Borough, Neighbourhood in zip(df_combinedData['Latitude'],
                                            df_combinedData['Longitude'],
                                            df_combinedData['Borough'],
                                            df_combinedData['Neighbourhood']):
    label = folium.Popup('{},{}'.format(Neighbourhood, Borough), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)
map_toronto

### 3-2) Slice the original dataframe and create a new dataframe of the Old Toronto (which includes Downtown Core, Central Toronto, East Toronto and West Toronto)

In [128]:
# keep the rows whose Borough name contains 'Toronto', renumbered from 0
is_old_toronto = df_combinedData['Borough'].str.contains('Toronto')
old_toronto_data = df_combinedData[is_old_toronto].reset_index(drop=True)

In [129]:
old_toronto_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West , India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park , Forest Hill SE , Rathnelly , South...",43.686412,-79.400049


#### * It seems there are spaces in the values of the Borough column that prevent the 'groupby' function from working properly on this column. So, we need to remove those spaces.

In [165]:
# Remove leading/trailing spaces in the Borough column.
# The stripped column is assigned back directly, so only 'Borough' is
# touched and the cell gives the same result every time it is run.
newCol = old_toronto_data['Borough'].str.strip(' ')
old_toronto_data = old_toronto_data.assign(Borough=newCol)



In [167]:
#old_toronto_data['Borough']

In [168]:
old_toronto_data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West , India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park , Forest Hill SE , Rathnelly , South...",43.686412,-79.400049


### 3-3) Create a map of "Old Toronto" using latitude and longitude values. Add markers to the map using the segmented data from old_toronto_data

In [169]:
# create map of Old Toronto centred on the geocoded location of 'Toronto'
address = 'Toronto'
map_old_toronto = folium.Map(location=get_location(address), zoom_start=11)

# add one marker per row of old_toronto_data; the popup shows the neighbourhood name
for lat, lon, neighbourhood in zip(old_toronto_data['Latitude'],
                                   old_toronto_data['Longitude'],
                                   old_toronto_data['Neighbourhood']):
    label = folium.Popup(neighbourhood, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_old_toronto)

map_old_toronto

### 3-4) Utilise the Foursquare API to explore the Neighbourhoods and segment them

In [170]:
# Define Foursquare credentials and API version.
# The credentials are read from the environment so that secrets are never
# stored in the notebook: set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
import os

try:
    CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # Foursquare ID
    CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # Foursquare Secret
except KeyError as missing:
    raise RuntimeError(
        'Missing environment variable {}: export your Foursquare credentials '
        'before running this notebook.'.format(missing)
    )
VERSION = '20180605'  # Foursquare API version

#### 3-4-1) Explore the first Neighbourhood in the segmented dataframe

In [171]:
# read the name and the coordinates of the row labelled 0 (the first neighbourhood)
first_row = 0
neighbourhood_name = old_toronto_data.at[first_row, 'Neighbourhood']
neighbourhood_latitude = old_toronto_data.at[first_row, 'Latitude']
neighbourhood_longitude = old_toronto_data.at[first_row, 'Longitude']

In [172]:
# Send the GET request for the venues around the first neighbourhood.
# The query is passed as `params` so that requests builds and encodes the URL.
url = 'https://api.foursquare.com/v2/venues/explore'
params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(neighbourhood_latitude, neighbourhood_longitude),
    'radius': 500,
    'limit': 100,
}
response = requests.get(url, params=params, timeout=10)
response.raise_for_status()  # fail on an HTTP error instead of parsing an error body
results = response.json()

In [33]:
#results

In [173]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue row, or None.

    The category list is read from ``row['categories']``; when that key is
    absent it is read from ``row['venue.categories']`` instead. Each entry of
    the list is expected to be a mapping with a 'name' key.

    Returns None when the list is empty or missing (None).
    Raises KeyError when neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; other errors propagate
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

In [174]:
# venue records of the first group in the response
venues = results['response']['groups'][0]['items']

# flatten the nested JSON and keep only name, categories and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# replace each category list by the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of every column name
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Glen Stewart Ravine,Other Great Outdoors,43.6763,-79.294784
4,Upper Beaches,Neighborhood,43.680563,-79.292869


In [175]:
print('{} venues were returned by Foursquare for first neighbourhood ({}).'.format(nearby_venues.shape[0], neighbourhood_name))

5 venues were returned by Foursquare for first neighbourhood (The Beaches).


#### 3-4-2) Explore all Neighborhoods in Toronto Boroughs

In [176]:
# create a function to repeat the same process to all the neighborhoods in Old Toronto

def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=30):
    """Collect the Foursquare venues found around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of every location to explore; they are
        consumed in parallel with ``zip``.
    radius : int, default 500
        Value sent to the API as the ``radius`` query parameter.
    LIMIT : int, default 30
        Value sent to the API as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Borough',
        'Borough Latitude', 'Borough Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but has
        these columns) when no venue is returned; 'Venue Category' is None
        for a venue that carries no category.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.
    """
    column_names = ['Borough',
                    'Borough Latitude',
                    'Borough Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # one API request per location
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=10)
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        # keep only the relevant information of each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories must not abort the whole download
                categories[0]['name'] if categories else None,
            ))

    # passing the column names here keeps the result valid even with zero rows
    return pd.DataFrame(rows, columns=column_names)


In [177]:
# collect the venues around every row of old_toronto_data, labelled with the
# row's borough, into a new dataframe called old_toronto_venues
old_toronto_venues = getNearbyVenues(
    old_toronto_data['Borough'],
    old_toronto_data['Latitude'],
    old_toronto_data['Longitude'],
)

#old_toronto_venues

In [178]:
print(old_toronto_venues.shape)
old_toronto_venues.head(20)

(839, 7)


Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,East Toronto,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,East Toronto,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,East Toronto,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,East Toronto,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,East Toronto,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
5,East Toronto,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
6,East Toronto,43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
7,East Toronto,43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop
8,East Toronto,43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
9,East Toronto,43.679557,-79.352188,La Diperie,43.67753,-79.352295,Ice Cream Shop


##### 3-4-2-1) Number of venues per Borough in Old Toronto

In [179]:
old_toronto_venues.groupby('Borough').count()

Unnamed: 0_level_0,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,110,110,110,110,110,110
Downtown Toronto,486,486,486,486,486,486
East Toronto,102,102,102,102,102,102
West Toronto,141,141,141,141,141,141


##### 3-4-2-2) Number of unique categories that can be curated from all the returned venues

In [180]:
# count the distinct values of the 'Venue Category' column
venue_categories = old_toronto_venues['Venue Category']
unique_venues = len(pd.unique(venue_categories))
print('There are {} unique categories.'.format(unique_venues))

There are 188 unique categories.


#### 3-4-3) Analyze Each borough

In [181]:
# one hot encoding: one indicator column per venue category, one row per venue
category_frame = old_toronto_venues[['Venue Category']]
old_toronto_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")
print('data frame size:{}'.format(old_toronto_onehot.shape))
old_toronto_onehot.head()

data frame size:(839, 188)


Unnamed: 0,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [182]:
# put the borough of every venue back next to its indicator columns
old_toronto_onehot = old_toronto_onehot.assign(Borough=old_toronto_venues['Borough'])
old_toronto_onehot.head()


Unnamed: 0,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio,Borough
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,East Toronto
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,East Toronto
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,East Toronto
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,East Toronto
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,East Toronto


In [183]:
# Move the Borough column to the first position.
# The column is selected by name (not as "whatever is last"), so running
# this cell a second time leaves the column order unchanged.
fixed_columns = ['Borough'] + [col for col in old_toronto_onehot.columns if col != 'Borough']
old_toronto_onehot = old_toronto_onehot[fixed_columns]
print('data frame size:{}'.format(old_toronto_onehot.shape))
old_toronto_onehot.head()

data frame size:(839, 189)


Unnamed: 0,Borough,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### 3-4-3-1) Group rows by Borough, taking the mean of the frequency of occurrence of each category


In [184]:
# mean of every indicator column per borough, with Borough kept as a regular column
old_toronto_onehot_grouped = old_toronto_onehot.groupby('Borough', as_index=False).mean()
print('data frame size:{}'.format(old_toronto_onehot_grouped.shape))
old_toronto_onehot_grouped.head()

data frame size:(4, 189)


Unnamed: 0,Borough,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Aquarium,Art Gallery,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,...,0.0,0.0,0.009091,0.009091,0.0,0.009091,0.0,0.009091,0.0,0.009091
1,Downtown Toronto,0.002058,0.002058,0.002058,0.004115,0.004115,0.004115,0.012346,0.002058,0.014403,...,0.00823,0.002058,0.0,0.002058,0.002058,0.012346,0.002058,0.004115,0.002058,0.0
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.009804,0.0,0.0,...,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0,0.0,0.029412
3,West Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,...,0.007092,0.0,0.0,0.0,0.0,0.007092,0.0,0.014184,0.007092,0.021277


##### 3-4-3-2) Print each borough along with its top 5 most common venues

In [185]:
num_top_venues = 5

# for every borough: print its venue categories ranked by mean frequency
for borou in old_toronto_onehot_grouped['Borough']:
    print("----"+borou+"----")
    is_current = old_toronto_onehot_grouped['Borough'] == borou
    # transpose the borough's single row into a two-column (venue, freq) table
    temp = old_toronto_onehot_grouped[is_current].T.reset_index()
    temp.columns = ['venue','freq']
    # the first transposed row holds the borough name itself, so it is skipped
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(temp.head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.07
1            Café  0.05
2            Park  0.05
3     Pizza Place  0.05
4  Sandwich Place  0.05


----Downtown Toronto----
         venue  freq
0         Café  0.08
1  Coffee Shop  0.08
2   Restaurant  0.04
3    Gastropub  0.03
4         Park  0.03


----East Toronto----
                venue  freq
0    Greek Restaurant  0.08
1  Italian Restaurant  0.05
2         Coffee Shop  0.05
3                Café  0.04
4      Ice Cream Shop  0.04


----West Toronto----
                venue  freq
0                Café  0.07
1                 Bar  0.07
2              Bakery  0.04
3         Coffee Shop  0.04
4  Italian Restaurant  0.04




##### 3-4-3-3) put above information into a pandas dataframe

In [186]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest values of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are ranked
    in descending order of their value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [187]:
num_top_venues = 10

# ordinal suffixes of the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in range(num_top_venues):
    # explicit bounds check instead of catching the IndexError of the list
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per borough
Borough_venues_sorted = pd.DataFrame(columns=columns)
Borough_venues_sorted['Borough'] = old_toronto_onehot_grouped['Borough']

# fill each row with the borough's most common venue categories, best first
for ind in range(old_toronto_onehot_grouped.shape[0]):
    Borough_venues_sorted.iloc[ind, 1:] = return_most_common_venues(old_toronto_onehot_grouped.iloc[ind, :], num_top_venues)

Borough_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Restaurant,Gym,Sushi Restaurant,Pub
1,Downtown Toronto,Café,Coffee Shop,Restaurant,Park,Gastropub,Hotel,Japanese Restaurant,Bakery,Italian Restaurant,Steakhouse
2,East Toronto,Greek Restaurant,Italian Restaurant,Coffee Shop,Café,Ice Cream Shop,Yoga Studio,Pizza Place,Park,Pub,Brewery
3,West Toronto,Café,Bar,Coffee Shop,Italian Restaurant,Bakery,Pizza Place,Breakfast Spot,Asian Restaurant,Bookstore,Yoga Studio


## 3-5) Cluster Neighborhoods

### 3-5-1) Run k-means to cluster the boroughs into 4 clusters.

In [189]:
# set number of clusters
kclusters = 4

# feature matrix: the per-borough category frequencies without the label column
# (the `columns=` keyword replaces the positional axis argument, which
# pandas 2.0 no longer accepts)
old_toronto_grouped_clustering = old_toronto_onehot_grouped.drop(columns='Borough')

# run k-means clustering; n_init is spelled out so the number of restarts
# does not depend on the installed scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(old_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:4]

array([0, 3, 2, 1], dtype=int32)

### 3-5-2) create a new dataframe that includes the cluster as well as the top 10 venues for each Borough.

In [190]:
# add clustering labels as the first column; a copy left by an earlier run of
# this cell is dropped first so the cell can be re-run without an error
if 'Cluster Labels' in Borough_venues_sorted.columns:
    Borough_venues_sorted = Borough_venues_sorted.drop(columns='Cluster Labels')
Borough_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach each borough's cluster label and top venues to every row of
# old_toronto_data, which carries the latitude/longitude
old_toronto_merged = old_toronto_data.join(Borough_venues_sorted.set_index('Borough'), on='Borough')

old_toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Greek Restaurant,Italian Restaurant,Coffee Shop,Café,Ice Cream Shop,Yoga Studio,Pizza Place,Park,Pub,Brewery
1,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188,2,Greek Restaurant,Italian Restaurant,Coffee Shop,Café,Ice Cream Shop,Yoga Studio,Pizza Place,Park,Pub,Brewery
2,M4L,East Toronto,"The Beaches West , India Bazaar",43.668999,-79.315572,2,Greek Restaurant,Italian Restaurant,Coffee Shop,Café,Ice Cream Shop,Yoga Studio,Pizza Place,Park,Pub,Brewery
3,M4M,East Toronto,Studio District,43.659526,-79.340923,2,Greek Restaurant,Italian Restaurant,Coffee Shop,Café,Ice Cream Shop,Yoga Studio,Pizza Place,Park,Pub,Brewery
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Coffee Shop,Sandwich Place,Park,Café,Pizza Place,Dessert Shop,Restaurant,Gym,Sushi Restaurant,Pub


### 3-5-3) visualise the resulting clusters

In [191]:
# create map centred on the geocoded location of Toronto
address = 'Toronto'
center_lat, center_lon = get_location(address)
map_clusters = folium.Map(location=[center_lat, center_lon], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map; rows that received no cluster label are skipped
labelled = old_toronto_merged.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(labelled['Latitude'], labelled['Longitude'], labelled['Borough'], labelled['Cluster Labels']):
    cluster = int(cluster)  # the label column holds floats if the join produced NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster k uses colour k directly, with no negative-index wraparound
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters