In [1]:
import numpy as np

import pandas as pd 
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json
from geopy.geocoders import Nominatim 
import requests 
from bs4 import BeautifulSoup 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

print("Libraries imported.")

Libraries imported.


In [2]:
# send the GET request
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [4]:
# create three lists to store table data
postalCodeList = []
boroughList = []
neighborhoodList = []

In [5]:
# append the data into the respective lists
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text.strip('\n'))
        boroughList.append(cells[1].text.strip('\n'))
        neighborhoodList.append(cells[2].text.rstrip('\n')) # avoid new lines in neighborhood cell

<h5> Creating the dataframe

In [6]:
# create a new DataFrame from the three lists
toronto_df = pd.DataFrame({"PostalCode": postalCodeList})
toronto_df["Borough"] = boroughList
toronto_df["Neighborhood"] = neighborhoodList
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<h5> Ignoring cells with a borough that is "Not assigned".

In [7]:
toronto_df_drop = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


<h5> Combining rows with the same postal code into one with the neighborhoods sepearted with a comma 

In [8]:
toronto_df_grouped = toronto_df_drop.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h5> Making the neighborhood same as the borough for cells that have a borough but a "Not assigned" neighborhood

In [9]:
for index, row in toronto_df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<h5> Verifying if the dataframe is the same as shown in the question

In [10]:
column_names = ["PostalCode", "Borough", "Neighborhood"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(toronto_df_grouped[toronto_df_grouped["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


<h5> Printing the number of rows in the dataframe

In [11]:
toronto_df_grouped.shape

(103, 3)

<h5> Reading the CSV file via panda

In [12]:
coordinates = pd.read_csv('https://cocl.us/Geospatial_data')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
coordinates.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<h5> Merging the data

In [14]:
toronto_df_new = toronto_df_grouped.merge(coordinates, on="PostalCode", how="left")
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<h5> Verifying the coordinates to make sure they are as required by the question

In [15]:
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(toronto_df_new[toronto_df_new["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


<h5> Latitude and Longitude

In [18]:
address = 'Toronto'

geolocator = Nominatim(user_agent="Assignment")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


<h5> Creating map with markers

In [19]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

<h5> Exploring neighborhoods in Toronto

In [20]:
borough_names = list(toronto_df_new.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [21]:
# create a new DataFrame with only boroughs that contain the word Toronto
toronto_df_new = toronto_df_new[toronto_df_new['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
print(toronto_df_new.shape)
toronto_df_new.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [22]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

In [23]:
CLIENT_ID = 'TTVTM0PGMSUVEAXRSM3TSQ03SSKDA2VS4WNMRAXMCJ5RBF0I' 
CLIENT_SECRET = 'BHMPDJ4G2LSIH5NUWBD0UMI3PF5D4FCJNFD4NTR23UCXG1NY'  # your Foursquare Secret
VERSION = '20180605'

In [24]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['PostalCode'], toronto_df_new['Borough'], 
                                                  toronto_df_new['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id=ZYLC4Q3I000O4R32DVJWJJTOTHCGC4O02TXYEPLDAS211SPQ&client_secret=OPKHF1MTRWKRHVR2DAV0IT1IK2H2XZDXJYTCNHVY5L44T55H&v=20180605 \
     &ll=43.653963,-79.387207&radius=500&limit=100".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

<h5> Converting the venues list to a new dataframe

In [25]:
venues_df = pd.DataFrame(venues)

venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(1833, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Downtown Toronto,43.653232,-79.385296,Neighborhood
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,Japango,43.655268,-79.385165,Sushi Restaurant
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Poke Guys,43.654895,-79.385052,Poke Place
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Textile Museum of Canada,43.654396,-79.3865,Art Museum
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,Cafe Plenty,43.654571,-79.38945,Café


<h5> Checking the number of venues returned

In [26]:
venues_df.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,47,47,47,47,47,47
M4K,East Toronto,"The Danforth West, Riverdale",47,47,47,47,47,47
M4L,East Toronto,"India Bazaar, The Beaches West",47,47,47,47,47,47
M4M,East Toronto,Studio District,47,47,47,47,47,47
M4N,Central Toronto,Lawrence Park,47,47,47,47,47,47
M4P,Central Toronto,Davisville North,47,47,47,47,47,47
M4R,Central Toronto,"North Toronto West, Lawrence Park",47,47,47,47,47,47
M4S,Central Toronto,Davisville,47,47,47,47,47,47
M4T,Central Toronto,"Moore Park, Summerhill East",47,47,47,47,47,47
M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",47,47,47,47,47,47


<h5> Analysing each area

In [27]:
# one hot encoding
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = venues_df['PostalCode'] 
toronto_onehot['Borough'] = venues_df['Borough'] 
toronto_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(1833, 38)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Art Gallery,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot,Bubble Tea Shop,Café,Coffee Shop,Concert Hall,Dessert Shop,Donut Shop,Food Court,French Restaurant,Gastropub,Gift Shop,Hotel,Japanese Restaurant,Juice Bar,Korean Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Neighborhood,New American Restaurant,Park,Plaza,Poke Place,Pub,Salon / Barbershop,Seafood Restaurant,Smoke Shop,Sushi Restaurant,University,Vegetarian / Vegan Restaurant
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,M4E,East Toronto,The Beaches,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<h5> Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [28]:
toronto_grouped = toronto_onehot.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()

print(toronto_grouped.shape)
toronto_grouped

(39, 38)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Art Gallery,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot,Bubble Tea Shop,Café,Coffee Shop,Concert Hall,Dessert Shop,Donut Shop,Food Court,French Restaurant,Gastropub,Gift Shop,Hotel,Japanese Restaurant,Juice Bar,Korean Restaurant,Miscellaneous Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Neighborhood,New American Restaurant,Park,Plaza,Poke Place,Pub,Salon / Barbershop,Seafood Restaurant,Smoke Shop,Sushi Restaurant,University,Vegetarian / Vegan Restaurant
0,M4E,East Toronto,The Beaches,0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
1,M4K,East Toronto,"The Danforth West, Riverdale",0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
2,M4L,East Toronto,"India Bazaar, The Beaches West",0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
3,M4M,East Toronto,Studio District,0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
4,M4N,Central Toronto,Lawrence Park,0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
5,M4P,Central Toronto,Davisville North,0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
7,M4S,Central Toronto,Davisville,0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
8,M4T,Central Toronto,"Moore Park, Summerhill East",0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",0.06383,0.021277,0.021277,0.021277,0.021277,0.021277,0.06383,0.12766,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.042553,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.021277,0.042553,0.021277,0.021277


<h5> Creating a new dataframe and displaying the top 10 venues for each postal code

In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_grouped['Neighborhoods']

for ind in np.arange(toronto_grouped.shape[0]):
    row_categories = toronto_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

# neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted

(39, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
1,M4K,East Toronto,"The Danforth West, Riverdale",Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
2,M4L,East Toronto,"India Bazaar, The Beaches West",Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
3,M4M,East Toronto,Studio District,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
4,M4N,Central Toronto,Lawrence Park,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
5,M4P,Central Toronto,Davisville North,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
7,M4S,Central Toronto,Davisville,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
8,M4T,Central Toronto,"Moore Park, Summerhill East",Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot


<h3> Clustering

In [30]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [31]:
#create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
toronto_merged = toronto_df_new.copy()

# add clustering labels
toronto_merged["Cluster Labels"] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.drop(["Borough", "Neighborhoods"], 1).set_index("PostalCode"), on="PostalCode")

print(toronto_merged.shape)
toronto_merged.head() # check the last columns!

(39, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot


In [32]:
# sort the results by Cluster Labels
print(toronto_merged.shape)
toronto_merged.sort_values(["Cluster Labels"], inplace=True)
toronto_merged

(39, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
22,M5N,Central Toronto,Roselawn,43.711695,-79.416936,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
23,M5P,Central Toronto,"Forest Hill North & West, Forest Hill Road Park",43.696948,-79.411307,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
24,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
25,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
26,M5T,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",43.653206,-79.400049,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
27,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
20,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
28,M5W,Downtown Toronto,Stn A PO Boxes,43.646435,-79.374846,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot


<h5> Visulaizing the clusters

In [33]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h5> Checking the clusters

<h5> Cluster 1

In [34]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + \
                                                                                 list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
21,Downtown Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
22,Central Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
23,Central Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
24,Central Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
25,Downtown Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
26,Downtown Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
27,Downtown Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
20,Downtown Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot
28,Downtown Toronto,0,Coffee Shop,Café,Art Gallery,Sushi Restaurant,Japanese Restaurant,Hotel,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot


<h5> Cluster 2

In [35]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + \
                                                                                 list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


<h5> Cluster 3

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + \
                                                                                 list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


<h5> Cluster 4

In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + \
                                                                                 list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


<h5> Cluster 5

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + \
                                                                                 list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


<h3> Conclusion
<h5> Most of the neighborhoods fall in the first cluster