# IBM PROJECT

## IMPORTING PACKAGES 

In [83]:
from bs4 import BeautifulSoup 
import requests
import csv
import pandas as pd
import numpy as np 
import seaborn as sns
import scipy.stats as stats
import statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols
from IPython.display import display, HTML

import json
from pandas.io.json import json_normalize 

import geocoder # import geocoder

%matplotlib inline
import matplotlib as mpl 
import matplotlib.pyplot as plt

!pip install xlrd
import researchpy as rp
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor


from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library



distributed 1.21.7 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 19.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


## WEBSCRAPING WITH BEAUTIFULSOUL

So, first parse the website by using the "request.get()" fuction to get the html file

In [25]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

Here we use beautiful soul to read the html file.

In [26]:
soup = BeautifulSoup(source, "html5lib")

In [27]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

### PARSING THE DATA TO CSV FILE

Next, we filter out the the table we need from the html file. Then, I use  for loops to obtain every variable that is in the "td" tags. Finally, I pass the oupt into a list called "output_row" and then write that output to an csv file.

In [28]:
table =soup.find("table", class_ = "wikitable sortable")

output_rows = []
for table_row in table.findAll('tr'):
    columns = table_row.findAll('td')
    output_row = []
    for column in columns:
        output_row.append(column.text)
    output_rows.append(output_row)
    
with open('output.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(output_rows)


## IMPORTING DATA 

Import the csv file "outpot_row" as a dataframe. Assign headers and remove "\r" and "\n" from the end of the words in column Neighbourhood.

In [114]:
data = "C:\\Users\\user\\Documents\\Python\\IBM_Project\\output.csv"

In [115]:
df = pd.read_csv(data, header=None)

In [116]:
header = ['Postalcode', 'Borough', 'Neighborhood']

In [117]:
df.columns = header

In [118]:
df = df.replace(['\r', '\n'],'', regex=True)

## DATA OVERVIEW

So, here we see that the data as 288 rows and 3 columns.

In [119]:
df

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [120]:
df.shape

(288, 3)

## FILTER DATASET

Filter out all "Not assigned" rows in column Borough from the dataframe.

In [121]:
df_work = df[df["Borough"] != "Not assigned"]

In [122]:
df_work.shape

(211, 3)

In [123]:
df_work.head(211)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## JOINING NEIGHBOURHOODS BASED ON POSTALCODE

Collapse columns based on Postalcode; allowing Postalcode to have more than one neighbourhood.

In [124]:
df_work.groupby(['Postalcode', 'Borough'])["Neighborhood"].apply(','.join).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
Postalcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


## REMOVING "NOT aSSIGNED" FROM NEIGHBOURHOOD

Finally, replace "Not assigned" in column Neighbourhood with values in the same row in column Borough.

In [125]:
df_work['Neighbourhood'] = np.where(df_work['Neighborhood'] == "Not assigned", df_work['Borough'], df_work['Neighborhood'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [126]:
df_work

Unnamed: 0,Postalcode,Borough,Neighborhood,Neighbourhood
2,M3A,North York,Parkwoods,Parkwoods
3,M4A,North York,Victoria Village,Victoria Village
4,M5A,Downtown Toronto,Harbourfront,Harbourfront
5,M5A,Downtown Toronto,Regent Park,Regent Park
6,M6A,North York,Lawrence Heights,Lawrence Heights
7,M6A,North York,Lawrence Manor,Lawrence Manor
8,M7A,Queen's Park,Not assigned,Queen's Park
10,M9A,Etobicoke,Islington Avenue,Islington Avenue
11,M1B,Scarborough,Rouge,Rouge
12,M1B,Scarborough,Malvern,Malvern


In [127]:
df_work.shape

(211, 4)

In [128]:
data_geo = "C:\\Users\\user\\Documents\\Python\\IBM_Project\\Geospatial_Coordinates.csv"

In [129]:
df_geo = pd.read_csv(data_geo)

In [130]:
df_geo

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [131]:
neighborhood = pd.merge(df_work, df_geo)
neighborhood

Unnamed: 0,Postalcode,Borough,Neighborhood,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,Harbourfront,43.654260,-79.360636
3,M5A,Downtown Toronto,Regent Park,Regent Park,43.654260,-79.360636
4,M6A,North York,Lawrence Heights,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Not assigned,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,Malvern,43.806686,-79.194353


## AN OVERVIEW OF BOROUGHS THAT CONTAIN "TORONTO"

In [132]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhood['Borough'].unique()),
        neighborhood.shape[0]
    )
)

The dataframe has 11 boroughs and 211 neighborhoods.


Identify all boroughs that are in Toronto.

In [133]:
neighborhood["Borough"].unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

Filter out all the Borough that contains "Toronto"

In [134]:
toronto_data = neighborhood[(neighborhood['Borough'] == 'East Toronto') | (neighborhood['Borough'] == 'West Toronto') | (neighborhood['Borough'] == 'Central Toronto')].reset_index(drop=True)
toronto_data.head(50)

Unnamed: 0,Postalcode,Borough,Neighborhood,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,The Beaches,43.676357,-79.293031
1,M6H,West Toronto,Dovercourt Village,Dovercourt Village,43.669005,-79.442259
2,M6H,West Toronto,Dufferin,Dufferin,43.669005,-79.442259
3,M6J,West Toronto,Little Portugal,Little Portugal,43.647927,-79.41975
4,M6J,West Toronto,Trinity,Trinity,43.647927,-79.41975
5,M4K,East Toronto,The Danforth West,The Danforth West,43.679557,-79.352188
6,M4K,East Toronto,Riverdale,Riverdale,43.679557,-79.352188
7,M6K,West Toronto,Brockton,Brockton,43.636847,-79.428191
8,M6K,West Toronto,Exhibition Place,Exhibition Place,43.636847,-79.428191
9,M6K,West Toronto,Parkdale Village,Parkdale Village,43.636847,-79.428191


Finding the latitude and longitude coordinates for Toronto.

In [135]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Creating a map for Toronto that includes all the boroughs.

In [136]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## NOW, LETS SINGLE OUT ON BOROUGH FROM TORONTO

Let's explore East Toronto

In [138]:
east_toronto_data = neighborhood[neighborhood['Borough'] == 'East Toronto'].reset_index(drop=True)
east_toronto_data.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West,The Danforth West,43.679557,-79.352188
2,M4K,East Toronto,Riverdale,Riverdale,43.679557,-79.352188
3,M4L,East Toronto,The Beaches West,The Beaches West,43.668999,-79.315572
4,M4L,East Toronto,India Bazaar,India Bazaar,43.668999,-79.315572


Indentifying the coordinates for East Toronto.

In [139]:
address = 'East Toronto'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of East Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of East Toronto are 43.653963, -79.387207.


Create a map with markers to identify markers in East Toronto.

In [172]:
# create map of Manhattan using latitude and longitude values
map_east_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(east_toronto_data['Latitude'], east_toronto_data['Longitude'], east_toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_east_toronto)  
    
map_east_toronto

Using Foursquare.

In [142]:
CLIENT_ID = 'F54R335IJXWB4NL3LWZ5LZEH4MWSNYDEWMAGN0XUJXCTNVHV' # your Foursquare ID
CLIENT_SECRET = 'OV2NEHJYNHDK4MCEUJ5FAIQAY2BGRAMPFIPBHKQRENEYXRR3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: F54R335IJXWB4NL3LWZ5LZEH4MWSNYDEWMAGN0XUJXCTNVHV
CLIENT_SECRET:OV2NEHJYNHDK4MCEUJ5FAIQAY2BGRAMPFIPBHKQRENEYXRR3


Exploring the first neighborhood in East Toronto

In [143]:
east_toronto_data.loc[0, 'Neighborhood']

'The Beaches'

Obtaining the neighborhood's latitude and longitude valuse

In [175]:
neighborhood_latitude = east_toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = east_toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = east_toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


Now, let's get the top 100 venues that are in The Beaches within a radius of 500 meters.

In [145]:
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=F54R335IJXWB4NL3LWZ5LZEH4MWSNYDEWMAGN0XUJXCTNVHV&client_secret=OV2NEHJYNHDK4MCEUJ5FAIQAY2BGRAMPFIPBHKQRENEYXRR3&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [146]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5cce62684c1f67438f1e78a3'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad4c062f964a52011f820e3',
       'name': 'The Big Carrot Natural Food Market',
       'location': {'address': '125 Southwood Dr',
        'lat': 43.678879,
        'lng': -79.297734,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.678879,
          'lng': -79.297734}],
        'distance': 471,
        'postalCode': 'M4E 0B8',
   

Attaining the the categories of the venues

In [147]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Cleaning the json file and structuring it into a dataframe.

In [148]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
1,Grover Pub and Grub,Pub,43.679181,-79.297215
2,St-Denis Studios Inc.,Music Venue,43.675031,-79.288022
3,Upper Beaches,Neighborhood,43.680563,-79.292869


The number of venue returned by foursquare.

In [149]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


## EXPLORING NEIGHBORHOODS IN EAST TORONTO.

creating a function to repeat the same process to all the neighborhoods in Manhattan

In [150]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Running the above function on each neighborhood and create a new dataframe called manhattan_venues.

In [151]:
east_toronto_venues = getNearbyVenues(names=east_toronto_data['Neighbourhood'],
                                   latitudes=east_toronto_data['Latitude'],
                                   longitudes=east_toronto_data['Longitude']
                                  )

The Beaches
The Danforth West
Riverdale
The Beaches West
India Bazaar
Studio District
Business Reply Mail Processing Centre 969 Eastern


Checking the size of the dataframe 

In [178]:
print(east_toronto_venues.shape)
east_toronto_venues.head()

(190, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,St-Denis Studios Inc.,43.675031,-79.288022,Music Venue
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Danforth West,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [153]:
east_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17
India Bazaar,22,22,22,22,22,22
Riverdale,44,44,44,44,44,44
Studio District,37,37,37,37,37,37
The Beaches,4,4,4,4,4,4
The Beaches West,22,22,22,22,22,22
The Danforth West,44,44,44,44,44,44


In [154]:
print('There are {} uniques categories.'.format(len(east_toronto_venues['Venue Category'].unique())))

There are 68 uniques categories.


## ANALYZING EACH NEIGHBORHOOD 

In [155]:
# one hot encoding
east_toronto_onehot = pd.get_dummies(east_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
east_toronto_onehot['Neighborhood'] = east_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [east_toronto_onehot.columns[-1]] + list(east_toronto_onehot.columns[:-1])
east_toronto_onehot = east_toronto_onehot[fixed_columns]

east_toronto_onehot.head()

Unnamed: 0,Yoga Studio,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Bookstore,Brewery,Bubble Tea Shop,Burger Joint,...,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Sports Bar,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [156]:
east_toronto_onehot.shape

(190, 68)

Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [157]:
east_toronto_grouped = east_toronto_onehot.groupby('Neighborhood').mean().reset_index()
east_toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Bookstore,Brewery,Bubble Tea Shop,...,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Sports Bar,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,Business Reply Mail Processing Centre 969 Eastern,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.058824,0.0,...,0.0,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0
1,India Bazaar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,...,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0
2,Riverdale,0.022727,0.022727,0.0,0.022727,0.0,0.0,0.045455,0.022727,0.022727,...,0.0,0.0,0.0,0.0,0.022727,0.022727,0.0,0.0,0.022727,0.022727
3,Studio District,0.027027,0.054054,0.0,0.054054,0.027027,0.027027,0.027027,0.027027,0.0,...,0.027027,0.027027,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.0
4,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,The Beaches West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,...,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.045455,0.0
6,The Danforth West,0.022727,0.022727,0.0,0.022727,0.0,0.0,0.045455,0.022727,0.022727,...,0.0,0.0,0.0,0.0,0.022727,0.022727,0.0,0.0,0.022727,0.022727


In [158]:
east_toronto_grouped.shape

(7, 68)

Print each neighborhood along with the top 5 most common venues

In [159]:
num_top_venues = 5

for hood in east_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = east_toronto_grouped[east_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.12
1         Yoga Studio  0.06
2          Skate Park  0.06
3       Garden Center  0.06
4              Garden  0.06


----India Bazaar----
            venue  freq
0            Park  0.09
1  Sandwich Place  0.09
2     Pizza Place  0.05
3     Coffee Shop  0.05
4  Ice Cream Shop  0.05


----Riverdale----
                    venue  freq
0        Greek Restaurant  0.18
1             Coffee Shop  0.09
2          Ice Cream Shop  0.07
3      Italian Restaurant  0.05
4  Furniture / Home Store  0.05


----Studio District----
                 venue  freq
0                 Café  0.11
1          Coffee Shop  0.08
2               Bakery  0.05
3  American Restaurant  0.05
4   Italian Restaurant  0.05


----The Beaches----
                       venue  freq
0          Health Food Store  0.25
1                        Pub  0.25
2                Music Venue  0.25
3                Yoga Studio  0.0

Creating a Dataframe.

In [160]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [161]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = east_toronto_grouped['Neighborhood']

for ind in np.arange(east_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(east_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Farmers Market,Comic Shop,Burrito Place,Park,Pizza Place,Recording Studio,Restaurant,Brewery
1,India Bazaar,Park,Sandwich Place,Liquor Store,Food & Drink Shop,Sushi Restaurant,Gym,Coffee Shop,Ice Cream Shop,Italian Restaurant,Light Rail Station
2,Riverdale,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Trail,Diner,Indian Restaurant,Grocery Store
3,Studio District,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Ice Cream Shop,Gym / Fitness Center,Fish Market,Coworking Space,Convenience Store
4,The Beaches,Music Venue,Pub,Health Food Store,Trail,Dessert Shop,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Diner


## CLUSTER NEIGHBORHOODS

Creating five(5) clusters

In [162]:
# set number of clusters
kclusters = 5

east_toronto_grouped_clustering = east_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(east_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 2, 1, 4, 0, 2, 1])

In [163]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

east_toronto_merged = east_toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
east_toronto_merged = east_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

east_toronto_merged.head() # check the last columns!

Unnamed: 0,Postalcode,Borough,Neighborhood,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,The Beaches,43.676357,-79.293031,0,Music Venue,Pub,Health Food Store,Trail,Dessert Shop,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Diner
1,M4K,East Toronto,The Danforth West,The Danforth West,43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Trail,Diner,Indian Restaurant,Grocery Store
2,M4K,East Toronto,Riverdale,Riverdale,43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Trail,Diner,Indian Restaurant,Grocery Store
3,M4L,East Toronto,The Beaches West,The Beaches West,43.668999,-79.315572,2,Park,Sandwich Place,Liquor Store,Food & Drink Shop,Sushi Restaurant,Gym,Coffee Shop,Ice Cream Shop,Italian Restaurant,Light Rail Station
4,M4L,East Toronto,India Bazaar,India Bazaar,43.668999,-79.315572,2,Park,Sandwich Place,Liquor Store,Food & Drink Shop,Sushi Restaurant,Gym,Coffee Shop,Ice Cream Shop,Italian Restaurant,Light Rail Station


Visualizing the data.

In [170]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(east_toronto_merged['Latitude'], east_toronto_merged['Longitude'], east_toronto_merged['Neighborhood'], east_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## EXAMINING CLUSTERS 

In [165]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 0, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,-79.293031,0,Music Venue,Pub,Health Food Store,Trail,Dessert Shop,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Diner


In [166]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 1, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East Toronto,-79.352188,1,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Trail,Diner,Indian Restaurant,Grocery Store
2,East Toronto,-79.352188,1,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Trail,Diner,Indian Restaurant,Grocery Store


In [167]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 2, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,East Toronto,-79.315572,2,Park,Sandwich Place,Liquor Store,Food & Drink Shop,Sushi Restaurant,Gym,Coffee Shop,Ice Cream Shop,Italian Restaurant,Light Rail Station
4,East Toronto,-79.315572,2,Park,Sandwich Place,Liquor Store,Food & Drink Shop,Sushi Restaurant,Gym,Coffee Shop,Ice Cream Shop,Italian Restaurant,Light Rail Station


In [168]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 3, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,East Toronto,-79.321558,3,Light Rail Station,Yoga Studio,Farmers Market,Comic Shop,Burrito Place,Park,Pizza Place,Recording Studio,Restaurant,Brewery


In [169]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 4, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,East Toronto,-79.340923,4,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Ice Cream Shop,Gym / Fitness Center,Fish Market,Coworking Space,Convenience Store
