# Segmenting and clustering neighbourhoods in toronto

Firstly, let's download all the packages that we will require to complete the analysis
1.folium
2.seaborn

In [1]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 14.2MB/s eta 0:00:01
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [2]:
!pip install beautifulsoup4



In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import requests
import json
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline

In [5]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
resp = requests.get(url)
toronto_html = BeautifulSoup(resp.content)

In [6]:
soup = BeautifulSoup(str(toronto_html))
neighborhood_table = soup.find('table')
table_str = str(neighborhood_table.extract())
toronto_df = pd.read_html(table_str)[0]
toronto_df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [7]:
toronto_df.columns = ['PostalCode'] + list(toronto_df.columns)[1:]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [8]:
toronto_df.shape

(180, 3)

In [9]:
toronto_df.Borough.value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

In [10]:
borough_na = toronto_df[toronto_df.Borough == 'Not assigned']
toronto_df.drop(borough_na.index, inplace=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [11]:
toronto_df.shape

(103, 3)

combining the neighbourhoods with same postal codes 

In [12]:
toronto_df = toronto_df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(lambda x: ', '.join(x)).to_frame()
toronto_df.reset_index(inplace=True)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
toronto_df[toronto_df.Neighborhood == "Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [14]:
toronto_df.loc[85, 'Neighborhood'] = toronto_df.loc[85, 'Borough']
toronto_df[toronto_df.Neighborhood == "Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [15]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
geo_url = "https://cocl.us/Geospatial_data"

geocode_df = pd.read_csv(geo_url)
geocode_df.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
geocode_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
toronto_df = pd.merge(toronto_df, geocode_df, on='PostalCode')
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


exploring toronto dataset

In [18]:
toronto_latlong = [43.653225, -79.383186]

In [19]:
# create map of Manhattan using latitude and longitude values
toronto_map = folium.Map(location=toronto_latlong, zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='#2E7D32',
        fill=True,
        fill_color='#FFEB3B',
        fill_opacity=0.75,
        parse_html=False).add_to(toronto_map)

In [31]:
search_url = 'https://api.foursquare.com/v2/venues/search'
CLIENT_ID ='iPLZNCKKKLM41EKE10AB45JLOQYBUHWJP5SVWWJL0A2O3C5NP'# your Foursquare ID
CLIENT_SECRET = 'E2SXYPU4HYMHSSSGTILSC145BN00GCJMHY4IL3PNOLJM4QHQ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + "*****")
print('CLIENT_SECRET:' + "*****")

Your credentails:
CLIENT_ID: *****
CLIENT_SECRET:*****


In [32]:
toronto_df.loc[0, 'Borough']

'Scarborough'

In [33]:
print('Latitude and longitude values of {} are {}, {}.'.format(toronto_df.loc[0, 'Borough'], 
                                                               toronto_df.loc[0, 'Latitude'], 
                                                               toronto_df.loc[0, 'Longitude']))

Latitude and longitude values of Scarborough are 43.806686299999996, -79.19435340000001.


In [34]:
params = dict(
  client_id=CLIENT_ID,
  client_secret=CLIENT_SECRET,
  v=VERSION,
  ll=f"{toronto_df.loc[0, 'Latitude']}, {toronto_df.loc[0, 'Longitude']}",
  radius=500,
  limit=100
)

In [35]:
resp = requests.get(url=search_url, params=params)
data = resp.json()

In [36]:
data['response']['venues'][0]

KeyError: 'venues'

In [37]:
def get_venues(neighborhood, lat, long, limit=100, radius=250):
    params = dict(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        v=VERSION,
        ll=f"{lat}, {long}",
        radius=radius,
        limit=limit
    )
    venues = []
    data = requests.get(url=search_url, params=params).json()
    for v in data['response']['venues']:
        venues.append([neighborhood, lat, long, v['name'], v['location']['lat'], v['location']['lng'], 
                       ', '.join(x['name'] for x in v['categories'])])
    return venues

In [42]:
nbr_venues_df = pd.DataFrame()
columns=['Neighborhood', 'NeighborhoodLat', 'NeighborhoodLong', 'VenueName', 'VenueLat', 'VenueLong', 'VenueCategory']
for name, lat, long in zip(toronto_df.Neighborhood, toronto_df.Latitude, toronto_df.Longitude):
    nbr_venues_df = nbr_venues_df.append(get_venues(name, lat, long))
#nbr_venues_df.columns = columns

#nbr_venues_df.head()

KeyError: 'venues'

In [43]:
nbr_venues_df.to_csv('toronto_boroughs_and_venues.csv')