#  Segmenting and Clustering Neighbourhoods in Toronto

## *Part 1*:

### import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import requests

1. Get the HTML page of Wiki, and using read_html we convert the html data into list of Data frame objects.

2. Remove cells which have borough not assigned.

In [2]:
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_page = requests.get(wiki)

wiki_raw = pd.read_html(wiki_page.content, header = 0)[0]
df = wiki_raw[wiki_raw.Neighborhood != 'Not assigned']
df.reset_index(inplace = True)
df.head()

Unnamed: 0,index,Post Code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
df.rename(columns={'Post Code' : 'Postal Code'}, inplace= True)

In [6]:
df.groupby(['Postal Code']).first()

Unnamed: 0_level_0,index,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,9,Scarborough,"Malvern, Rouge"
M1C,18,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,27,Scarborough,"Guildwood, Morningside, West Hill"
M1G,36,Scarborough,Woburn
M1H,45,Scarborough,Cedarbrae
...,...,...,...
M9N,98,York,Weston
M9P,107,Etobicoke,Westmount
M9R,116,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
M9V,143,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


#### The new Wiki link already has nieghborhood merged according to Postal Code and Borough

In [7]:
len(df['Postal Code'].unique())

103

In [8]:
df[df['Borough'] == 'Not assigned']

Unnamed: 0,index,Postal Code,Borough,Neighborhood


In [9]:
df = df.reset_index()
df.drop(['index'], axis = 'columns', inplace = True)

df.head(10)

Unnamed: 0,level_0,Postal Code,Borough,Neighborhood
0,0,M3A,North York,Parkwoods
1,1,M4A,North York,Victoria Village
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,6,M1B,Scarborough,"Malvern, Rouge"
7,7,M3B,North York,Don Mills
8,8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
df.shape

(103, 4)

## *Part 2* :

In [11]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [12]:
import geocoder

In [13]:
url = 'http://cocl.us/Geospatial_data'

In [14]:
df_geo = pd.read_csv(url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Checking dtypes of our columns

In [15]:
df_geo.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [16]:
df.dtypes

level_0          int64
Postal Code     object
Borough         object
Neighborhood    object
dtype: object

#### Shapes of both our Dataframes

In [17]:
df.shape

(103, 4)

In [18]:
df_geo.shape

(103, 3)

#### Joining both df's and doing some cleaning

In [19]:
df = df.join(df_geo.set_index('Postal Code'), on='Postal Code')
df

Unnamed: 0,level_0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
98,98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [20]:
df.shape

(103, 6)

## *Part 3 :*

Using the foursquere API to segment and cluster the neighborhoods of Toronto

In [45]:
!conda install -c conda-forge geocoder --yes


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [21]:
import geocoder
from geopy.geocoders import Nominatim 

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Printing Map

In [24]:
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [22]:
import folium

# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        ).add_to(map_Toronto)  
    
map_Toronto

#### Define Foursquare parametrs

In [23]:
CLIENT_ID = 'RAL3CFPIP3YUAMLDK0A14B1GEJ0FT4MBORICVKKYZ4KQKMCJ' # your Foursquare ID
CLIENT_SECRET = 'AFNXJCGKUP2OYAIU3P4QULJ5BD4ZUJ253BTEG5YJUA1DZGTJ' # your Foursquare Secret
VERSION = '20180604' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RAL3CFPIP3YUAMLDK0A14B1GEJ0FT4MBORICVKKYZ4KQKMCJ
CLIENT_SECRET:AFNXJCGKUP2OYAIU3P4QULJ5BD4ZUJ253BTEG5YJUA1DZGTJ


#### Explore the data, and get the venues in 500 meters range from our first entry

In [24]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


#### Create the GET request URL

In [25]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=RAL3CFPIP3YUAMLDK0A14B1GEJ0FT4MBORICVKKYZ4KQKMCJ&client_secret=AFNXJCGKUP2OYAIU3P4QULJ5BD4ZUJ253BTEG5YJUA1DZGTJ&v=20180604&ll=43.7532586,-79.3296565&radius=500&limit=100'

In [27]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f159c249bded205cf01afe8'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

In [28]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']


In [29]:
import json
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114
