# Segmenting and Clustering Neighborhoods in Toronto

In [270]:
import os
import warnings

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize # transform json file into pandas dataframe
from sklearn.cluster import KMeans

#### Scrape data from Wikipedia

In [12]:
import requests
from lxml import html

In [87]:
# Fetch the Wikipedia page that lists the "M" postal codes. The timeout keeps
# the cell from hanging indefinitely, and raise_for_status() surfaces HTTP
# errors here instead of letting an error page be parsed as if it were data.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
tree = html.document_fromstring(response.text)

# Read the table one row at a time so the three columns always stay aligned.
# Querying each column with its own XPath can yield lists of different lengths
# (e.g. an empty cell produces no text() node), which makes pd.DataFrame(d)
# fail or, worse, pair values from different rows.
d = {'postcode': [], 'borough': [], 'neighborhood': []}
for row in tree.xpath("//table[@class = 'wikitable sortable']/tbody/tr"):
    cells = [cell.xpath('string(.)').replace('\n', '').strip() for cell in row.xpath('td')]
    if len(cells) < 3:
        # Rows without three <td> cells carry no record (presumably the header row).
        continue
    d['postcode'].append(cells[0])
    d['borough'].append(cells[1])
    d['neighborhood'].append(cells[2])

df = pd.DataFrame(d)

In [236]:
# Preview the first 20 rows of the postcode table.
df.head(20)

Unnamed: 0,postcode,borough,neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### Drop records with borough "Not assigned"

In [89]:
# Keep only the rows whose borough is something other than "Not assigned".
has_borough = df.borough != 'Not assigned'
df = df.loc[has_borough].copy()

#### Replace the neighborhood name with the borough name where the neighborhood is still "Not assigned"

In [94]:
# Use the borough name wherever the neighborhood is still "Not assigned".
# A boolean mask updates every affected row in one vectorized assignment
# instead of materialising a row Series per index label in a Python loop.
unnamed = df['neighborhood'] == 'Not assigned'
df.loc[unnamed, 'neighborhood'] = df.loc[unnamed, 'borough']

In [180]:
# Empty frame that fixes the column layout of the per-postcode result.
result_df = pd.DataFrame(columns=['Postcode', 'Borough', 'Neighborhood'])
# Group the cleaned rows by postcode.
grouped_df = df.groupby(by='postcode')

In [190]:
# Collapse each postcode group into a single record: the borough of its first
# row plus all of its neighborhood names joined with commas. The records are
# collected in a list and converted to a frame once, because DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every iteration.
# Building result_df from scratch also means re-running this cell no longer
# duplicates rows.
postcode_records = [
    {
        'Postcode': postcode,
        'Borough': group['borough'].iloc[0],
        'Neighborhood': ','.join(group['neighborhood'].values),
    }
    for postcode, group in grouped_df
]
result_df = pd.DataFrame(postcode_records, columns=['Postcode', 'Borough', 'Neighborhood'])

In [192]:
# Print the (rows, columns) shape, then display the frame itself.
print(result_df.shape)
result_df


(103, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


#### Reading geographical coordinates

In [230]:
# Load the coordinates file and rename its columns to the names used by the
# rest of the notebook.
coordinate_columns = ['Postcode', 'Latitude', 'Longitude']
geo_df = pd.read_csv('Geospatial_Coordinates.csv')
geo_df.columns = coordinate_columns

In [233]:
# Attach Latitude/Longitude to each row by looking up its Postcode in geo_df.
coordinates_by_postcode = geo_df.set_index('Postcode')
result_df = result_df.join(coordinates_by_postcode, on='Postcode')

In [264]:
# Count the comma-separated neighborhood names stored for each postcode.
result_df['n_neighborhood'] = [len(names.split(',')) for names in result_df['Neighborhood']]

In [265]:
# Display the postcode table with its coordinates and neighborhood counts.
result_df

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,n_neighborhood
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,2
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,3
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,3
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476,1
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029,3
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577,3
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476,3
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848,2


#### Visualization of neighborhoods' location

In [252]:
# Number of distinct values in the Borough column.
result_df.Borough.unique().size
    

11

In [261]:
# Base map on which the postcode zones are drawn.
toronto = folium.Map([43.7153834, -79.40567840000001],zoom_start=11)

# One circle (radius=200) per postcode row; the popup shows the row's borough.
for latitude, longitude, boro in zip(result_df.Latitude, result_df.Longitude, result_df.Borough):
    folium.Circle([latitude, longitude], radius=200, popup=str(boro)).add_to(toronto)
toronto

#### Using the Foursquare API to explore these regions

In [271]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: the key pair that used to be hardcoded in this cell has been exposed
# and should be revoked and replaced.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605'  # sent as the `v` query parameter of each request

if not CLIENT_ID or not CLIENT_SECRET:
    # Warn early: without credentials the API calls further down cannot succeed.
    warnings.warn('Foursquare credentials are not set; define FOURSQUARE_CLIENT_ID '
                  'and FOURSQUARE_CLIENT_SECRET in the environment.')

Create the GET request URL

#### Let's take a look at M6G, Downtown Toronto

In [294]:
LIMIT = 100   # sent as the `limit` query parameter
radius = 500  # sent as the `radius` query parameter
# Point the venue search is centred on (passed as the `ll` parameter).
neighborhood_latitude = 43.669542
neighborhood_longitude = -79.422564
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP failures (bad credentials, quota exceeded)
# here rather than as a confusing KeyError when the payload is unpacked later.
response_4square = requests.get(url, timeout=30)
response_4square.raise_for_status()
result_4square = response_4square.json()



In [295]:
# Inspect the raw JSON payload returned by the request above.
result_4square

{'meta': {'code': 200, 'requestId': '5d586775e0c0c90038a33035'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4adcfd7cf964a5203e6321e3-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/food_grocery_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d118951735',
         'name': 'Grocery Store',
         'pluralName': 'Grocery Stores',
         'primary': True,
         'shortName': 'Grocery Store'}],
       'id': '4adcfd7cf964a5203e6321e3',
       'location': {'address': '200 Christie St',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'at Essex St',
        'distance': 205,
        'formattedAddress': ['200 Christie St (at Essex St)',
         'Toronto ON M6G 3B6',
         'Canada'],
        'labeled

In [296]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of a venue's first category, or None if it has none.

    ``row`` is any object indexable by key (for example a DataFrame row or a
    dict). The category list is read from ``row['categories']``; if that key
    is missing, ``row['venue.categories']`` is used instead.

    Only a missing key triggers the fallback. Any other error raised while
    reading ``row`` propagates to the caller instead of being silently
    swallowed, as a bare ``except`` would do.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [297]:
# Columns of the flattened venue table that the rest of the notebook uses.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']

# Take the items of the first result group, flatten the nested JSON into a
# table, and keep only the selected columns.
venues = result_4square['response']['groups'][0]['items']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

In [298]:
# Show the flattened venue table.
nearby_venues

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Fiesta Farms,"[{'id': '4bf58dd8d48988d118951735', 'name': 'G...",43.668471,-79.420485
1,Contra Cafe,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",43.669107,-79.426105
2,Vinny’s Panini,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",43.670679,-79.426148
3,Starbucks,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",43.671585,-79.421366
4,Universal Grill,"[{'id': '4bf58dd8d48988d147941735', 'name': 'D...",43.67055,-79.426541
5,Actinolite,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",43.667858,-79.428054
6,Scout and Cash Caffe,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",43.66736,-79.419938
7,Faema Caffe,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",43.671046,-79.419297
8,Loblaws,"[{'id': '4bf58dd8d48988d118951735', 'name': 'G...",43.671807,-79.421102
9,Marlenes Just Babies,"[{'id': '52f2ab2ebcbc57f1066b8b32', 'name': 'B...",43.671824,-79.420499


In [299]:
# Replace each row's category list with the value get_category_type returns for it.
category_names = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = category_names

# Keep only the part of every column name that follows its last dot.
nearby_venues.columns = [col.rpartition('.')[2] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Fiesta Farms,Grocery Store,43.668471,-79.420485
1,Contra Cafe,Café,43.669107,-79.426105
2,Vinny’s Panini,Italian Restaurant,43.670679,-79.426148
3,Starbucks,Coffee Shop,43.671585,-79.421366
4,Universal Grill,Diner,43.67055,-79.426541


In [300]:
# Show the full cleaned venue table.
nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Fiesta Farms,Grocery Store,43.668471,-79.420485
1,Contra Cafe,Café,43.669107,-79.426105
2,Vinny’s Panini,Italian Restaurant,43.670679,-79.426148
3,Starbucks,Coffee Shop,43.671585,-79.421366
4,Universal Grill,Diner,43.67055,-79.426541
5,Actinolite,Restaurant,43.667858,-79.428054
6,Scout and Cash Caffe,Café,43.66736,-79.419938
7,Faema Caffe,Café,43.671046,-79.419297
8,Loblaws,Grocery Store,43.671807,-79.421102
9,Marlenes Just Babies,Baby Store,43.671824,-79.420499


#### Visualize the nearby venues

In [313]:
# Map with one marker per nearby venue; each popup shows the venue's name
# and category.
center_map=folium.Map([43.668471,-79.420485],zoom_start=15)
for venue_name, venue_category, venue_lat, venue_lng in zip(
        nearby_venues['name'], nearby_venues.categories, nearby_venues.lat, nearby_venues.lng):
    popup_text = str(venue_name)+' \nCategory:'+str(venue_category)
    folium.Marker([venue_lat, venue_lng], popup=popup_text).add_to(center_map)

center_map