# Segmenting and Clustering Neighborhoods in Toronto
## Part1. Collect data from wikipedia and store data into a pandas dataframe

### Load libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import requests
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
%matplotlib inline

### Collect data 

In [2]:
# Scrape data from wikipedia page with BeautifulSoup
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
tbody = table.find('tbody')
rows = tbody.find_all('tr')
data = []
for row in rows:
    tds = row.find_all('td')
    tds = [td.text.strip() for td in tds]
    data.append([td for td in tds if td])
data

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M

In [3]:
# convert the scraped data above to dataframe and set column names.
df = pd.DataFrame(data, columns = ['PostalCode', 'Borough', 'Neighborhood'])
print('dataset size: ', df.shape)
df.head()

dataset size:  (289, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Explore and Clean Data

In [4]:
# The values of the first row  are all none, we drop rows whose values are all none
df.dropna(axis=0, how='all', inplace=True)
print('dataset size: ', df.shape)
df.head()

dataset size:  (288, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [5]:
# get a subset of data where the value of Borough is not 'Not assigned'
# df.Borough.value_counts()
df = df[df.Borough != 'Not assigned']
df.Borough.value_counts()

Etobicoke           45
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

In [6]:
# combine rows with same postalcode to one row,seperate Neighborhood by ','.
df = df.groupby(['PostalCode', 'Borough'], sort = False).agg(lambda x: ','.join(x))
df.reset_index(inplace= True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [7]:
# check the lines where the neighborhood value is 'Not assigned'
df[df.Neighborhood == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M7A,Queen's Park,Not assigned


In [8]:
# where the neighborhood value is 'Not assigned', assign the Borough value to neighborhood value
df.loc[df.Neighborhood == 'Not assigned', 'Neighborhood'] = df[df.Neighborhood == 'Not assigned']['Borough']
df[df.Neighborhood == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [9]:
# check again the size of the new dataset
df.shape

(103, 3)

## Part2. Add coordinates to dataset

In [10]:
# load latitude and longitude dataset
ll_df = pd.read_csv('Geospatial_Coordinates.csv')
ll_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# merge latitude and longitude info to dataset
df = df.merge(ll_df, how = 'left', left_on = 'PostalCode', right_on = 'Postal Code')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


In [12]:
# drop column 'Postal Code'
df.drop('Postal Code', axis = 1, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
