# Segmenting and Clustering Neighborhoods in Toronto

## 1. Scraping Data

#### Installing beautifulsoup4

##### BeautifulSoup is a package for parsing HTML, which we use to scrape the web page

In [2]:
!pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
Collecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.2 soupsieve-1.9.5


In [14]:
import bs4 as bs
import urllib
import urllib.request
import numpy as np
import pandas as pd

##### Here we define the URL, open the web page at that URL, and create a soup object to parse its HTML

In [15]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Parse inside the `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it, instead of staying open for the kernel's lifetime.
with urllib.request.urlopen(url) as page:
    soup = bs.BeautifulSoup(page, 'html.parser')

##### Here we look for the HTML element containing the data we need, and store its raw text in a variable

In [16]:
# Gather the text of every <table class="wikitable"> element into one raw string
wiki_tables = soup.findAll('table', {'class':'wikitable'})
content = "".join(tbl.text for tbl in wiki_tables)

## 2. Creating the dataframe

##### Here we are cleaning the string to put it in an np array

In [17]:
# Split the raw text on newlines, discard the empty strings,
# and keep the remaining cells as a flat NumPy array
cells = [cell for cell in content.split("\n") if cell]
content = np.asarray(cells)

##### We know the table has 3 columns but not how many rows, so we compute the row count by dividing the length of the array by 3

In [18]:
# Number of 3-cell rows contained in the flat array
len(content) // 3

288

We reshape our array so that we have an n × 3 matrix (one row per table record)

In [19]:
# Let NumPy infer the row count (-1) for 3 columns. Unlike computing the row
# count from len(content), this gives the same result when the cell is re-run
# on the already reshaped array, and it still raises if the number of cells is
# not a multiple of 3.
content = content.reshape(-1, 3)

##### Now we can create our dataframe

In [20]:
# The first row of the matrix holds the column names; every following row is a record
header, records = content[0], content[1:]
df = pd.DataFrame(records, columns=header)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


##### Here we remove the rows whose Borough is "Not assigned"

In [21]:
# Keep only the rows that have a real Borough, then renumber the index from 0
has_borough = df['Borough'].ne("Not assigned")
df = df[has_borough].reset_index(drop=True)

In [22]:
# Preview the filtered frame (head(12) requests up to the first 12 rows)
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


##### Then we group by Postcode and Borough so the Neighbourhoods are joined

In [23]:
# One group per (Postcode, Borough) pair; the string values of each group are
# joined with ', ' and the group keys are turned back into regular columns.
by_postcode = df.groupby(['Postcode', 'Borough'])
df = by_postcode.agg(', '.join).reset_index()

##### Now we need to find any remaining cells where Neighbourhood is not assigned:

In [24]:
# Print the position of every row whose Neighbourhood is still 'Not assigned'
check = df['Neighbourhood'] == 'Not assigned'
for position, is_unassigned in enumerate(check):
    if is_unassigned:
        print(position, " : ", is_unassigned)

93  :  True


In [25]:
# This is the only row where the Neighbourhood is not assigned
df.iloc[93]

Postcode                  M9A
Borough          Queen's Park
Neighbourhood    Not assigned
Name: 93, dtype: object

##### And we can replace the 'Not assigned' value with the Borough value

In [26]:
# Wherever the Neighbourhood is 'Not assigned', copy the Borough of the same row into it
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned, 'Neighbourhood'] = df.loc[unassigned, 'Borough']

In [27]:
# Display row 93 again to check its Neighbourhood after the replacement
df.iloc[93]

Postcode                  M9A
Borough          Queen's Park
Neighbourhood    Queen's Park
Name: 93, dtype: object

In [28]:
# (rows, columns) of the cleaned frame
df.shape

(103, 3)

# Part 2

In [1]:
!pip install geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
Collecting future (from geocoder)
Collecting requests (from geocoder)
  Using cached https://files.pythonhosted.org/packages/51/bd/23c926cd341ea6b7dd0b2a00aba99ae0f828be89d72b2190f27c11d4b7fb/requests-2.22.0-py2.py3-none-any.whl
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting click (from geocoder)
  Downloading https://files.pythonhosted.org/packages/fa/37/45185cb5abbc30d7257104c434fe0b07e5a195a6847506c074527aa599ec/Click-7.0-py2.py3-none-any.whl (81kB)
Collecting idna<2.9,>=2.5 (from requests->geocoder)
  Using cached https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl

In [29]:
import geocoder
import requests

In [30]:
# We could use this function to get the latitude and longitude of every postal code,
# but we are limited in the number of requests we can make per day.
def getLatLng(borough, postal_code, max_attempts=10):
    """Return the (latitude, longitude) reported by geocoder.google for a postal code.

    Parameters
    ----------
    borough : str
        Borough name, placed after the postal code in the query.
    postal_code : str
        Postal code, placed at the start of the query.
    max_attempts : int, optional
        Maximum number of geocoding requests to send before giving up
        (default 10). Without a bound the function would loop forever when
        the service never returns coordinates.

    Returns
    -------
    tuple
        ``(latitude, longitude)``, the first two items of the result's ``latlng``.

    Raises
    ------
    RuntimeError
        If none of the ``max_attempts`` requests returned coordinates.
    """
    # Pass the borough as a format argument rather than as part of the format
    # string, so a '{' or '}' in its text cannot break the formatting.
    address = '{}, {}'.format(postal_code, borough)

    # Retry until coordinates come back, but never more than max_attempts times
    for _ in range(max_attempts):
        g = geocoder.google(address)
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        'No coordinates returned for "{}" after {} attempts'.format(address, max_attempts)
    )

In [32]:
# Load the coordinates file and rename its 'Postal Code' column to 'Postcode'
# so that it carries the same key name as df
geodata = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code':'Postcode'})
)
geodata.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# Join the neighbourhood table with the coordinates on the shared Postcode column
# (default inner join), then report the resulting size and preview the rows
test = df.merge(geodata, on='Postcode')
print(test.shape)
test.head(20)

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
