## Segmenting and Clustering Neighborhoods in Toronto -PART 1

In [45]:
import pandas as pd
import numpy as np
import bs4 as bs
import urllib.request

In [46]:
source = urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').read()

In [47]:
soup = bs.BeautifulSoup(source,'html')

In [48]:
#title of the page
print(soup.title)

# get attributes:
print(soup.title.name)

# get values:
print(soup.title.string)

# beginning navigation:
print(soup.title.parent.name)

# getting specific values:
print(soup.p)

<title>List of postal codes of Canada: M - Wikipedia</title>
title
List of postal codes of Canada: M - Wikipedia
head
<p>This is a list of <a href="/wiki/Postal_codes_in_Canada" title="Postal codes in Canada">postal codes in Canada</a> where the first letter is M. Postal codes beginning with M are located within the city of <a href="/wiki/Toronto" title="Toronto">Toronto</a> in the province of <a href="/wiki/Ontario" title="Ontario">Ontario</a>. Only the first three characters are listed, corresponding to the Forward Sortation Area.
</p>


In [82]:
table = soup.find('table', { 'class' : 'wikitable' })


In [83]:
def tableDataText(table):    
    """Parses a html segment started with tag <table> followed 
    by multiple <tr> (table rows) and inner <td> (table data) tags. 
    It returns a list of rows with inner columns. 
    Accepts only one <th> (table header/data) in the first row.
    """
    def rowgetDataText(tr, coltag='td'): # td (data) or th (header)       
        return [td.get_text(strip=True) for td in tr.find_all(coltag)]  
    rows = []
    trs = table.find_all('tr') #how trs = # of rows
    headerrow = rowgetDataText(trs[0], 'th')
    if headerrow: # if there is a header row include first
        rows.append(headerrow)
        trs = trs[1:]
    for tr in trs: # for every table row
        rows.append(rowgetDataText(tr, 'td') ) # data row       
    return rows

In [84]:

list_table = tableDataText(table)
list_table[:20]
 

[['Postal code', 'Borough', 'Neighborhood'],
 ['M1A', 'Not assigned', ''],
 ['M2A', 'Not assigned', ''],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Regent Park / Harbourfront'],
 ['M6A', 'North York', 'Lawrence Manor / Lawrence Heights'],
 ['M7A', 'Downtown Toronto', "Queen's Park / Ontario Provincial Government"],
 ['M8A', 'Not assigned', ''],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Malvern / Rouge'],
 ['M2B', 'Not assigned', ''],
 ['M3B', 'North York', 'Don Mills'],
 ['M4B', 'East York', 'Parkview Hill / Woodbine Gardens'],
 ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', ''],
 ['M8B', 'Not assigned', ''],
 ['M9B',
  'Etobicoke',
  'West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale'],
 ['M1C', 'Scarborough', 'Rouge Hill / Port Union / Highland Creek']]

In [85]:
df = pd.DataFrame(list_table,columns =['Postal code','Borough','Neighborhood'])


In [86]:
df.drop(df.loc[df['Postal code']=="Postal code"].index, inplace=True)

In [87]:
df.drop(df.loc[df['Borough']=="Not assigned"].index,inplace=True)


In [88]:
dfgrouped = df.groupby('Postal code')

In [89]:
dfgrouped

<pandas.core.groupby.DataFrameGroupBy object at 0x0000021F7C64DD68>

In [90]:
df.replace(to_replace=r'/', value=',', regex=True)

Unnamed: 0,Postal code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park , Harbourfront"
6,M6A,North York,"Lawrence Manor , Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,"Malvern , Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill , Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [91]:
df.shape

(103, 3)

In [92]:
#Get value in specific row of df
x = df.loc[df['Postal code'] == "M5V"]
x

Unnamed: 0,Postal code,Borough,Neighborhood
140,M5V,Downtown Toronto,CN Tower / King and Spadina / Railway Lands / ...


## Segmenting and Clustering Neighborhoods in Toronto -- PART2

In [93]:
import geocoder # import geocoder
ind = 0
for ind in df.index: 
    # print(df['Name'][ind], df['Stream'][ind]) 
    # initialize your variable to None
    lat_lng_coords = None
    print("we are here", ind)
# loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format('Postal code'))
        print ('g=',g)
        lat_lng_coords = g.latlng
    
    print("found coords",ind)

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    df['latitude'] = latitude
    df['longitude'] = longitude


we are here 3
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>
g= <[REQUEST_DENIED] Google - Geocode [empty]>


KeyboardInterrupt: 

In [94]:
df

Unnamed: 0,Postal code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Malvern / Rouge
12,M3B,North York,Don Mills
13,M4B,East York,Parkview Hill / Woodbine Gardens
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [95]:
dfgeo = pd.read_csv("C:/Users/santjie.duplessis/Geospatial_Coordinates.csv")

In [96]:
#Get value in specific row of df
x = dfgeo.loc[dfgeo['Postal Code'] == "M5V"]
x

Unnamed: 0,Postal Code,Latitude,Longitude
68,M5V,43.628947,-79.39442


In [97]:
dfgeo.shape

(103, 3)

In [98]:
df_merged = pd.merge(df, dfgeo, left_on=['Postal code'],
              right_on=['Postal Code'],
              how='inner')

In [99]:
df_merged = df_merged.drop(['Postal Code'], axis=1)
df_merged

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,Parkview Hill / Woodbine Gardens,43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Segmenting and Clustering Neighborhoods in Toronto -- PART3