# <center>Applied Data Science Capstone - Peer-graded Assignment 2</center>
# <center>Segmenting and Clustering Neighborhoods in Toronto </center>

### 1. Get postal codes from Wikipedia

In [1]:
import requests

# Download the Wikipedia page that lists the postal codes starting with "M":
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html = response.content

In [2]:
from scrapy import Selector

# Wrap the downloaded page in a selector and collect the table rows (<tr>)
# matched by the CSS query below:
ROW_QUERY = 'table.wikitable > tbody > tr'
sel = Selector(text=html)
rows = sel.css(ROW_QUERY)

In [3]:
# Create list to store parsed rows (one dict per table row that is kept):
table_rows = []

# Iterate over rows and extract information:
for row in rows[1:]:  # Omit the first row, it is the table header
    # Read every <td> on its own and glue its text nodes together. Indexing a
    # flat list of all text nodes in the row would shift the columns whenever
    # a cell holds several text nodes (e.g. a link followed by plain text) or
    # none at all. split()/join() also normalises newlines and stray spaces.
    cells = [' '.join(''.join(td.css('::text').extract()).split())
             for td in row.css('td')]

    # Skip rows that don't carry the three expected columns:
    if len(cells) < 3:
        continue
    postcode, borough, neighbor = cells[:3]

    # Keep only codes where Borough is assigned:
    if not borough or borough == 'Not assigned':
        continue

    # If Neighbourhood isn't assigned, replace it with Borough name:
    if not neighbor or neighbor == 'Not assigned':
        neighbor = borough

    table_rows.append(
        {'Postcode'     : postcode,
         'Borough'      : borough,
         'Neighbourhood': neighbor
        }
    )

In [4]:
import pandas as pd

# Build the DataFrame from the parsed records, fixing the column order explicitly:
column_order = ['Postcode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(data=table_rows, columns=column_order)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [5]:
# Collapse rows that share a postcode and borough into a single row whose
# neighbourhood names are joined into one comma-separated string:
neighbourhoods_by_code = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
result = neighbourhoods_by_code.apply(', '.join).reset_index()
result.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 2. Get coordinates

In [6]:
# !pip3 install geocoder

In [7]:
import geocoder

# for pcode in result.Postcode.unique():
#     answer = None
#     while(answer is None):
#         answer = geocoder.google('{}, Toronto, Ontario'.format(pcode)).latlng
#     print('Successfully retrieved coordinates for {}'.format(pcode))
#     result.loc[result.Postcode == pcode, 'Latitude'] = answer[0]
#     result.loc[result.Postcode == pcode, 'Longitude'] = answer[1]

Geocoder didn't return any results, so let's try another way:

In [8]:
# !pip3 install geopy

In [10]:
from geopy.geocoders import Nominatim

# Work on a copy so the grouped dataframe itself stays untouched:
result_c = result.copy()

geolocator = Nominatim(user_agent="foursquare_agent")

for pcode in result_c.Postcode.unique():
    # geocode() returns None when the service has no match for the query,
    # so test for that explicitly instead of catching every exception.
    location = geolocator.geocode('{}, Toronto, Ontario'.format(pcode))
    if location is None:
        print('Didn\'t find coordinates for {}'.format(pcode))
        continue

    # Build the row mask from result_c (the frame being written to) rather
    # than relying on `result` happening to share the same index.
    rows_for_code = result_c.Postcode == pcode
    result_c.loc[rows_for_code, 'Latitude']  = location.latitude
    result_c.loc[rows_for_code, 'Longitude'] = location.longitude
    print('Successfully retrieved coordinates for {}'.format(pcode))

Successfully retrieved coordinates for M1B
Successfully retrieved coordinates for M1C
Didn't find coordinates for M1E
Successfully retrieved coordinates for M1G
Didn't find coordinates for M1H
Didn't find coordinates for M1J
Didn't find coordinates for M1K
Didn't find coordinates for M1L
Didn't find coordinates for M1M
Didn't find coordinates for M1N
Didn't find coordinates for M1P
Didn't find coordinates for M1R
Didn't find coordinates for M1S
Didn't find coordinates for M1T
Didn't find coordinates for M1V
Successfully retrieved coordinates for M1W
Didn't find coordinates for M1X
Didn't find coordinates for M2H
Successfully retrieved coordinates for M2J
Didn't find coordinates for M2K
Didn't find coordinates for M2L
Successfully retrieved coordinates for M2M
Successfully retrieved coordinates for M2N
Didn't find coordinates for M2P
Didn't find coordinates for M2R
Didn't find coordinates for M3A
Didn't find coordinates for M3B
Successfully retrieved coordinates for M3C
Successfully ret

That didn't work well either: Nominatim found coordinates for only a few of the postcodes. We'll have to use the provided CSV file to get the coordinates instead.

In [11]:
# Load the provided lookup table of postal-code coordinates:
coordinates_csv = 'Geospatial_Coordinates.csv'
coord = pd.read_csv(coordinates_csv)
coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Attach Latitude/Longitude to every postcode with a left join, so postcodes
# missing from the coordinates file are kept (with NaN coordinates).
# This cell overwrites `result`; dropping any coordinate columns left over
# from a previous run keeps it idempotent. Without that, re-running the cell
# would produce Latitude_x/Latitude_y and Longitude_x/Longitude_y columns.
result = pd.merge(left=result.drop(columns=['Latitude', 'Longitude'], errors='ignore'),
                  right=coord,
                  left_on='Postcode',
                  right_on='Postal Code',
                  how='left').drop(columns='Postal Code')  # duplicate of the Postcode key

result.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
