## Capstone Project Week 3 - Segmenting and Clustering Neighborhoods in Toronto (Parts 1 & 2 of 3)

## Web scraping using BeautifulSoup

In [3]:
import requests
from bs4 import BeautifulSoup

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis

In [4]:
# Download the Wikipedia page and parse it with BeautifulSoup.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
result = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
# Stop here on an HTTP error status instead of parsing an error page as if it were the data.
result.raise_for_status()
src = result.content
soup = BeautifulSoup(src, 'html.parser')

### Grab the first table on the page - the postal code table

In [5]:
# Collect every <table> element of the parsed page and keep the first one
all_tables = soup.find_all('table')
postalcode_html = all_tables[0]

### Put html table into dataframe

In [6]:
# Build one list of cell texts per table row, then load them into a dataframe.
table_rows = postalcode_html.find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    # Rows without any <td> cell (presumably the header row) carry no data;
    # skipping them avoids inserting an all-empty row into the dataframe.
    if not td:
        continue
    # A distinct comprehension variable avoids shadowing the loop variable `tr`.
    row = [cell.text for cell in td]
    l.append(row)

column_names = ['PostalCode', 'Borough', 'Neighborhood']
postalcode_table = pd.DataFrame(l, columns=column_names)

### Remove rows that have no proper values in Borough

In [7]:
# Keep only the rows whose Borough is present and is not the 'Not assigned'
# placeholder. A single non-mutating filter replaces the chain of in-place
# drops, so the result is derived in one expression from the input frame.
borough_present = postalcode_table['Borough'].notna()
borough_assigned = postalcode_table['Borough'] != 'Not assigned'

# Renumber the surviving rows from 0 so the index has no gaps
postalcode_table = postalcode_table[borough_present & borough_assigned].reset_index(drop=True)
postalcode_table

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n
5,M6A,North York,Lawrence Manor\n
6,M7A,Queen's Park,Not assigned\n
7,M9A,Etobicoke,Islington Avenue\n
8,M1B,Scarborough,Rouge\n
9,M1B,Scarborough,Malvern\n


### Clean up Neighborhood values and modify 'Not assigned' Neighborhood with Borough values

In [8]:
# Remove the '\n' newline characters from the Neighborhood values
neighborhood = postalcode_table['Neighborhood'].str.replace('\n','')

# Where Neighborhood is 'Not assigned', use the Borough value of the same row instead
unassigned = neighborhood == 'Not assigned'
postalcode_table['Neighborhood'] = neighborhood.mask(unassigned, postalcode_table['Borough'])
postalcode_table.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### Combine Neighborhood based on same PostalCode and Borough

In [9]:
# Combine the Neighborhood values that share a PostalCode and Borough into one
# comma-separated string. Neighborhood is the only column left to aggregate and
# it holds strings, so it is named explicitly and joined directly; the former
# float64 branch could never be taken.
postalcode_table = (
    postalcode_table
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg({'Neighborhood': ','.join})
)
postalcode_table

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [10]:
# (rows, columns) of postalcode_table
postalcode_table.shape

(103, 3)

## Read the postal code coordinates CSV from its URL with pandas and assign it to `df`

In [11]:
# Load the CSV served at this URL into a dataframe and preview its first rows
# (the preview shows 'Postal Code', 'Latitude' and 'Longitude' columns).
csv_path = "http://cocl.us/Geospatial_data"
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# (rows, columns) of the coordinates dataframe
df.shape

(103, 3)

## Join both dataframes

In [18]:
# Match rows on the postal code itself rather than on row position.
# DataFrame.join aligns on the index, which pairs the right coordinates with
# the right postal code only while both frames happen to be sorted identically.
# how='left' keeps every row of postalcode_table; both key columns are retained.
postalcd_Lat_Lng_table = postalcode_table.merge(
    df,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)
postalcd_Lat_Lng_table.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


## Remove the extra 'Postal Code' column

In [20]:
# Discard the 'Postal Code' column from the combined table, modifying it in place
postalcd_Lat_Lng_table.drop(columns=['Postal Code'], inplace=True)
postalcd_Lat_Lng_table

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [21]:
# (rows, columns) of the combined table
postalcd_Lat_Lng_table.shape

(103, 5)