# COURSERA Applied Data Science Capstone

## Week 3: Segmenting and Clustering Neighborhoods in Toronto

### Part 2

Import all the required libraries

In [1]:
# One-time environment setup. Uncomment and run once if the packages are missing.
# Note: geocoder is installed here but is not imported by this cell.
#!conda install -c conda-forge beautifulsoup4 --yes # Only needed one time
#!conda install -c conda-forge geocoder --yes # Only needed one time

# pandas holds the tabular data, requests downloads the Wikipedia page,
# and BeautifulSoup parses the downloaded HTML.
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Confirmation that every import above succeeded.
print('Libraries imported.')

Libraries imported.


Consolidated code from Part 1

In [2]:
#Scrape the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
#to obtain the data in its table of postal codes, then parse it using BeautifulSoup.
wikiPage = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#The timeout keeps the cell from hanging indefinitely on a stalled connection.
#raise_for_status() stops here on a 4xx/5xx reply, so an error page is never
#parsed as if it were the article.
wikiResponse = requests.get(wikiPage, timeout=30)
wikiResponse.raise_for_status()
rawWikiPage = wikiResponse.text
parseWiki = BeautifulSoup(rawWikiPage, 'html.parser')

print('Wiki page scraped and parsed')

#Turn the first <tbody> of the parsed page into a list of rows, one list of
#stripped cell texts per <tr>, and load it into a Pandas dataframe.
#The dataframe has three columns: PostalCode, Borough, and Neighborhood.
#A <tr> with no <td> cells (presumably the header row) contributes an empty list.
postalCodesTable = [
    [cell.get_text().strip() for cell in tableRow.find_all('td')]
    for tableRow in parseWiki.tbody.find_all('tr')
]
rawWikiDF = pd.DataFrame(
    postalCodesTable,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

#Keep only rows with an assigned borough: first drop rows containing missing
#values, then drop rows whose Borough is "Not assigned", and renumber the index.
workingWikiDF = (
    rawWikiDF
    .dropna()
    .loc[lambda frame: frame.Borough != "Not assigned"]
    .reset_index(drop=True)
)


#Collapse rows that share a postal code and borough into one row whose
#remaining column holds the values joined with ", ".
workingWikiDF_groupNeighborhoods = (
    workingWikiDF
    .groupby(["PostalCode", "Borough"], as_index=False)
    .agg(", ".join)
)

#Checking the value of postal code M7A for next step
#print(workingWikiDF_groupNeighborhoods[workingWikiDF_groupNeighborhoods['PostalCode']=='M7A'])

#Assign the borough name as the neighborhood for all neighborhoods with "Not assigned" as neighborhood name.
#This uses a boolean mask with .loc instead of writing to the rows yielded by iterrows():
#pandas does not guarantee that such writes reach the dataframe (the yielded Series
#may be a copy), so that approach can silently leave "Not assigned" in place.
neighborhoodNotAssigned = workingWikiDF_groupNeighborhoods["Neighborhood"] == "Not assigned"
workingWikiDF_groupNeighborhoods.loc[neighborhoodNotAssigned, "Neighborhood"] = (
    workingWikiDF_groupNeighborhoods.loc[neighborhoodNotAssigned, "Borough"]
)
#wikiDF is a second name for the same (now updated) dataframe, not a copy.
wikiDF = workingWikiDF_groupNeighborhoods

#Checking the value of postal code M7A to ensure last step worked
#print(wikiDF[wikiDF['PostalCode']=='M7A'])

# Size of the Data Frame
print('The size of the wikiDF data frame is:')
wikiDF.shape

Wiki page scraped and parsed
The size of the wikiDF data frame is:


(103, 3)

The Geocoder package gave inconsistent results, so I chose instead to add the latitude and longitude to the dataframe using the CSV file provided at  
http://cocl.us/Geospatial_data

Load the CSV (read here from a local copy named `Geospatial_Coordinates.csv`) into a dataframe, and rename the 'Postal Code' column to 'PostalCode' to match wikiDF.

In [3]:
# Read the coordinates file and rename the 'Postal Code' column to 'PostalCode'
# in the same expression, so the column name matches the one used in wikiDF.
geoCoord = pd.read_csv("Geospatial_Coordinates.csv").rename(
    columns={'Postal Code': 'PostalCode'}
)
# Show the first 5 rows of the loaded data
geoCoord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Join dataframes wikiDF and geoCoord to add the Lat and Long to the data set.

In [4]:
# Left join on PostalCode: every wikiDF row is kept and the geoCoord columns
# are attached wherever the postal code matches.
geoWikiDF = wikiDF.merge(geoCoord, how='left', on='PostalCode')
geoWikiDF

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


Export the new dataframe as csv to check the data

In [5]:
# Write the merged dataframe to geoWiki.csv, leaving out the index column.
geoWikiDF.to_csv(path_or_buf='geoWiki.csv', index=False)