# Segmenting and Clustering Neighborhoods in Toronto

## 1. Extracting, cleaning and displaying data from Toronto neighborhoods via Wikipedia

In [1]:
# import required libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

Scrape data from Wikipedia website

In [2]:
# read the webpage from the wiki
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text

In [3]:
# process and convert html data
soup = BeautifulSoup(source, "html5lib")
# find table
table = soup.find("table").tbody

Create a Dataframe, ignoring cells with Borough that is **Not assigned**.

In [4]:
table_cleaned = []
for row in table.find_all("td"):
    cell = {}
    if row.span.text == "Not assigned":
        pass
    else:
        cell["PostalCode"] = row.p.text[:3]
        cell["Borough"] = row.span.text.split("(")[0]
        cell["Neighborhood"] = row.span.text.split("(")[1].strip(")").replace(" /", ",") # replace "(", ")", " /" with ", "
        table_cleaned.append(cell)

df = pd.DataFrame(table_cleaned)

In [5]:
# First 11 rows of cleaned dataframe
df.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills)North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Dimension of cleaned dataframe
df.shape

(103, 3)

## 2. Adding the latitude and longitude coordinates to the dataframe

In [7]:
csv_path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv"
df1 = pd.read_csv(csv_path)
df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the two dataframes, merge matching rows in the right dataframe to the left dataframe, thus keeping all rows present in the left dataframe.

In [8]:
df2 = pd.merge(left=df, right=df1, left_on="PostalCode", right_on="Postal Code")
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,M7A,43.662301,-79.389494


Drop Postal Code column.

In [9]:
df2 = df2.drop(columns = ["Postal Code"])

In [10]:
# First 11 rows of new dataframe
df2.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills)North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [11]:
# Dimension of new dataframe
df2.shape

(103, 5)