# Segmenting and Clustering Neighborhoods in Toronto

**To Do:**
 * Create link to notebook using: https://nbviewer.jupyter.org/

**Goals:**
* Scrape neighborhood data from Wikipedia page
* Retrieve lat/lng coordinates for each neighborhood
* Explore and cluster neighborhoods

#### Import Required Libraries

In [91]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

#### Scrape Wiki Page

In [92]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. The timeout keeps a stalled connection from hanging a full
# re-run of the notebook, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception here instead of letting an error page flow into
# the parser below.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'html.parser')
# print(soup.prettify()) # Uncomment to view scraped HTML

In [93]:
# Locate the first table carrying the "wikitable" class.
table = soup.find("table", class_="wikitable")
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on `table.tbody` below.
    raise ValueError("No table with class 'wikitable' was found in the scraped page")

# html.parser does not insert a <tbody> that is absent from the markup, so fall
# back to the table element itself when there is none.
table_body = table.tbody if table.tbody is not None else table

# find_all is the current name of the legacy findAll alias.
table_rows = table_body.find_all("tr")

#### Process Table HTML

In [94]:
cols = ["PostalCode", "Borough", "Neighborhood"]
data = []

for row in table_rows:
    cells = row.find_all("td")
    # Skip rows that cannot supply all three fields: rows with no <td> cells
    # (as before) and, additionally, rows with only one or two cells, which
    # previously raised IndexError.
    if len(cells) < len(cols):
        continue

    postcode, borough, neighborhood = (cell.text.strip() for cell in cells[:3])

    # Rows whose borough is "Not assigned" are dropped entirely.
    if borough == "Not assigned":
        continue
    # A row with a borough but no neighborhood reuses the borough name.
    if neighborhood == "Not assigned":
        neighborhood = borough

    data.append([postcode, borough, neighborhood])

# Passing the column names to the constructor also works when `data` is empty;
# assigning `df.columns` afterwards raises a length-mismatch error in that case.
df = pd.DataFrame(data, columns=cols)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Group by PostalCode

In [95]:
# Collapse the one-row-per-neighborhood frame into one row per
# (PostalCode, Borough) pair, joining the neighborhood names with ", ".
df_grouped = (
    df.groupby(["PostalCode", "Borough"])["Neighborhood"]
    .agg(", ".join)
)

# Alternative that also drops repeated names within a group:
# df_grouped = df.groupby(['PostalCode','Borough']).Neighborhood.unique().apply(lambda x: ', '.join(x)).reset_index()

# Turn the (PostalCode, Borough) index levels back into ordinary columns.
toronto = df_grouped.reset_index()
toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Display Number of Rows

In [96]:
# Report how many grouped rows the frame holds.
print("The Toronto dataframe has {} rows.".format(len(toronto)))

The Toronto dataframe has 103 rows.


In [97]:
# Spot-check the grouping: show the row(s) for postal code M5A.
toronto[toronto["PostalCode"] == "M5A"]

Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Harbourfront, Regent Park"


#### Get Coordinates of Postal Codes
(The coordinates are imported from a CSV file because the geocoder lookup did not work for me.)

In [98]:
## Add placeholder Latitude/Longitude columns (all NaN) to be filled in later
toronto["Latitude"] = np.nan
toronto["Longitude"] = np.nan

**Attempted to use Geocoder:**

In [99]:
import geocoder

In [100]:
def get_coords(postal_code, max_attempts=5, retry_delay=1.0):
    """Look up the latitude/longitude of a Toronto postal code with geocoder.

    The query sent to ``geocoder.google`` is ``"<postal_code>, Toronto, Ontario"``.
    A lookup that yields no coordinates is retried, but only up to
    ``max_attempts`` times, so a provider that never answers can no longer
    hang the notebook in an endless loop.

    Args:
        postal_code: Postal code to look up, e.g. ``"M5A"``.
        max_attempts: Maximum number of lookups before giving up (at least 1).
        retry_delay: Seconds to wait between two failed attempts; 0 disables
            the pause.

    Returns:
        A ``(lat, lng)`` tuple taken from the first lookup that has a result.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        RuntimeError: If none of the attempts returned coordinates.
    """
    import time  # local import: only needed for the pause between retries

    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    query = "{}, Toronto, Ontario".format(postal_code)

    for attempt in range(1, max_attempts + 1):
        latlng = geocoder.google(query).latlng
        # Treat both None and an empty sequence as "no result"; indexing an
        # empty result would otherwise raise IndexError.
        if latlng:
            lat, lng = latlng[0], latlng[1]
            return lat, lng
        if attempt < max_attempts and retry_delay > 0:
            time.sleep(retry_delay)

    raise RuntimeError(
        "No coordinates returned for postal code {!r} after {} attempt(s)".format(
            postal_code, max_attempts
        )
    )

In [101]:
## Apply get_coords to each postal code (left disabled: the geocoder lookup did not work)
# toronto["Latitude"], toronto["Longitude"] = zip(*toronto["PostalCode"].apply(get_coords))

**Pulled Lat/Lngs from CSV**

In [102]:
# Read the coordinate table and use its "Postal Code" column as the index.
coords = pd.read_csv("assets/Geospatial_Coordinates.csv", index_col="Postal Code")
coords.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [103]:
# Kept for backward compatibility: the name was defined by this cell before and
# may be referenced further down the notebook.
unique_postal_codes = toronto["PostalCode"].unique()

# Fetch the coordinates for every row's postal code in a single label lookup
# instead of a Python loop of scalar .loc writes. As with the per-code lookup it
# replaces, a postal code missing from `coords` raises KeyError (list lookups
# with missing labels raise in pandas >= 1.0).
matched_coords = coords.loc[toronto["PostalCode"].tolist(), ["Latitude", "Longitude"]]

# The lookup result is in the same row order as `toronto`, so assign by
# position; to_numpy() drops the postal-code index to avoid label alignment.
toronto["Latitude"] = matched_coords["Latitude"].to_numpy()
toronto["Longitude"] = matched_coords["Longitude"].to_numpy()

In [111]:
# Preview the first rows, now including the Latitude/Longitude columns.
toronto.head(n=12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


#### Get the Lat/Lng of Toronto

In [115]:
from geopy.geocoders import Nominatim
import folium

In [116]:
address = "Toronto, Ontario"

geolocator = Nominatim(user_agent="ca_user")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; stop here with a
    # clear message instead of an AttributeError on `.latitude`.
    raise ValueError("Nominatim returned no result for address {!r}".format(address))

latitude, longitude = location.latitude, location.longitude

print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.653963, -79.387207.


#### Create Map Plotting Helper Function

In [140]:
def plot_locations(center_lat, center_lng, locations_df, zoom_start=11, center_label="Toronto"):
    """Build a folium map with one marker for the centre and one per location.

    Args:
        center_lat: Latitude the map is centred on.
        center_lng: Longitude the map is centred on.
        locations_df: Table with "Latitude", "Longitude" and "Neighborhood"
            columns; one marker is drawn per row.
        zoom_start: Initial zoom level passed to ``folium.Map``.
        center_label: Tooltip text of the centre marker.

    Returns:
        The ``folium.Map`` holding the centre marker (red, radius 5) and the
        location markers (blue outline, red fill, radius 3).
    """
    location_map = folium.Map(location=[center_lat, center_lng], zoom_start=zoom_start)

    # Centre marker.
    folium.CircleMarker(
        [center_lat, center_lng],
        radius=5,
        color="red",
        fill=True,
        fill_color="red",
        fill_opacity=0.6,
        tooltip=center_label,
    ).add_to(location_map)

    rows = zip(
        locations_df["Latitude"],
        locations_df["Longitude"],
        locations_df["Neighborhood"],
    )
    for lat, lng, label in rows:
        folium.CircleMarker(
            [lat, lng],
            radius=3,
            color="blue",
            fill=True,
            fill_color="red",
            fill_opacity=0.6,
            # The labels come from scraped text: parse_html=True makes folium
            # render them as escaped text rather than as raw HTML.
            popup=folium.Popup(str(label), parse_html=True),
        ).add_to(location_map)

    return location_map

#### Map All Neighborhoods

In [145]:
# Draw every postal-code area in the frame around the Toronto centre point.
plot_locations(center_lat=latitude, center_lng=longitude, locations_df=toronto)

#### For Simplicity & Speed, Let's Look at a Subset of the Boroughs

In [148]:
# Keep only the rows whose borough name contains "Toronto".
in_toronto_borough = toronto["Borough"].str.contains("Toronto")
segment = toronto[in_toronto_borough]
segment.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [150]:
# Draw only the subset of boroughs selected above.
plot_locations(center_lat=latitude, center_lng=longitude, locations_df=segment)

#### Connect to Foursquare

In [151]:
# Import the Foursquare developer credentials by executing credentials.py with
# the IPython %run magic, so they are not hard-coded in the notebook itself.
# NOTE(review): presumably credentials.py defines the client id/secret used by
# the cells that follow — confirm, and keep that file out of version control.
%run "credentials.py"