In [38]:
# Importing libraries 

import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print("Libraries are imported!! :) ")

Libraries are imported!! :) 


In [8]:
# Making the raw table

source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(source, 'lxml')

table = soup.find("table")
table_rows = table.tbody.find_all("tr")

res = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [tr.text for tr in td]
    
    # Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
    if row != [] and row[1] != "Not assigned":
        # If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.
        if "Not assigned" in row[2]: 
            row[2] = row[1]
        res.append(row)

# Dataframe with 3 columns
df = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
5,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
6,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"
7,M8A\n,Not assigned\n,Not assigned\n
8,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village\n"
9,M1B\n,Scarborough\n,"Malvern, Rouge\n"


In [9]:
# Remove the unnecessary "\n" at the end of each string in the column "Neighbourhood" 

df["Neighborhood"] = df["Neighborhood"].str.replace("\n","")
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned
1,M2A\n,Not assigned\n,Not assigned
2,M3A\n,North York\n,Parkwoods
3,M4A\n,North York\n,Victoria Village
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"
5,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights"
6,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government"
7,M8A\n,Not assigned\n,Not assigned
8,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village"
9,M1B\n,Scarborough\n,"Malvern, Rouge"


In [11]:
# Grouping the neighbourhoods along with their postal codes

df = df.groupby(["PostalCode", "Borough"])["Neighborhood"].apply(", ".join).reset_index()
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned
1,M1B\n,Scarborough\n,"Malvern, Rouge"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
4,M1G\n,Scarborough\n,Woburn
5,M1H\n,Scarborough\n,Cedarbrae
6,M1J\n,Scarborough\n,Scarborough Village
7,M1K\n,Scarborough\n,"Kennedy Park, Ionview, East Birchmount Park"
8,M1L\n,Scarborough\n,"Golden Mile, Clairlea, Oakridge"
9,M1M\n,Scarborough\n,"Cliffside, Cliffcrest, Scarborough Village West"


In [12]:
# Print the size of the dataset

print("Shape: ", df.shape)

Shape:  (180, 3)


In [18]:
# Find the latitude and the longitude coordinates of the neighbourhoods

df_geo_coor = pd.read_csv("http://cocl.us/Geospatial_data")
df_geo_coor.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [19]:
# Complie 2 dataframes "df" and "df_geo_coor" into 1

df_toronto = pd.merge(df, df_geo_coor, how="left", left_on = "PostalCode", right_on = "Postal Code")
# remove the "Postal Code" column
df_toronto.drop("Postal Code", axis=1, inplace=True)
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1A\n,Not assigned\n,Not assigned,,
1,M1B\n,Scarborough\n,"Malvern, Rouge",,
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek",,
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill",,
4,M1G\n,Scarborough\n,Woburn,,
5,M1H\n,Scarborough\n,Cedarbrae,,
6,M1J\n,Scarborough\n,Scarborough Village,,
7,M1K\n,Scarborough\n,"Kennedy Park, Ionview, East Birchmount Park",,
8,M1L\n,Scarborough\n,"Golden Mile, Clairlea, Oakridge",,
9,M1M\n,Scarborough\n,"Cliffside, Cliffcrest, Scarborough Village West",,


In [21]:
# Explore and cluster the neighbourhoods in Toronto

# 1. step: Find the latitude and longitude values of Toronto

address = "Toronto, ON"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print("The geograpical coordinate of Toronto is {}, {}." .format(latitude, longitude))

The geograpical coordinate of Toronto is 43.6534817, -79.3839347.


In [37]:
# import folium 

!pip install folium
import pandas as pd
import folium

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 2.3 MB/s eta 0:00:011
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.11.0


In [39]:
# Get the map of Toronto by using the above latitude and longitude values

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)
map_toronto

In [53]:
# Map of a part of Toronto City

# "denc" = [D]owntown Toronto, [E]ast Toronto, [N]orth Toronto, [C]entral Toronto
df_toronto_denc = df_toronto[df_toronto["Borough"].str.contains("Toronto")].reset_index(drop=True)
df_toronto_denc.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E\n,East Toronto\n,The Beaches,,
1,M4K\n,East Toronto\n,"The Danforth West, Riverdale",,
2,M4L\n,East Toronto\n,"India Bazaar, The Beaches West",,
3,M4M\n,East Toronto\n,Studio District,,
4,M4N\n,Central Toronto\n,Lawrence Park,,
5,M4P\n,Central Toronto\n,Davisville North,,
6,M4R\n,Central Toronto\n,"North Toronto West, Lawrence Park",,
7,M4S\n,Central Toronto\n,Davisville,,
8,M4T\n,Central Toronto\n,"Moore Park, Summerhill East",,
9,M4V\n,Central Toronto\n,"Summerhill West, Rathnelly, South Hill, Forest...",,


In [None]:
# Working with FourSquare

CLIENT_ID = ''
CLIENT_SECRET = ''
VERSION = ''