In [1]:
#STEP 1- Scraping WikiPedia and Creating Data Frame

# install and import libraries
#neighbourhood maps from open.toronto.ca


# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install bs4
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install requests
!{sys.executable} -m pip install folium
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install lxml
!{sys.executable} -m pip install html5lib 

!{sys.executable} -m pip install geopy
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install Matplotlib
!{sys.executable} -m pip install pgeocode

import pgeocode
#library to handle JSON files
import json

# Convert an address  into latitude and longitude values
from geopy.geocoders import Nominatim 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library


from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import folium

print('Libraries installed & imported.')

# Get the dataset metadata by passing package_id to the package_search endpoint
# For example, to retrieve the metadata for this dataset:

url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
params = { "id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
package = requests.get(url, params = params).json()


Libraries installed & imported.


In [2]:
# Getting the data, preprocessing, and cleaning the data

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url)
postal_code_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source.text, 'html')

#using soup object, iterate the wikitable to get the data from the HTML page and store it into a list
data = []
columns = []
table = soup.find("tbody")

#function for finding italics 
def has_italics_not_assigned(tag):
    return tag.has_attr('i') and not tag.has_attr('a')

def extract_postal(pcode):
    return pcode[0:3]

def extract_borough(borough):
    borough = borough[3:len(borough)]
    borough = borough.rpartition("(")[0]
    return borough

def extract_neighbourhood(nbhood):
    nbhood = nbhood.rpartition("(")[2]
    nbhood = nbhood.rpartition(")")[0]
    return nbhood

postals = []
boroughs = []
neighbourhoods =  []

for index, tr in enumerate(table.find_all('tr')):
    section = []
    for td in tr.find_all(['th','td']):
            td_string = td.text.strip()
            borough = td_string[3:len(td_string)]
            
            if (borough != "Not assigned"):
                section.append(extract_postal(td_string) + " " + borough)
                
                postals.append(extract_postal(td_string))
                boroughs.append(extract_borough(td_string))
                neighbourhoods.append(extract_neighbourhood(td_string))
            
            else:
                section.append("")
    #First row of data is the header
    """if (index == 0):
        columns = ['M1_', 'M2_', 'M3_', 'M4_', 'M5_', 'M6_', 'M7_', 'M8_', 'M9_']
    else:
        data.append(section)"""

#convert list into Pandas DataFrame
"""Toronto_cleaned_df = pd.DataFrame(data = data, columns = columns)
Toronto_df"""

data = {"Postal Code": postals, "Borough": boroughs, "Neighbourhood": neighbourhoods}
Toronto_cleaned_df = pd.DataFrame(data)
Toronto_cleaned_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


In [3]:
# STEP 2: Getting the coordinates of each neighbourhood

# getting the coordinates in a new data frame
pcode = Toronto_cleaned_df['Postal Code'].tolist()
nomi = pgeocode.Nominatim('ca')
New_df = nomi.query_postal_code(pcode)
New_df

# adding coordinate from New_df to the Toronto_cleaned_df 
Toronto_cleaned_df['Latitude'] = New_df['latitude']
Toronto_cleaned_df['Longitude'] = New_df['longitude']


#Correcting the coordinates for rows 76. 
Toronto_cleaned_df.loc[76,['Latitude', 'Longitude']] = 43.6364,-79.6157

Toronto_cleaned_df




Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.3300
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.6555,-79.3626
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889
...,...,...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,43.6518,-79.5076
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,43.7804,-79.2505
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,43.6325,-79.4939


In [4]:
#Step 3 - Clustering Neighboorhood

# Get coordinates of Toronto 
address = 'Toronto, ON, CA'

geolocator = Nominatim(user_agent="TR_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))



#making a list of boroughs in Toronto to be used in visualization 
boroughs = list(set(Toronto_cleaned_df['Borough']))

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)
colors= ['#0000FF','#FF4040', '#66CD00', '#00CDCD', '#CAFF70', '#9932CC', '#EEC900', '#808080', '#FF69B4', '#FFFFE0', '#FF0000', '#FFA500', '#BDB76B', '#006400', '#00FF00' , '#AFEEEE' , '#4169E1' ]

# add markers to map
for lat, lng, borough, neighbourhood in zip(Toronto_cleaned_df['Latitude'], Toronto_cleaned_df['Longitude'], Toronto_cleaned_df['Borough'], Toronto_cleaned_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    for i in range(0, len(boroughs)) :
        if boroughs[i] == borough : 
            c = colors[i]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color= c,
        fill=True,
        fill_color= c,
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [5]:
#North York Clustering
nyork_data = Toronto_cleaned_df[Toronto_cleaned_df['Borough'] == 'North York'].reset_index(drop=True)
nyork_data.head()

# Getting coordinates of North York
address = 'North York, ON, CA'

geolocator = Nominatim(user_agent="nyork_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

# create map of North York using latitude and longitude values
map_nyork = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(nyork_data['Latitude'], nyork_data['Longitude'], nyork_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
        parse_html=False).add_to(map_nyork)  
    
map_nyork

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


In [7]:
#Create function for FSQ API calls
APIKEY = 'fsq3Sl1rMVbkkKn6h2Of0j//V/kM4JJOxpn5Y/SMMgsvoRQ='

def getNearbyVenues(names, latitudes, longitudes, radius, limit):
    headers = {
        'Accept': 'application/json',
        'Authorization': 'APIKEY'
    }

    URL = 'https://api.foursquare.com/v3/places/search?ll={}&radius={}&limit={}'

    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        url = URL.format(str(lat) + '%2C' + str(lng), radius, limit)
        print(url)
        #response = requests.get(url, headers=headers)
        response = requests.request("GET", url, headers=headers)
        #data = requests.get(url, headers=headers).json()
        data = json.loads(response.text)
        # do stuff here to process venues_list
        for record in data['results']:
            venues_list.append({
                'Neighborhood': name,
                'Neighborhood Latitude': lat,
                'Neighborhood Longitude': lng,
                'Venue': record['name'],
                'Venue Latitude': record['geocodes']['main']['latitude'],
                'Venue Longitude': record['geocodes']['main']['longitude'],
                'Venue Category': record['categories'][0]['name'],
            })

    # do stuff here to create your dataframe
    nearby_venues = pd.DataFrame(venues_list)

    return(nearby_venues)

print(data)
getNearbyVenues(nyork_data['Neighbourhood'], nyork_data['Latitude'], nyork_data['Longitude'], 500, 50)
#getNearbyVenues(Toronto_cleaned_df['Neighbourhood'], Toronto_cleaned_df['Latitude'], Toronto_cleaned_df['Longitude'], 500, 50)


{'Postal Code': ['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B', 'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C', 'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H', 'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J', 'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L', 'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M', 'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N', 'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R', 'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S', 'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V', 'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X', 'M4Y', 'M7Y', 'M8Y', 'M8Z'], 'Borough': ['North York', 'North York', 'Downtown Toronto', 'North York', "Queen's Park", 'Etobicoke', 'Scarborough', 'North York', 'East York', 'Downtown Toronto', 'North York', 'Etobicoke', 'Scarborough', 'North York(Don Mills)South', 'East York', 'Downtown T

JSONDecodeError: Expecting value: line 1 column 1 (char 0)