## Step 0: Import libraries

In [1]:
# Importing the essential libraries
import requests    # to use get method of requests library to send a GET request
import pandas as pd     # to organize the data into dataframes
from bs4 import BeautifulSoup    # to parse html web pages

## Step 1: Send a GET request to the Wikipedia page

In [2]:
# Fetch the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps this cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error (4xx/5xx) instead of
# letting the parsing step run against an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

## Step 2: Parse the returned HTML page

In [3]:
# Parse the downloaded page with BeautifulSoup and Python's built-in HTML parser
complete_soup = BeautifulSoup(response.content, 'html.parser')

# Locate the postal code table on the page
table = complete_soup.find('table', class_ = 'wikitable sortable')
if table is None:
    # Fail with a clear message rather than an AttributeError on None further down
    raise ValueError("Could not find a 'wikitable sortable' table in the downloaded page")

# The 'th' cells hold the column names, i.e. Postal Code, Borough, Neighbourhood
table_headings = table.find_all('th')

# Column names with surrounding whitespace removed
indexes = [heading.text.strip() for heading in table_headings]

# Every 'tr' tag is one row of the table
table_rows = table.find_all('tr')

# Collect the text of each data row. Rows without any 'td' cell (such as the
# heading row, which only has 'th' cells) are skipped so that they do not end
# up as empty rows in the dataframe.
table_data_array = []
for tr in table_rows:
    td = tr.find_all('td')
    if not td:
        continue
    table_data_array.append([t.text.strip() for t in td])

# Build the dataframe from the collected rows, using the headings as column names
df = pd.DataFrame(table_data_array, columns=indexes)

## Step 3: Cleaning the dataframe

In [4]:
# Keep only the rows whose borough is not 'Not assigned', then discard every
# row that still contains a missing (NA) value
has_borough = df['Borough'].ne('Not assigned')
filtered_df = df.loc[has_borough].dropna()

In [5]:
# The filtering above leaves gaps in the row labels, so renumber the rows from 0.
# drop=True discards the old index instead of keeping it as an extra column.
filtered_df = filtered_df.reset_index(drop=True)

In [6]:
# If a row has a borough but a 'Not assigned' neighbourhood, the neighbourhood
# becomes the same as the borough.
# Series.mask substitutes the borough row by row wherever the condition holds.
# Series.replace is avoided here: a scalar `to_replace` combined with a Series
# `value` is not a supported combination, and recent pandas versions raise a
# ValueError for it.
neighbourhood_missing = filtered_df['Neighbourhood'] == 'Not assigned'
filtered_df['Neighbourhood'] = filtered_df['Neighbourhood'].mask(neighbourhood_missing, filtered_df['Borough'])

## Step 4: Checking the shape of dataframe

In [7]:
# Display (number of rows, number of columns) of the cleaned dataframe;
# the recorded run of this notebook shows (103, 3)
filtered_df.shape

(103, 3)

In [8]:
# Install geopy and folium
!pip install geopy
!pip install folium==0.5.0

# Remove pandas' display limits so that every column and every row of a
# dataframe is shown when it is rendered
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# json_normalize transforms (nested) JSON into a flat pandas dataframe.
# It has lived in the top-level pandas namespace since pandas 1.0, and the old
# pandas.io.json import path is deprecated and not importable in recent releases,
# so try the current location first and fall back for older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [9]:
## Uncomment the lines below to look up the location coordinates with the Nominatim geocoder. It only returns coordinates for 13 out of the 103 locations.


# def get_coordinates(postal_code):
#     address = f'{postal_code}, Toronto, Ontario'
#     latitude = 'NA'
#     longitude = 'NA'
#     geolocator = Nominatim(user_agent="toronto_explorer")
#     location = geolocator.geocode(address)
#     if(location != None):
#         latitude = location.latitude
#         longitude = location.longitude

#     return [latitude, longitude]

# k = filtered_df['Postal Code'].head().apply(lambda x: get_coordinates(x))
# print(k)

In [10]:
# Load the coordinates of the postal codes from a CSV file at the given URL.
# The resulting dataframe is joined to filtered_df on 'Postal Code' in the next
# cell, where it contributes the 'Latitude' and 'Longitude' columns.
coordinates_df = pd.read_csv('https://cocl.us/Geospatial_data')

In [11]:
# Attach the coordinates to the cleaned postal code table with an inner join
# on the shared 'Postal Code' column
merged_df = filtered_df.merge(coordinates_df, how='inner', on='Postal Code')
# Preview the first rows of the merged dataframe
merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [12]:
# List each distinct "Borough" together with the number of rows (postal codes)
# that belong to it, most frequent borough first
merged_df['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

In [13]:
# Look up the coordinates of Toronto so that the map can be centred on the city
def get_coordinates_of_toronto():
    """Geocode the address 'Toronto, Ontario' with Nominatim.

    Returns:
        list: [latitude, longitude] taken from the geocoding result, or
        ['NA', 'NA'] when the geocoder returns no result for the address.
    """
    geocoder = Nominatim(user_agent="toronto_explorer")
    match = geocoder.geocode('Toronto, Ontario')
    # No result means the address was not found on OpenStreetMap:
    # fall back to the 'NA' placeholders
    if match is None:
        return ['NA', 'NA']

    return [match.latitude, match.longitude]

In [14]:
# Keep the [latitude, longitude] pair of Toronto and also unpack it into
# two separate variables
toronto_coordinates = get_coordinates_of_toronto()
latitude, longitude = toronto_coordinates

In [15]:
# Keep only the rows whose borough name contains "toronto" (case-insensitive),
# and renumber the remaining rows from 0 without keeping the old index
borough_has_toronto = merged_df['Borough'].str.lower().str.contains('toronto', regex=False)
filtered_merged_df = merged_df[borough_has_toronto].reset_index(drop=True)
# Preview the first rows of the filtered dataframe
filtered_merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
