#Importing Libraries

In [1]:
#importing Libraries
import requests
import lxml.html as lh
import bs4 as bs
import urllib.request
import numpy as np 
import pandas as pd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values
import folium # map rendering library

#Extracting The Toronto Data

In [2]:


#Getting the data from url
url = "http://zims-en.kiwix.campusafrica.gos.orange.com/wikipedia_en_all_nopic/A/List_of_postal_codes_of_Canada:_M"
res = requests.get(url)
soup = bs.BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
data = pd.read_json(df[0].to_json(orient='records'))
#First 30 records
data.head(30)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
8,M8A,Not assigned,Not assigned
9,M9A,Downtown Toronto,Queen's Park


#Grouping according to "Borough"

In [3]:
#Choosing only data where field Borough doesn't have not assigned value
raw_data_selected = data[data['Borough'] != 'Not assigned']
#Grouping Data
raw_data_selected = raw_data_selected.groupby(['Borough', 'Postcode'], as_index=False).agg(','.join)
raw_data_selected.head(30)

Unnamed: 0,Borough,Postcode,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,North Toronto West
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park,Summerhill East"
5,Central Toronto,M4V,"Deer Park,Forest Hill SE,Rathnelly,South Hill,..."
6,Central Toronto,M5N,Roselawn
7,Central Toronto,M5P,"Forest Hill North,Forest Hill West"
8,Central Toronto,M5R,"The Annex,North Midtown,Yorkville"
9,Downtown Toronto,M4W,Rosedale


In [26]:
raw_data_selected['Neighbourhood'] = np.where(raw_data_selected['Neighbourhood'] == 'Not assigned', raw_data_selected['Borough'], raw_data_selected['Neighbourhood'])

In [5]:
raw_data_selected.head(30)

Unnamed: 0,Borough,Postcode,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,North Toronto West
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park,Summerhill East"
5,Central Toronto,M4V,"Deer Park,Forest Hill SE,Rathnelly,South Hill,..."
6,Central Toronto,M5N,Roselawn
7,Central Toronto,M5P,"Forest Hill North,Forest Hill West"
8,Central Toronto,M5R,"The Annex,North Midtown,Yorkville"
9,Downtown Toronto,M4W,Rosedale


#Extracting Geo-spacial data

In [6]:
geospatial_url = "https://cocl.us/Geospatial_data"
geospatial_data = pd.read_csv(geospatial_url)
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#Merging Geo-spacial data with existing data

In [7]:
geospatial_data.columns = ['Postcode', 'Latitude', 'Longitude']
merged_data = pd.merge(raw_data_selected, geospatial_data, on='Postcode')
merged_data.head()

Unnamed: 0,Borough,Postcode,Neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park,Summerhill East",43.689574,-79.38316


In [8]:
merged_data['Coordinates'] = list(zip(merged_data['Latitude'], merged_data['Longitude']))
merged_data.head()

Unnamed: 0,Borough,Postcode,Neighbourhood,Latitude,Longitude,Coordinates
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879,"(43.7280205, -79.3887901)"
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197,"(43.7127511, -79.3901975)"
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678,"(43.7153834, -79.40567840000001)"
3,Central Toronto,M4S,Davisville,43.704324,-79.38879,"(43.7043244, -79.3887901)"
4,Central Toronto,M4T,"Moore Park,Summerhill East",43.689574,-79.38316,"(43.6895743, -79.38315990000001)"


In [9]:
merged_data['Coordinates'] = merged_data['Coordinates'].apply(Point)

In [10]:
gdf = gpd.GeoDataFrame(merged_data, geometry='Coordinates')
gdf.head()

Unnamed: 0,Borough,Postcode,Neighbourhood,Latitude,Longitude,Coordinates
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879,POINT (43.72802 -79.38879)
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197,POINT (43.71275 -79.39020)
2,Central Toronto,M4R,North Toronto West,43.715383,-79.405678,POINT (43.71538 -79.40568)
3,Central Toronto,M4S,Davisville,43.704324,-79.38879,POINT (43.70432 -79.38879)
4,Central Toronto,M4T,"Moore Park,Summerhill East",43.689574,-79.38316,POINT (43.68957 -79.38316)


#Showing the map of toronto with clusters

In [11]:
address = 'Toronto, TOR'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.7370584, -79.2442535.


In [24]:
# create map of Manhattan using latitude and longitude values
TOR_map = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# add markers to map
for lat, lng, label in zip(merged_data['Latitude'], merged_data['Longitude'], merged_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.2,
        parse_html=False).add_to(TOR_map)  
    
TOR_map