# Notebook for scraping Canadian postal codes and looking up their locations via a geocoding API

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### URL of the Wikipedia page listing Canadian postal codes (prefix M)

In [2]:
# Wikipedia page that is scraped for the postal-code table.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

### Retrieve the page at `wiki_url` and take its first table (which contains the postal-code data)

In [3]:
from io import StringIO

# Download the page once and parse the text we already hold. The original
# fetched the URL with requests, ignored the result, and then let
# pd.read_html download the same page a second time.
response = requests.get(wiki_url, timeout=30)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
postals = response.text

# read_html returns one DataFrame per <table>; the first table is the data that we want.
# The text is wrapped in StringIO so it is treated as file-like content, not as a path/URL.
postals_frame = pd.read_html(StringIO(postals), flavor='bs4')[0]

### Remove rows whose Borough is 'Not assigned'

In [4]:
# Keep only the rows whose 'Borough' value differs from the 'Not assigned' placeholder.
has_borough = postals_frame['Borough'].ne('Not assigned')
postals_frame = postals_frame[has_borough]

### Join neighbourhoods based on columns "Postal Code" and "Borough"

* separated by comma

In [5]:
# Collapse the rows that share a postal code and borough into a single row,
# concatenating their neighbourhood names with ', '.
postals_frame = postals_frame.pivot_table(
    index=['Postal Code', 'Borough'],
    values='Neighbourhood',
    aggfunc=lambda names: ', '.join(names),
)
# Turn the (Postal Code, Borough) index back into ordinary columns.
postals_frame = postals_frame.reset_index()

### If a cell has borough data but the column "Neighbourhood" is "Not assigned" then the neighbourhood will be the same as borough

In [6]:
# Where the neighbourhood is the 'Not assigned' placeholder, fall back to the
# borough of the *same row*. The previous apply/lambda returned the entire
# 'Borough' column for every matching cell instead of a single borough name.
# Series.mask substitutes element-wise, aligned on the index.
neighbourhood_missing = postals_frame['Neighbourhood'] == 'Not assigned'
postals_frame['Neighbourhood'] = postals_frame['Neighbourhood'].mask(
    neighbourhood_missing,
    postals_frame['Borough'],
)

### Shape of dataframe

In [7]:
# Show the (rows, columns) dimensions of the cleaned frame.
postals_frame.shape

(103, 3)

### The geocoder script below does not work reliably (it takes too long), so we read the coordinates from the .csv instead

import geocoder # import geocoder

# Upper bound on lookups per postal code, so that a geocoder which never
# answers cannot stall the notebook (the original retried forever).
MAX_GEOCODE_ATTEMPTS = 5


def geocode_postal_code(postal_code, max_attempts=MAX_GEOCODE_ATTEMPTS):
    """Look up the coordinates of one Toronto postal code.

    Args:
        postal_code: Postal code prefix to look up, e.g. ``'M1B'``.
        max_attempts: Maximum number of geocoder requests before giving up.

    Returns:
        The ``latlng`` value reported by the geocoder, or ``None`` when no
        attempt produced coordinates.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        coords = geocoder.google(query).latlng
        # Treat a missing or empty result as a failed attempt and retry.
        if coords:
            return coords
    return None


# Resolve every postal code independently. The original loop never reset
# lat_lng_coords and hit `break` after the first postal code, so only one
# code was ever geocoded.
postal_code_coords = {
    postal_code: geocode_postal_code(postal_code)
    for postal_code in postals_frame['Postal Code']
}

# Keep the names the original cell defined: the coordinates of the first
# postal code that could be resolved (None if none could).
lat_lng_coords = next(
    (coords for coords in postal_code_coords.values() if coords),
    None,
)

if lat_lng_coords is not None:
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]

### Reading from .csv (included in git)

In [8]:
# Load the coordinates from the CSV file named below.
coordinates_csv = 'Geospatial_Coordinates.csv'
latlong_frame = pd.read_csv(coordinates_csv)

In [9]:
# Preview the first rows of the coordinates table.
latlong_frame.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Joining data to get the Latitude and Longitude for postal codes

In [10]:
# Left join: every postal-code row is kept and gains the columns of the
# coordinates table wherever 'Postal Code' matches.
geo_data = postals_frame.merge(latlong_frame, how='left', on='Postal Code')

### No null values (for lat and long) after join

In [11]:
# Count the missing values in each column of the joined frame.
geo_data.isnull().sum()

Postal Code      0
Borough          0
Neighbourhood    0
Latitude         0
Longitude        0
dtype: int64

### Data of Postal Codes with Latitude and Longitude

In [12]:
# Preview the first rows of the joined postal-code / coordinates data.
geo_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [18]:
from geopy.geocoders import Nominatim
import folium

In [19]:
address = 'Toronto, CA'

# Geocode the city itself; its coordinates are used to centre the map.
geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop with a clear
# message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Map of Toronto with one marker per postal code from geo_data

In [22]:
# Base map centred on the latitude/longitude computed earlier.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row of geo_data; its popup shows the neighbourhood text.
marker_rows = zip(
    geo_data['Latitude'],
    geo_data['Longitude'],
    geo_data['Neighbourhood'],
)
for marker_lat, marker_lng, neighbourhood in marker_rows:
    popup = folium.Popup(neighbourhood, parse_html=True)
    marker = folium.CircleMarker([marker_lat, marker_lng], popup=popup, **marker_style)
    marker.add_to(map_toronto)

map_toronto