# Capstone Week 3
Install and import the pgeocode package to retrieve the coordinates of Canadian postcodes.

Install libxml2, libxslt and lxml (and import lxml) to read HTML pages.

# Screen Scrape Postcodes into a pandas DataFrame


In [111]:
%%capture
! pip install pgeocode

In [112]:
# read url directly into a pandas dataframe, get Postal Code, Borough and Neighborhood columns directly using loc
import pandas as pd

html_string = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

!conda install libxml2 --yes
!conda install libxslt --yes
!conda install lxml --yes

import lxml

raw_html = pd.read_html(html_string)
postcodes = raw_html[0]
postcodes = postcodes.loc[:,['Postal Code','Borough','Neighborhood']]
#Filter out unassigned postcodes and ensure there are no NaNs
postcodes_ass = postcodes[postcodes.Borough != 'Not assigned']
postcode_ass = postcodes_ass.dropna()
postcodes_ass.head()

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - libxml2


The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    conda-forge::ca-certificates-2020.4.5~ --> pkgs/main::ca-certificates-2020.1.1-0
  certifi            conda-forge::certifi-2020.4.5.1-py36h~ --> pkgs/main::certifi-2020.4.5.1-py36_0
  openssl            conda-forge::openssl-1.1.1g-h516909a_0 --> pkgs/main::openssl-1.1.1g-h7b6447c_0


Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [113]:
# (rows, columns) of the postcode table after removing 'Not assigned' boroughs
postcodes_ass.shape

(103, 3)

# Retrieve postcode coordinates

Requests to the Google Geocoder were denied, so pgeocode is used instead. pgeocode is a straightforward method of retrieving the coordinates of all the postcodes.

After some data wrangling, merge the postcodes with the retrieved coordinates.

In [114]:
# The pgeocode geocoder is used here because the Google Geocoder declined the query requests.
import pgeocode

# Geocoder for Canadian ('ca') postal codes
nomi = pgeocode.Nominatim('ca')

# Pass the postal codes as a plain list and look them all up in one call
coords = nomi.query_postal_code(postcodes_ass['Postal Code'].tolist())

# Keep only the code and its coordinates, renamed to match the postcode table's column style
coords = coords[['postal_code', 'latitude', 'longitude']].rename(
    columns={
        'postal_code': 'Postal Code',
        'latitude': 'Latitude',
        'longitude': 'Longitude',
    }
)

# Merge the postcode and coordinate dataframes on the postal code,
# then discard any row that is missing a value
pc_concat = pd.merge(postcodes_ass, coords, on='Postal Code').dropna()
pc_concat.tail(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
93,M8W,Etobicoke,"Alderwood, Long Branch",43.6021,-79.5402
94,M9W,Etobicoke,"Northwest, West Humber - Clairville",43.7144,-79.5909
95,M1X,Scarborough,Upper Rouge,43.834,-79.2069
96,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.6684,-79.3689
97,M5X,Downtown Toronto,"First Canadian Place, Underground city",43.6492,-79.3823
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6518,-79.5076
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.383
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.7804,-79.2505
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6325,-79.4939
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",43.6256,-79.5231


# Toronto Map and postcodes clustering

Install folium, geopy and geopandas

In [115]:
%%capture
! pip install geopy
! pip install geopandas --verbose
!conda install -c conda-forge folium=0.11.0 --yes

Import Nominatim, geopandas and geopy.
Google's geolocation service was dropped because it was denying requests;
Nominatim is used instead.

In [116]:
# Let's start putting the dots on the map:
# import the mapping packages, look up Toronto's coordinates and create the marker group.

import folium
import geopandas
import geopy

# Use geopy's Nominatim geocoder to retrieve Toronto's coordinates

from geopy.geocoders import Nominatim
locator = Nominatim(user_agent='myGeocoder')
location = locator.geocode('Toronto, Ontario')

# geocode() returns None when the query yields no result. Stop here with a clear
# message instead of failing later when location.latitude / location.longitude are read.
if location is None:
    raise RuntimeError("Could not geocode 'Toronto, Ontario'; no result was returned")

# Feature group that collects the postcode markers
markers = folium.map.FeatureGroup()
pc_concat.shape

(102, 5)

In [117]:
# Sanity check: drop any incomplete rows that might remain, then count every
# missing value still left in the dataframe.
pc_concat.dropna(inplace=True)
pc_concat.isna().values.sum()

0

In [118]:
# Base map centred on the geocoded Toronto coordinates
toron_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# Walk the postcode rows and add one circle marker per postcode to the feature group;
# the postal code itself is the popup text.
postcode_rows = pc_concat[['Postal Code', 'Latitude', 'Longitude']]
for lab, lat, lng in postcode_rows.itertuples(index=False, name=None):
    markers.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=5,  # size of the circle marker
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
            popup=lab,
        )
    )

# Attach the markers to the map and display it
markers.add_to(toron_map)
toron_map