In [5]:
import pandas as pd
import numpy as np 
from bs4 import BeautifulSoup as bs
import requests as req

**Scraping Data From Website**

In [7]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
web = req.get(url)

**Checking if data has been scraped from website**

In [8]:
web.status_code

200

**Extracting the content of the scraped data**

In [10]:
content = web.content
content

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of postal codes of Canada: M - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"ff1dbaab-0815-40f9-9a0d-4f9528406a80","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":969510799,"wgRevisionId":969510799,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in Onta

**Creating a Manageble form of the content using BeautifulSoup**

In [12]:
soup = bs(content,"lxml")

**Data Wrangling to get only the important features**

In [128]:
info = soup.find_all('td') #getting everything with td tag
everything= []
postal_code = [] 
borough = []
neighbourhood = []
for i in info:
  information = str(i)[4:-6] # removing td from the front and the start leaving us with exact data of interest
  everything.append(information) # storing all the important information in 'everything'
  

In [129]:
# from a little observation we see the data comes in batches of 3
for i in range(int(len(everything)/3)):
  if len(everything[3*i]) > 3: #as important information is stored after and with postal codes as observed
    break # if the length of the information is more than 3 we break, as we dont have any relevant info after finishing of postal codes in the table
  else:
    postal_code.append(everything[3*i])
    borough.append(everything[1+3*i])
    neighbourhood.append(everything[2+3*i])


    

**Converting into Pandas DataFrame**

In [134]:
df = pd.DataFrame({
    "Postal Code": postal_code,
    "Borough": borough,
    "Neighborhood": neighbourhood
},columns = ["Postal Code","Borough","Neighborhood"])

In [135]:
df # Looking at our newly formed dataframe

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


**Removing unassigned rows**

As we see there are a lot of unassigned rows, that are of no use, lets remove them.

In [139]:
Df = df[df.Neighborhood != 'Not assigned']
Df = Df.reset_index(drop = True)

*Lets check if any borough value is missing*

In [142]:
Df['Borough'].isnull().value_counts()

False    103
Name: Borough, dtype: int64

*We see there is no Borough value which is missing, so our Table is complete.*

**Checking the shape of our table**

In [145]:
Df.shape

(103, 3)

**Showing furst five rows of the data obtained**

In [150]:
Df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


**Using Geocoders to get the latitude and longitude**

In [144]:
!pip install geocoder
!pip install geopy

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |███▎                            | 10kB 17.9MB/s eta 0:00:01[K     |██████▋                         | 20kB 1.7MB/s eta 0:00:01[K     |██████████                      | 30kB 2.2MB/s eta 0:00:01[K     |█████████████▎                  | 40kB 2.5MB/s eta 0:00:01[K     |████████████████▋               | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████            | 61kB 2.2MB/s eta 0:00:01[K     |███████████████████████▎        | 71kB 2.5MB/s eta 0:00:01[K     |██████████████████████████▋     | 81kB 2.7MB/s eta 0:00:01[K     |██████████████████████████████  | 92kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 2.3MB/s 
Collecting ratelim
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad4

In [146]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

KeyboardInterrupt: ignored

*As geocoder was taking too much time and was unable to give proper values, so lets use the csv file given instead*

In [148]:
file = pd.read_csv("Geospatial_Coordinates.csv")
file.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [149]:
file.shape # checking the shape

(103, 3)

In [151]:
file['Postal Code'].isin(Df['Postal Code']).value_counts() # all the postal codes we got are in the file as well

True    103
Name: Postal Code, dtype: int64

**Final Dataset**

In [152]:
DF = pd.merge(Df, file[["Postal Code","Latitude","Longitude"]], on = 'Postal Code')

In [153]:
DF.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


**Downloading the Final Dataset for further use**

In [155]:
from google.colab import files
files.download('Postal_codes_final.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Checking the final shape**

In [156]:
DF.shape

(103, 5)