# Webscraping the table

### import all the necessary libraries

In [9]:
pip install geopandas


Note: you may need to restart the kernel to use updated packages.


In [10]:
!pip install geopy
from geopy.geocoders import Nominatim




In [11]:
import requests
import pandas as pd
import csv
import urllib.request
from bs4 import BeautifulSoup



### Define url and request

In [12]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()
print(response.status_code)

200


### parse html

In [15]:
# Parse the downloaded HTML, pick the first table carrying the "sortable"
# class, and collect all of its <tr> rows.
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find_all('table', class_="sortable")[0]
tr = table.find_all('tr')

### create CSV file

In [17]:
# Write every table row to postalcodes.csv. The `with` block guarantees the
# file is flushed and closed before it is read back in a later cell.
with open("postalcodes.csv", 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    for cell in tr:
        # For each row, the text of its <th> cells is written first,
        # followed by the text of its <td> cells (newlines removed).
        th = cell.find_all("th")
        th_data = [col.text.strip('\n') for col in th]
        td = cell.find_all("td")
        row = [i.text.replace('\n', '') for i in td]
        writer.writerow(th_data + row)



In [18]:
# Load the scraped table back from disk and display it.
df = pd.read_csv(filepath_or_buffer='postalcodes.csv')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Drop all rows whose borough is 'Not assigned'

In [19]:
# Index labels of every row whose Borough is the placeholder 'Not assigned'.
not_assigned_mask = df['Borough'] == 'Not assigned'
indexNames = df.loc[not_assigned_mask].index


In [20]:
# Remove the rows whose Borough is 'Not assigned'.
# Filtering by condition (rather than dropping previously captured index
# labels) keeps this cell safe to re-run: after the index has been reset,
# stale labels would point at valid rows and silently delete them.
# `.copy()` yields an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Reset the index and check the shape

In [21]:
# Renumber the rows from 0 after the drop; drop=True discards the old index
# labels instead of keeping them as a column, and inplace=True mutates df itself.
df.reset_index(drop=True, inplace=True)
# Display the re-indexed dataframe.
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [22]:
# Show the dataframe's dimensions as a (rows, columns) tuple.
df.shape

(103, 3)

### Get the coordinates of each postal code and add them to the dataframe

In [23]:
# Geocode every postal code and collect the coordinates in two parallel lists
# (one entry per row of df, in row order).
latitude = []
longitude = []
postal_code = df['Postal Code']

# initialize your variable to None
g = None
geolocator = Nominatim(user_agent="foursquare_agent")
# NOTE(review): the public Nominatim service is rate-limited; if many lookups
# come back empty, consider throttling the requests -- confirm against the
# service's usage policy.

for code in postal_code:
    query = '{}, Toronto, Ontario'.format(code)
    g = geolocator.geocode(query)

    if g is None:
        # Retry once. The result of the retry is what gets evaluated below;
        # previously it was printed but then thrown away.
        g = geolocator.geocode(query)

    if g is None:
        # Both attempts failed: record the placeholder for this postal code.
        print(code, g)
        latitude.append('Not available')
        longitude.append('Not available')
    else:
        print(code, g.latitude, g.longitude)
        latitude.append(g.latitude)
        longitude.append(g.longitude)
        

M3A 43.6534817 -79.3839347
M4A None
M5A None
M6A None
M7A 43.6534817 -79.3839347
M9A None
M1B 43.6534817 -79.3839347
M3B None
M4B None
M5B None
M6B None
M9B None
M1C 43.6534817 -79.3839347
M3C 43.7328216 -79.3469614
M4C None
M5C None
M6C None
M9C 43.64410993066079 -79.58890692194828
M1E None
M4E None
M5E 43.6421064 -79.3774455
M6E None
M1G 43.76571676956549 -79.22189842824983
M4G None
M5G None
M6G None
M1H None
M2H None
M3H None
M4H None
M5H 43.649900813008564 -79.3837183950501
M6H None
M1J None
M2J 43.7797719 -79.3661846
M3J None
M4J None
M5J 43.63925859999999 -79.38283994999999
M6J 43.6522219 -79.40753862886237
M1K None
M2K None
M3K None
M4K None
M5K None
M6K 43.63709690913502 -79.4349584391061
M1L None
M2L None
M3L None
M4L 43.6727601 -79.30405834999999
M5L None
M6L None
M9L None
M1M None
M2M 43.7859621 -79.4160307769213
M3M None
M4M None
M5M None
M6M None
M9M None
M1N None
M2N 43.6449033 -79.3818364
M3N None
M4N None
M5N None
M6N 43.6449033 -79.3818364
M9N None
M1P None
M2P None
M4

In [24]:
# Attach the geocoding results to the dataframe as two new columns.
for column_name, values in (('Latitude', latitude), ('Longitude', longitude)):
    df[column_name] = values

In [25]:
# Preview the first five rows, now including the coordinate columns.
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.6535,-79.3839
1,M4A,North York,Victoria Village,Not available,Not available
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",Not available,Not available
3,M6A,North York,"Lawrence Manor, Lawrence Heights",Not available,Not available
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6535,-79.3839
