# Using BeautifulSoup to pull the data from a Wikipedia HTML page

In [1]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd
# Column order of the postal-code table built from the scraped page.
column_names = ['Postal Code', 'Borough', 'Neighborhood']
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops on an HTTP error instead of letting
# an error page be parsed as if it were the article.
r = requests.get(URL, timeout=30)
r.raise_for_status()

# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(r.content, 'html5lib')

# Separating the required data from the whole page

In [2]:
# Locate the table whose class attribute is "wikitable sortable".
table = soup.find('table', attrs={'class': "wikitable sortable"})
if table is None:
    # find() returns None when nothing matches; stop here with a clear message
    # instead of failing later with an AttributeError on None.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

1. Build a dataframe with three columns: Postal Code, Borough, and Neighborhood
2. Ignore cells with a borough that is Not assigned
3. Combine neighborhoods that share the same postal code
4. For cells with a borough but a Not assigned neighborhood, use the borough as the neighborhood

In [3]:
import re
# Build a dataframe with three columns: Postal Code, Borough, and Neighborhood.
def _cell_text(cell):
    """Return the stripped text of the first link in `cell`, or of the cell itself if it has no link."""
    source = cell.a if cell.a is not None else cell
    return source.text.strip()


# Collect one entry per postal code (in first-seen order) and build the
# DataFrame once at the end. Growing the frame row by row relied on
# DataFrame.append, which was removed in pandas 2.0, and rescanned the whole
# frame for every parsed row.
entries_by_postal_code = {}
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) < 3:
        # Rows without <td> cells (e.g. the header row) or with fewer than
        # three cells cannot provide all three columns.
        continue
    postal_code_cell, borough_cell, neighborhood_cell = cells[:3]
    postal_code = _cell_text(postal_code_cell)

    # Ignore cells with a borough that is Not assigned.
    if borough_cell.a is None and borough_cell.text.strip() == 'Not assigned':
        continue
    borough = _cell_text(borough_cell)

    # A Not assigned neighborhood takes the name of its borough.
    if neighborhood_cell.a is None and neighborhood_cell.text.strip() == 'Not assigned':
        neighborhood = borough
    else:
        neighborhood = _cell_text(neighborhood_cell)

    # Combine neighborhoods that share a postal code; the borough of the
    # first row seen for that postal code is the one that is kept.
    if postal_code in entries_by_postal_code:
        entries_by_postal_code[postal_code]['Neighborhood'].append(neighborhood)
    else:
        entries_by_postal_code[postal_code] = {'Borough': borough, 'Neighborhood': [neighborhood]}

table_of_postal_codes = pd.DataFrame(
    [
        {
            'Postal Code': postal_code,
            'Borough': entry['Borough'],
            'Neighborhood': ', '.join(entry['Neighborhood']),
        }
        for postal_code, entry in entries_by_postal_code.items()
    ],
    columns=column_names,
)

In [4]:
# Preview the first 10 rows of the scraped postal-code table.
table_of_postal_codes.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [5]:
# (rows, columns) of the scraped postal-code table.
table_of_postal_codes.shape

(103, 3)

### Tried to get the latitude and longitude through the geocoder package, but the API was not working, so the coordinates are read from a CSV file instead

In [6]:

# import geocoder

# # initialize your variable to None
# lat_lng_coords = None
# postal_code='M1E'
# i=0
# # loop until you get the coordinates
# while(lat_lng_coords is None):
#     print(i)
#     g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#     lat_lng_coords = g.latlng
#     i=i+1

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [7]:
import pandas as pd
# Load the coordinates table from a CSV file; the path is relative to the
# notebook's working directory.
lat_long=pd.read_csv('Geospatial_Coordinates.csv')

In [8]:
# Preview the first rows of the coordinates table.
lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Attach the coordinates to each postal code. This is still an inner join on
# 'Postal Code'; validate='one_to_one' makes pandas raise a MergeError if the
# key is duplicated on either side, instead of silently multiplying rows.
table_of_postal_codesWithLongLat = pd.merge(
    table_of_postal_codes,
    lat_long,
    on='Postal Code',
    validate='one_to_one',
)

In [13]:
# Preview the first rows of the merged table.
table_of_postal_codesWithLongLat.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [15]:
# (rows, columns) of the merged table.
table_of_postal_codesWithLongLat.shape

(103, 5)