In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import lxml.html as lh

In [2]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(data, 'html.parser')

### Using Beautiful Soup to pull out text from the website, into a list


In [3]:
postalCode = []
borough = []
neighborhood= []

for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCode.append(cells[0].text)
        borough.append(cells[1].text)
        neighborhood.append(cells[2].text.rstrip('\n'))

### Converting the lists into a dataframe

In [4]:
Ndict = [('Postal Code', postalCode),
                      ('Borough', borough),
                      ('Neighborhood', neighborhood)]
df = pd.DataFrame.from_dict(dict(Ndict))
df


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Dropping all the rows where the borough is not defined


In [5]:
df.drop(df[df.Borough == 'Not assigned'].index, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Using the agg and join function to combine rows where the code and borough name is same.

In [7]:
dfgrouped = df.groupby(['Postal Code','Borough']).agg(lambda x: ','.join(x))
dfgrouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
Postal Code,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


In [8]:
dfgrouped.shape

(103, 1)

In [16]:
dflat = pd.read_csv('Geospatial_Coordinates.csv')
dfnew = pd.merge(dfgrouped,dflat,on='Postal Code')

In [18]:
dfnew.head()

Unnamed: 0,Postal Code,Neighborhood,Latitude,Longitude
0,M1B,"Rouge,Malvern",43.806686,-79.194353
1,M1C,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Woburn,43.770992,-79.216917
4,M1H,Cedarbrae,43.773136,-79.239476


In [19]:
dfnew.shape

(103, 4)