In [3]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd

### I download the Wiki page and read it with BeautifoulSoup

In [4]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(URL)
soup = BeautifulSoup(response.text,'html.parser')

In [5]:
table = soup.find('table', {'class':'wikitable sortable'}).tbody

In [6]:
rows = table.find_all('tr')

In [7]:
columns = [v.text.replace('\n', '') for v in rows[0].find_all('th')]

In [8]:
print(columns)

['Postcode', 'Borough', 'Neighbourhood']


### I create the dataframe with the postal codes.
### The last line of the cell is the only way I found to skip di "\n" from the Neighbourhood

In [9]:
df = pd.DataFrame(columns=columns)

In [12]:
for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) == 4:
        values = [tds[0].text, tds[1].text,'', tds[2].text.replace('\n',''), tds[3].replace('\n','')]
    else:
        values = [td.text.replace('\n','') for td in tds]
  

    df = df.append(pd.Series(values, index = columns), ignore_index=True)
    df['Neighbourhood'].replace('\n','')
    

In [11]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### I drop the lines where the Borough value is "Not assigned"

In [21]:
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)

In [22]:
df.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights


In [19]:
df.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M5A,Downtown Toronto,Regent Park
4,6,M6A,North York,Lawrence Heights


In [30]:
df.drop(['index'], axis = 1)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### I group all the neighbourhoods with the same Borough

In [33]:
df1 = df.groupby("Postcode").agg(lambda x:','.join(set(x)))

In [34]:
df1.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
M1E,Scarborough,"Guildwood,West Hill,Morningside"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [35]:
df1.reset_index()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
2,M1E,Scarborough,"Guildwood,West Hill,Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park,Ionview,East Birchmount Park"
7,M1L,Scarborough,"Oakridge,Golden Mile,Clairlea"
8,M1M,Scarborough,"Scarborough Village West,Cliffcrest,Cliffside"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### Finally I assign the value in the Borough cell to Neighbouhoods with "Not assigned" value

In [36]:
df1.loc[df1['Neighbourhood']=="Not assigned",'Neighbourhood']=df1.loc[df1['Neighbourhood']=="Not assigned",'Borough']

### Finally, I check the shape of the dataframe

In [37]:
df1.shape

(103, 2)

In [17]:
df1.to_csv('toronto.csv')

### I download the csv file with the coordinates codes in a dataframe

In [32]:
coord = pd.read_csv('geospatial_coordinates.csv')

### I rename the Postal Code columns to allow the merge with the other database

In [34]:
coord = coord.rename(columns = {'Postal Code':'Postcode'})

In [38]:
df1_new = pd.merge(df1, coord,  how = 'outer', on='Postcode')

### Here is the new dataframe

In [39]:
df1_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,West Hill,Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [40]:
df1_new.shape

(103, 5)