In [217]:
from bs4 import BeautifulSoup
import requests
import numpy
import pandas as pd

### Reading the data from the web using requests

In [373]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error
# page be parsed as if it were the real article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

### Parsing the HTML text into a BeautifulSoup tree

In [374]:
# Parse the downloaded page text with the lxml parser so it can be searched by tag.
soup = BeautifulSoup(markup=source, features='lxml')

In [375]:
# Collect every <table> element whose CSS class is "wikitable".
stat_table = soup.find_all(name='table', attrs={'class': 'wikitable'})

In [376]:
# Show how many matching tables were found.
len(stat_table)

1

In [377]:
# Take the first "wikitable" straight from the parsed page. Looking it up from
# `soup` (rather than indexing stat_table with [0]) means the result does not
# depend on what stat_table currently holds, so this cell can be re-run safely.
stat_table = soup.find('table', class_='wikitable')
if stat_table is None:
    raise ValueError("no table with class 'wikitable' was found on the page")

### Storing the data in a list and then converting it to a DataFrame

In [378]:
# Flatten the table into one list of cell texts: walk every row (<tr>), take
# each data cell (<td>) in it, and strip the newline characters from its text.
# NOTE(review): the name `list` shadows the Python builtin; it is kept here
# because the following cell reads it under this name.
list = [
    td.text.replace("\n", "")
    for tr in stat_table.find_all('tr')
    for td in tr.find_all('td')
]

In [480]:
# Turn the flat list of cell texts into a NumPy array and show its shape.
num = numpy.asarray(list)
num.shape

(867,)

In [380]:
# Column names of the scraped table, in the order the cells appear in each row.
Headers = ["Postcode","Borough","Neighbourhood"]

# Fold the flat array into one row per table row. The row count is inferred
# (-1) from the number of columns instead of being hard-coded, so the cell
# keeps working when the Wikipedia table gains or loses rows.
reshape = num.reshape(-1, len(Headers))

Canada_df = pd.DataFrame(reshape, columns = Headers)
Canada_df


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Indexing the data by Borough to remove 'Not assigned' rows

In [383]:
# Move the Borough column into the index so rows can be dropped by borough label.
Canada_df = Canada_df.set_index(keys="Borough")
Canada_df

Unnamed: 0_level_0,Postcode,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Not assigned,M1A,Not assigned
Not assigned,M2A,Not assigned
North York,M3A,Parkwoods
North York,M4A,Victoria Village
Downtown Toronto,M5A,Harbourfront
Downtown Toronto,M5A,Regent Park
North York,M6A,Lawrence Heights
North York,M6A,Lawrence Manor
Queen's Park,M7A,Not assigned
Not assigned,M8A,Not assigned


In [384]:
# Remove every row whose index label (the borough) is 'Not assigned'.
Canada_df = Canada_df.drop(index='Not assigned')
Canada_df

Unnamed: 0_level_0,Postcode,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
North York,M3A,Parkwoods
North York,M4A,Victoria Village
Downtown Toronto,M5A,Harbourfront
Downtown Toronto,M5A,Regent Park
North York,M6A,Lawrence Heights
North York,M6A,Lawrence Manor
Queen's Park,M7A,Not assigned
Etobicoke,M9A,Islington Avenue
Scarborough,M1B,Rouge
Scarborough,M1B,Malvern


### Resetting the index and the column order

In [385]:
# Turn the Borough index back into an ordinary column and restore a 0..n-1 index.
Canada_df = Canada_df.reset_index(drop=False)
Canada_df

Unnamed: 0,Borough,Postcode,Neighbourhood
0,North York,M3A,Parkwoods
1,North York,M4A,Victoria Village
2,Downtown Toronto,M5A,Harbourfront
3,Downtown Toronto,M5A,Regent Park
4,North York,M6A,Lawrence Heights
5,North York,M6A,Lawrence Manor
6,Queen's Park,M7A,Not assigned
7,Etobicoke,M9A,Islington Avenue
8,Scarborough,M1B,Rouge
9,Scarborough,M1B,Malvern


In [369]:
# Put the columns back into the order given by Headers.
# NOTE(review): the saved output under this cell shows Borough as the index and
# an empty Borough column, and its execution count is lower than its
# neighbours' - it looks like it was captured while Borough was still the
# index (reindex fills columns it cannot find with NaN). Re-run the notebook
# top to bottom to refresh it.
Canada_df = Canada_df.reindex(Headers, axis="columns")
Canada_df

Unnamed: 0_level_0,Postcode,Borough,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
North York,M3A,,Parkwoods
North York,M4A,,Victoria Village
Downtown Toronto,M5A,,Harbourfront
Downtown Toronto,M5A,,Regent Park
North York,M6A,,Lawrence Heights
North York,M6A,,Lawrence Manor
Queen's Park,M7A,,Not assigned
Etobicoke,M9A,,Islington Avenue
Scarborough,M1B,,Rouge
Scarborough,M1B,,Malvern


### Replacing 'Not assigned' Neighbourhood values with the Borough value

In [386]:
# Rows whose neighbourhood is 'Not assigned' take the borough name as their
# neighbourhood; every other row keeps its existing value.
neighbourhood_missing = Canada_df['Neighbourhood'] == 'Not assigned'
Canada_df['Neighbourhood'] = numpy.where(
    neighbourhood_missing,
    Canada_df['Borough'],
    Canada_df['Neighbourhood'],
)
Canada_df.head()


Unnamed: 0,Borough,Postcode,Neighbourhood
0,North York,M3A,Parkwoods
1,North York,M4A,Victoria Village
2,Downtown Toronto,M5A,Harbourfront
3,Downtown Toronto,M5A,Regent Park
4,North York,M6A,Lawrence Heights
5,North York,M6A,Lawrence Manor
6,Queen's Park,M7A,Queen's Park
7,Etobicoke,M9A,Islington Avenue
8,Scarborough,M1B,Rouge
9,Scarborough,M1B,Malvern


In [465]:
# Canada_df1 holds the data in Postcode / Borough / Neighbourhood column order
# (the order given by Headers).
Canada_df1 = Canada_df.reindex(columns=Headers)

# Canada_df is kept as a second view of the same data with the columns ordered
# Borough / Neighbourhood / Postcode.
Canada_df = Canada_df1.reindex(columns=['Borough', 'Neighbourhood', 'Postcode'])

# Only the last expression of a cell is displayed, so the former bare
# `Canada_df` line in the middle of this cell showed nothing and was removed.
Canada_df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Using groupby and a lambda to combine the Neighbourhood values of each Postcode into one comma-separated string

In [469]:
# Collapse the rows that share a Postcode and Borough into a single row whose
# remaining column values are joined into one ", "-separated string.
Canada_df1 = (
    Canada_df1
    .groupby(['Postcode', 'Borough'], as_index=False)
    .agg(lambda values: ", ".join(values))
)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [478]:
# Show the (rows, columns) size of the grouped frame.
Canada_df1.shape

(103, 3)