Importing Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Getting the postal-code information for Canada from the Wikipedia URL

In [2]:
# Download the raw HTML of Wikipedia's list of Canadian postal codes beginning with M.
cannada_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(cannada_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page later on.
response.raise_for_status()
cannada = response.text

In [3]:
# The downloaded page is HTML, so parse it with an HTML parser. The 'xml' feature
# selects the XML parser, which is intended for XML documents rather than web pages.
soup = BeautifulSoup(cannada, 'html.parser')

Finding the table from the text

In [4]:
# Grab the first <table> element of the parsed page.
table = soup.table

Creating a dataframe with the required columns

In [5]:
# Start from an empty frame with three columns: Postalcode, Borough and Neighborhood.
column_names = ['Postalcode', 'Borough', 'Neighborhood']
toronto = pd.DataFrame(data=None, columns=column_names)

In [6]:
# Collect the postcode, borough and neighborhood from every table row.
# Rows are gathered in a plain list and the frame is built once at the end:
# growing a DataFrame one row at a time is quadratic, and appending to the
# existing frame would duplicate every row whenever this cell is re-run.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # Header rows (which use <th>) and malformed rows do not yield three <td> cells.
    if len(row_data) == 3:
        rows.append(row_data)
toronto = pd.DataFrame(rows, columns=column_names)

In [7]:
# Preview the first five scraped rows.
toronto.head(5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Removing the rows whose Borough is "Not assigned"

In [8]:
# Keep only the postal codes whose borough has been assigned, then preview the result.
is_assigned = toronto['Borough'] != 'Not assigned'
toronto = toronto[is_assigned]
toronto.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Joining the rows where the PostalCode is the same and the Neighborhood is different

In [9]:
# For each postal code, concatenate its neighborhoods into one comma-separated
# string, then turn the grouped Series back into a two-column frame.
temp_toronto = (
    toronto
    .groupby('Postalcode')['Neighborhood']
    .apply(', '.join)
    .reset_index()
)

Renaming the column name of Neighborhood

In [10]:
# Give the joined column a distinct name so it does not clash with the
# original 'Neighborhood' column when the two frames are merged.
temp_toronto = temp_toronto.rename(columns={'Neighborhood': 'Neighborhood_joined'})

In [11]:
# Preview the first five joined rows.
temp_toronto.head(5)

Unnamed: 0,Postalcode,Neighborhood_joined
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [12]:
# Show the filtered frame again for comparison with the joined one.
toronto.head(5)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Merging the two dataframes for updated information

In [13]:
# Attach the joined neighborhood string to every row with the same postal code.
toronto_merge = pd.merge(
    left=toronto,
    right=temp_toronto,
    how='inner',
    on='Postalcode',
)

In [14]:
# Preview the first five merged rows.
toronto_merge.head(5)

Unnamed: 0,Postalcode,Borough,Neighborhood,Neighborhood_joined
0,M3A,North York,Parkwoods,Parkwoods
1,M4A,North York,Victoria Village,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront","Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights","Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government","Queen's Park, Ontario Provincial Government"


Dropping the extra column, Neighborhood

In [15]:
# The per-row 'Neighborhood' column is superseded by 'Neighborhood_joined'.
toronto_merge = toronto_merge.drop(columns=['Neighborhood'])

Dropping the duplicates.

In [16]:
# Collapse rows that are now identical, keeping the first occurrence of each.
toronto_merge = toronto_merge.drop_duplicates()

Renaming the column 'Neighborhood_joined' as 'Neighborhood'

In [17]:
# Restore the original column name now that the per-row column is gone.
toronto_merge = toronto_merge.rename(columns={'Neighborhood_joined': 'Neighborhood'})

In [18]:
# Preview the first five rows of the final frame.
toronto_merge.head(5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Shape of the final Dataframe

In [19]:
# (number of rows, number of columns) of the final frame.
toronto_merge.shape

(103, 3)