In [1]:
#Importing Libraries
from bs4 import BeautifulSoup  #Package for scraping data
import pandas as pd
import requests  #To get the desired web page

## Data Scraping

In [2]:
# Download the Wikipedia page and locate the postal-code table.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
website_url = response.text  # NOTE: despite the name, this holds the page's HTML text
wiki=BeautifulSoup(website_url,'lxml')
my_table=wiki.find('table',{'class':'wikitable sortable'}) # First <table> whose class matches
if my_table is None:
    # find() returns None when nothing matches; fail here rather than in a later cell.
    raise ValueError("No 'wikitable sortable' table found on the page")

## Converting into a DataFrame

In the following section, the conversion to a DataFrame is divided into two parts:
 - Title
 - Body

### Creating the Title List

In [3]:
# Build the list of column titles from the table's header cells (<th> elements).
header_cells = my_table.find_all('th')
TitleLink = list(map(str, header_cells))  # raw header markup as strings, kept under its original name
# get_text(strip=True) returns only the text of each cell with surrounding
# whitespace (including a trailing newline) removed, so the titles no longer
# depend on fixed character offsets into the markup.
Title = [cell.get_text(strip=True) for cell in header_cells]
print(Title)

['Postcode', 'Borough', 'Neighbourhood']


### Creating the Body List

In [4]:
TableLink=my_table.find_all('td')   # Every <td> (table data cell) element found inside the table


In [5]:
# Each table row contributes three consecutive <td> cells, so walk the flat
# cell list in strides of three and pick the cell at each offset.
row_starts = range(0, len(TableLink), 3)

# Every cell is converted to its markup string for the text clean-up that follows.
row1 = [str(TableLink[start]) for start in row_starts]      # 'Postcode' cells
row2 = [str(TableLink[start + 1]) for start in row_starts]  # 'Borough' cells
row3 = [str(TableLink[start + 2]) for start in row_starts]  # 'Neighbourhood' cells

In [6]:
# Trim the enclosing cell markup by fixed offsets: 4 leading characters for
# "<td>" and 5 trailing characters for "</td>". The third column drops 6
# trailing characters because it also carries a newline before "</td>".
row1[:] = [cell[4:-5] for cell in row1]
row2[:] = [cell[4:-5] for cell in row2]
row3[:] = [cell[4:-6] for cell in row3]

# Only the last expression of a cell is rendered, so just the row3 preview is shown.
row1[0:5]
row2[0:5]
row3[0:5]

['Not assigned',
 'Not assigned',
 '<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>',
 '<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>',
 '<a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>']

In [7]:
# Borough values that are hyperlinks still carry their <a ...>...</a> markup.
# Keep the text between the first '>' (end of the opening tag) and the final
# four characters (the closing "</a>"); offsets are based on the observed markup.
for pos, cell in enumerate(row2):
    if not cell.startswith('<a'):
        continue
    text_start = cell.index('>') + 1
    row2[pos] = cell[text_start:-4]
row2[0:5]

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto']

In [8]:
# Neighbourhood values that are hyperlinks still carry their <a ...>...</a> markup.
# Keep the text between the first '>' (end of the opening tag) and the final
# four characters (the closing "</a>"); offsets are based on the observed markup.
for pos, cell in enumerate(row3):
    if cell.startswith('<a'):
        text_start = cell.index('>') + 1
        row3[pos] = cell[text_start:-4]

row3[0:5]

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Harbourfront']

## Data Manipulation



In [9]:
# Assemble the three cleaned columns into a frame, then label the columns
# with the titles scraped from the table header.
columns_by_position = {'1': row1, '2': row2, '3': row3}  # placeholder keys, replaced just below
data = pd.DataFrame(columns_by_position)
data.columns = Title
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
# Keep only the rows whose Borough has been assigned, then renumber the index from 0.
has_borough = data['Borough'] != 'Not assigned'
data = data[has_borough].reset_index(drop=True)
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [11]:
# Where the Neighbourhood is 'Not assigned', fall back to the row's Borough name.
# This is done as one vectorised column replacement over every row: a per-row
# write through data.iloc[i]['Neighbourhood'] is chained indexing, which pandas
# does not guarantee to propagate back to the frame, and a loop starting at
# index 2 would never examine the first two rows.
unassigned = data['Neighbourhood'] == 'Not assigned'
data = data.assign(Neighbourhood=data['Neighbourhood'].mask(unassigned, data['Borough']))

In [12]:
# Collapse rows sharing a (Postcode, Borough) pair into a single row whose
# Neighbourhood is the comma-separated list of that group's neighbourhoods.
neighbourhoods_by_code = data.groupby(['Postcode', 'Borough'])['Neighbourhood']
data = neighbourhoods_by_code.apply(', '.join).reset_index()
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
# Dimensions of the grouped frame as (number of rows, number of columns)
data.shape

(103, 3)