#### Import all the relevant libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import requests
import os
import codecs
import pandas as pd
import numpy as np

#### Import the web scraping library: BeautifulSoup

In [2]:
from bs4 import BeautifulSoup

#### Define the wikipedia page. 

In [3]:
# URL of the Wikipedia article listing Canadian postal codes that start with "M".
wikipage = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In the function below, `html5lib` plays the role of the HTML parser used by BeautifulSoup; `lxml` is another example of such a parser.

Parsers offer an interface that lets programmers easily access and modify the HTML string.

In [4]:
def get_wikipedia(wikipage, timeout=30):
    """
        Fetch a web page with an HTTP GET request and parse it with BeautifulSoup.

        Parameters:
            wikipage: URL of the page to download.
            timeout: seconds to wait for the server before giving up
                     (passed to requests.get).

        Returns: a BeautifulSoup object built from the response text with
                 the 'html5lib' parser.

        Raises: requests.HTTPError if the server answers with an error status,
                so an error page is never parsed as if it were the article.
    """
    # Without a timeout, requests can wait on an unresponsive server forever.
    response = requests.get(wikipage, timeout=timeout)
    response.raise_for_status()

    return BeautifulSoup(response.text, 'html5lib')

The function below strips away the HTML markup and keeps only the table data: Postcode, Borough and Neighbourhood.
It also saves the resulting table to a CSV file.

In [5]:
def _parse_span(cell, attr):
    """Return the cell's 'rowspan'/'colspan' attribute as an integer of at least 1."""
    try:
        span = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        # Malformed attribute values fall back to a single, unspanned cell.
        return 1
    return max(span, 1)


def _table_to_grid(table):
    """Expand one table element into a rectangular list of rows of cell text."""
    rows = table.find_all("tr")
    filled = {}  # (row index, column index) -> cell text
    n_cols = 0
    for i, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"]):
            # Skip slots already claimed by a rowspan coming from an earlier row.
            while (i, col) in filled:
                col += 1
            text = cell.text.replace('\n', '')
            row_span = _parse_span(cell, 'rowspan')
            col_span = _parse_span(cell, 'colspan')
            for k in range(row_span):
                if i + k >= len(rows):
                    break  # the span runs past the last row of the table
                for l in range(col_span):
                    filled[(i + k, col + l)] = text
            col += col_span
            n_cols = max(n_cols, col)

    # Slots that no cell covers stay as empty strings so every row has n_cols fields.
    return [[filled.get((i, j), '') for j in range(n_cols)]
            for i in range(len(rows))]


def convert_table(html_soup, name='wiki_table', return_df=True):
    """
       Convert every 'wikitable' table of a BeautifulSoup object to a csv file.

       Each table is written to 'table_<index>_<page>.csv' (relative to the
       current working directory), where <page> is the part of `name` after
       its last '/'. Newlines are removed from the cell text, cells spanning
       several rows/columns are repeated in every slot they cover, and fields
       containing commas or quotes are quoted so the file stays a valid csv.

       Parameters:
           html_soup: parsed page offering find_all, e.g. a BeautifulSoup object.
           name: label used to build the csv file names.
           return_df: when True return the last table as a dataframe,
                      otherwise return the name of the last csv file written.

       Returns: pandas dataframe read back from the last csv file (its first
                row becomes the header), or that file's name.

       Raises: ValueError if the page contains no table with class 'wikitable'.
    """
    tables = html_soup.find_all("table", {"class": "wikitable"})
    if not tables:
        raise ValueError("no table with class 'wikitable' found in the page")

    page = name.split('/')[-1]
    fname = None
    for tn, table in enumerate(tables):
        fname = 'table_{}_{}.csv'.format(tn, page)
        grid = _table_to_grid(table)
        # to_csv opens and closes the file itself, quotes fields where needed
        # and writes UTF-8, which is what pd.read_csv expects by default.
        pd.DataFrame(grid).to_csv(fname, header=False, index=False)

    if return_df:
        return pd.read_csv(fname)

    return fname

The function below handles the 'Not assigned' values in the table entries. It also merges rows that share the same postcode and borough by using a group-by.

In [6]:
def postal_codes(raw_df):
    """
        Clean the scraped table and collapse it to one row per postcode/borough.

        'Not assigned' entries become Not a Number; a missing Neighbourhood is
        then filled with the Borough of the same row, and rows that still hold
        a missing value are dropped.
        Returns: a dataframe grouped by Postcode and Borough, in order of first
        appearance after sorting by Neighbourhood, with the neighbourhood names
        of each group joined by ', '.
    """
    def _join_names(names):
        return ', '.join(names)

    cleaned = raw_df.replace(to_replace='Not assigned', value=np.nan)
    cleaned['Neighbourhood'] = cleaned['Neighbourhood'].fillna(cleaned['Borough'])

    complete_rows = cleaned.dropna(axis=0)
    ordered_rows = complete_rows.sort_values('Neighbourhood')
    grouped = ordered_rows.groupby(['Postcode', 'Borough'],
                                   as_index=False,
                                   sort=False)

    return grouped['Neighbourhood'].agg(_join_names)

The function below combines all the functions defined above to form a web scraping pipeline.

In [7]:
def web_scraping_pipeline(page):
    """
        Run the whole scraping pipeline for one page.

        The page is fetched with get_wikipedia, turned into a dataframe by
        convert_table, and finally cleaned and grouped by postal_codes.
        Returns: the dataframe produced by postal_codes.
    """
    return postal_codes(
        convert_table(get_wikipedia(page), return_df=True)
    )

In [8]:
# Call the pipeline on the Wikipedia page defined earlier and keep the resulting dataframe.
postcodes_df = web_scraping_pipeline(wikipage)

#### Sanity check. Print the first 20 entries. 

In [9]:
# Display up to the first 20 rows of the pipeline result.
postcodes_df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M5H,Downtown Toronto,"Adelaide, King, Richmond"
1,M1S,Scarborough,Agincourt
2,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
3,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
4,M8W,Etobicoke,"Alderwood, Long Branch"
5,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights"
6,M5V,Downtown Toronto,"Bathurst Quay, CN Tower, Harbourfront West, Is..."
7,M2K,North York,Bayview Village
8,M5M,North York,"Bedford Park, Lawrence Manor East"
9,M5E,Downtown Toronto,Berczy Park


In [10]:
# Print the shape of the dataframe as (number of rows, number of columns).
print(postcodes_df.shape)

(103, 3)
