# Part 1 - Scraping Wikipedia

In [97]:
# Import Modules
from html.parser import HTMLParser
import requests
import re
import pandas as pd

In [98]:
# Fetch target HTML page
canada_postal_codes_url = r"https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(canada_postal_codes_url, timeout=30)
# Fail here, with the HTTP status, instead of later while parsing an error page.
response.raise_for_status()
page_html = response.content.decode("utf-8")

In [99]:
# Extract Toronto FSAs
# Raw string so the regex reaches `re` verbatim; "<", ">" and "/" need no escaping.
# The group is greedy on purpose: the match runs from the first "<table" to the
# last "</table>" in the page.
pattern = r"<table([\S\s]*)</table>"
matches = re.search(pattern, page_html)
if matches is None:
    # Without this guard the next line fails with an opaque AttributeError on None.
    raise ValueError("No <table>...</table> markup found in the downloaded page")
table_body = matches.group(0)

In [100]:
# Drop the table wrapper tags, then trim the surrounding whitespace.
table_body = (
    table_body
    .replace('<table class="wikitable sortable">', '')
    .replace('</table>', '')
    .strip()
)

In [101]:
# Define custom HTML parser
class Parser(HTMLParser):
    """Collect the text nodes of every ``<tr>`` element fed to the parser.

    Attributes:
        rows:   one list of stripped text nodes per completed ``<tr>``.
        col:    the most recently completed row (the same object as ``rows[-1]``).
        buffer: text nodes seen since the last ``<tr>`` start tag.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``HTMLParser`` and reset the trackers."""
        super().__init__(*args, **kwargs)
        self.rows = []
        self.col = []
        self.buffer = []

    def handle_starttag(self, tag, attrs):
        """Start a fresh buffer whenever a ``<tr>`` opens; ignore other tags."""
        if tag == "tr":
            self.buffer = []

    def handle_endtag(self, tag):
        """On ``</tr>``, store a copy of the buffered text nodes as a row."""
        if tag == "tr":
            # Copy, so later writes to the buffer cannot alter the stored row.
            self.col = self.buffer[:]
            self.rows.append(self.col)

    def handle_data(self, data):
        """Append every text node, stripped of surrounding whitespace.

        Text seen outside a ``<tr>`` is buffered too; it is discarded when the
        next ``<tr>`` start tag resets the buffer.
        """
        self.buffer.append(data.strip())

    def get_rows(self):
        """Return the odd-indexed text nodes of each row with at most 6 nodes.

        Rows with more than 6 text nodes are skipped. Only positions 1, 3, 5
        are kept; the even positions presumably hold the whitespace between
        tags in the page markup (TODO confirm against the live page).

        Reads this instance's ``rows``, so it does not depend on a global
        variable named ``parser``.
        """
        return [row[1::2] for row in self.rows if len(row) <= 6]


In [102]:
# Scrape all data items in HTML
parser = Parser()
parser.feed(table_body)
# close() makes the parser process anything feed() left buffered.
parser.close()
rows = parser.get_rows()
if not rows:
    # Later cells use rows[0] as the header; fail here with a clear message.
    raise ValueError("No table rows were parsed from table_body")

### Data Preprocessing

**Condition**: If a cell has a borough but its neighborhood is "Not assigned", the neighborhood is set to the borough name.

In [103]:
# Cleaning dictionary: postal code -> [borough, [neighborhood]]
rd = {}
for row in rows[1:]:  # rows[0] is the header row
    postal_code, borough, neighborhood = row[0], row[1], row[2]
    # If a cell has a borough but a Not assigned neighborhood, then the
    # neighborhood will be the same as the borough. The placeholder is matched
    # as the literal text "Not assigned" as well as an empty string.
    if borough and neighborhood in ("", "Not assigned"):
        neighborhood = borough
    # A later row with the same postal code replaces the earlier entry.
    rd[postal_code] = [borough, [neighborhood]]

**Condition**: More than one neighborhood can exist in one postal code area; such neighborhoods are listed together in a single row, separated by commas.

In [104]:
# More than one neighborhood can exist in one postal code area:
# gather every neighborhood listed for a postal code, in page order.
neighborhoods_by_code = {}
for row in rows[1:]:  # rows[0] is the header row
    postal_code, borough, neighborhood = row[0], row[1], row[2]
    # An empty or "Not assigned" neighborhood falls back to the borough name.
    if borough and neighborhood in ("", "Not assigned"):
        neighborhood = borough
    names = neighborhoods_by_code.setdefault(postal_code, [])
    if neighborhood not in names:
        names.append(neighborhood)

# Replace each list wholesale so re-running this cell never appends duplicates.
for postal_code, names in neighborhoods_by_code.items():
    rd[postal_code][1] = names

In [105]:
# Create DataFrame
# Build all records first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
columns = rows[0]
records = [
    dict(zip(columns, (postal_code, borough, ",".join(neighborhoods))))
    for postal_code, (borough, neighborhoods) in rd.items()
]
# Passing columns keeps the header order and yields an empty, correctly
# labelled frame when there are no records.
df = pd.DataFrame(records, columns=columns)

**Cleaning Dataframe**

In [106]:
# Keep only the rows with an assigned borough; rows whose borough is
# "Not assigned" are dropped.
df = df.loc[df["Borough"].ne("Not assigned")]

In [107]:
# Renumber the rows from 0 after filtering, discarding the old index.
df = df.reset_index(drop=True)

In [108]:
# Preview the first rows of the cleaned DataFrame.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [109]:
# (rows, columns) of the cleaned DataFrame.
df.shape

(103, 3)