# Web Scraping

In [48]:
# Web scraping tutorial this cell follows:
# https://simpleanalytical.com/how-to-web-scrape-wikipedia-python-urllib-beautiful-soup-pandas

# Imports are grouped at the top of the cell (standard library first, then
# third-party) so a fresh kernel fails immediately on a missing dependency.
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# URL of the web page we are going to scrape.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Open the URL and parse its HTML into a BeautifulSoup tree. The `with` block
# closes the HTTP response as soon as parsing is done instead of leaving the
# connection open for the lifetime of the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

# Every <table> element in the page; kept for interactive inspection.
all_tables = soup.find_all("table")
# all_tables

# The table we want is the one whose class attribute is 'wikitable sortable'.
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # None in the loop below.
    raise ValueError("No <table class='wikitable sortable'> found at " + url)
# right_table

# Collect the table column by column.
A = []  # first cell of each row  -> 'Postal Code'
B = []  # second cell of each row -> 'Borough'
C = []  # third cell of each row  -> 'Neighborhood'

# NOTE: rows whose borough is 'Not assigned' are deliberately kept here; the
# Data Cleaning cell is responsible for removing them. The previous version of
# this loop compared the raw cell text to 'Not assigned', but the recorded
# result of 180 rows (103 after cleaning) shows that test never excluded
# anything -- presumably because the raw text carries surrounding whitespace.
for row in right_table.find_all('tr'):      # <tr> marks a table row
    cells = row.find_all('td')              # <td> marks a data cell
    if len(cells) != 3:
        continue                            # skip rows without exactly three <td> cells
    # get_text(" ", strip=True) returns the cell's text with surrounding
    # whitespace removed, so downstream comparisons see clean values.
    postal_code, borough, neighborhood = (
        cell.get_text(" ", strip=True) for cell in cells
    )
    A.append(postal_code)
    B.append(borough)
    C.append(neighborhood)

# print(A)
# print(B)
# print(C)

# The raw dataframe consists of three columns, in table order.
df = pd.DataFrame({'Postal Code': A, 'Borough': B, 'Neighborhood': C})
df.shape #(180,3)

(180, 3)

# Data Cleaning

In [56]:
# Placeholder text the scraped table uses for a missing borough / neighborhood.
NOT_ASSIGNED = 'Not assigned'

# Compare on whitespace-stripped text so a stray newline around the scraped
# value cannot hide the placeholder. The stored values themselves are left
# untouched by this cell.
borough_text = df['Borough'].str.strip()
neighborhood_text = df['Neighborhood'].str.strip()

# Rule 1: only process the cells that have an assigned borough. Ignore cells
# with a borough that is Not assigned (or missing altogether).
has_borough = borough_text.notna() & borough_text.ne(NOT_ASSIGNED)

# Rule 2: if a cell has a borough but a Not assigned neighborhood, then the
# neighborhood will be the same as the borough.
# The check is limited to the Neighborhood column and applied row by row with
# `mask`; a frame-wide replace would turn the placeholder into NaN before this
# fallback could see it.
neighborhood_unassigned = neighborhood_text.eq(NOT_ASSIGNED)

# Build the cleaned frame without in-place mutation so re-running the cell on
# an already-cleaned `df` gives the same result.
df = (
    df.assign(Neighborhood=df['Neighborhood'].mask(neighborhood_unassigned, df['Borough']))
      .loc[has_borough]
      .reset_index(drop=True)
)

# Sanity checks: these fail loudly instead of computing a value that is
# silently discarded because it is not the last expression of the cell.
assert df['Borough'].notna().all()
assert not df['Borough'].str.strip().eq(NOT_ASSIGNED).any()
assert not df['Neighborhood'].str.strip().eq(NOT_ASSIGNED).any()

# df.head()
df.shape

(103, 3)