## Import relevant functionalities

In [13]:
from selenium import webdriver
import pandas as pd
import numpy as np

### 'selenium' is a web automation tool that is very useful for scraping as well. 
1. A driver object is created with the help of webdriver. 
2. The number of rows and columns is found by counting all the 'tr' elements in the table and the 'td' elements in a row, respectively.
3. A loop scrapes each row of the table on Wikipedia, appending each row to a list in order.
4. A dataframe is then constructed with this list and column names.

In [26]:
# Launch Chrome through the local chromedriver binary.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; newer Selenium releases expect a Service object instead -- confirm
# against the installed Selenium version.
driver_path = 'chromedriver.exe'
driver = webdriver.Chrome(driver_path)

# XPath matching every body row of the first table in the article content
table_rows_xpath = '//*[@id="mw-content-text"]/div/table[1]/tbody/tr'

try:
    driver.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

    # find_elements('xpath', ...) is used instead of find_elements_by_xpath,
    # which no longer exists in Selenium >= 4.3; 'xpath' is the value of By.XPATH.
    row_elements = driver.find_elements('xpath', table_rows_xpath)
    rows = len(row_elements)
    cols = len(driver.find_elements('xpath', table_rows_xpath + '[3]/td'))

    #--- collect the text of every 'td' cell, one list per table row.
    # The rows are located once above and the cells are looked up relative to
    # each row, rather than re-querying the whole document for every row index.
    list_of_rows = [
        [cell.text for cell in row_element.find_elements('xpath', './td')]
        for row_element in row_elements
    ]
finally:
    # Always shut the browser down, even if loading or scraping fails,
    # so no Chrome/chromedriver process is left running.
    driver.quit()

#--- build the data frame from the scraped rows
df = pd.DataFrame(list_of_rows, columns = ['PostalCode','Borough','Neighborhood'])
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


### Observations before wrangling
1. Some PostalCodes do not have a Borough assigned. Replace these 'Not assigned' values with NaN and drop the corresponding rows.
2. Using the isnull() method on the 'Neighborhood' column, it was found that there were no null values in the column, meaning every PostalCode with an assigned Borough has at least one Neighborhood.
3. Dropping rows leaves gaps in the index values. This is fixed by resetting the index and discarding the old one.

In [52]:
#--- treat empty strings and the 'Not assigned' placeholder as missing (NaN) ---
df.replace(['', 'Not assigned'], np.nan, inplace=True)

#--- discard every row that has no Borough ---
df.dropna(axis='index', subset=['Borough'], inplace=True)

#--- renumber the rows from 0; drop=True discards the old index rather than keeping it as a column ---
df.reset_index(drop=True, inplace=True)

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Shape of the data frame

In [51]:
# Dimensions of df as a (rows, columns) tuple
df.shape

(103, 3)