### Import libraries

In [197]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import io
import numpy as np
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Read the Wikipedia URL

In [116]:
#Get the document
wikiURL='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(wikiURL).text
wikiText = BeautifulSoup(source, 'lxml')

### Read the codes HTML table into a dataframe

In [193]:
table = wikiText.find('table')
table_rows = table.find_all('tr')

l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)

l.pop(0)
td = pd.DataFrame(l, columns=["PostalCode", "Borough","Neighborhood"])
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
pCodes = td[(td.Borough != 'Not assigned') & (td.Borough != '')]
pCodes = pCodes.reset_index(drop ='True')

#Clean values
pCodes['PostalCode']= pCodes['PostalCode'].str.strip()
pCodes['Borough']= pCodes['Borough'].str.strip()
pCodes['Neighborhood']= pCodes['Neighborhood'].str.strip()

### Format the dataframe

In [194]:
# Order the dataset to process
pCodesSort = pCodes.sort_values(by=['PostalCode','Borough','Neighborhood'])

#Create empty DataFrame
pCodesFinal = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood'])

#Loop thru the original DataSet to group data
for index, row in pCodes.iterrows():
    cPCode = row['PostalCode']
    cBorough = row['Borough']
    cNeigh = row['Neighborhood']
    
    #If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
    if cNeigh=='Not assigned':
        cNeigh = cBorough
    
    if index==0: #First iteration
        pPCode = cPCode
        pBorough = cBorough
        pNeigh = cNeigh
    else:
        if cPCode != pPCode:
            #print( pPCode, pBorough, pNeigh)
            pCodesFinal.loc[index-1] = [pPCode , pBorough , pNeigh] #flush into the new DataSet
            pPCode = cPCode
            pBorough = cBorough
            pNeigh = cNeigh
        else:
            #These two rows will be combined into one row with the neighborhoods separated with a comma 
            pNeigh = pNeigh +", "+ cNeigh

pCodesFinal.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Harbourfront, Regent Park"
5,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Rouge, Malvern"
10,M3B,North York,Don Mills North
12,M4B,East York,"Woodbine Gardens, Parkview Hill"
14,M5B,Downtown Toronto,"Ryerson, Garden District"


In [219]:
pCodesFinal.shape

(102, 3)