# Notebook for Toronto City clustering project

##### We will work on extracting borough & neighbourhood information + cleaning + clustering & visualising with Folium

#### 1. Establish environment

In [2]:
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re

#### 2. Ignore SSL errors

In [3]:
cntxt = ssl.create_default_context()
cntxt.check_hostname = False
cntxt.verify_mode = ssl.CERT_NONE

#### 3. Obtain URL

In [4]:
url = input('Please enter the website to obtain data from: ')
if len(url) < 1: url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Please enter the website to obtain data from:  


In [5]:
print('You want data from >>>\n', url, '\n<<<')

You want data from >>>
 https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 
<<<


#### 4. Open and parse the url

In [6]:
#Use a file-handle like object to open the url
html = urlopen(url, context= cntxt).read() #Read slurps everything in #Note that is additional function written at end

#Use BeautifulSoup to parse
soup = BeautifulSoup(html, 'html.parser')

#### 5. Explore

In [7]:
type(soup)

bs4.BeautifulSoup

In [8]:
#Try retrieving 'tr' tags
#print(soup)
tags = soup('tr')
print('Total tags extracted: ', len(tags),'\n')

#Look at the tag extracted
count = 0 #Initialise counter to count iterations & if rqd, help break out of loop
dict = {} #Initialise empty dictionary to store postcode as KEY, boroughs and neighbourhoods as VALUES
list = [] #Initialise empty list to hold the borough + neighbourhood info

#Loop through the tags
for i in tags:
    count = count + 1 #Increase counter value beginning through each iteration
    if count == 290: break #For limiting output and stopping the loop from running + length of tags for some reason does not correspond to actual number of elements
    
    #Skip if Borough is Not Assigned
    if i.contents[3].text == 'Not assigned':
        continue
        
    #Skip if text is Postcode
    if i.contents[1].text == 'Postcode':
        continue
    
    #If pincode already exists, eg M5A, M6A, append new data
    if i.contents[1].text in dict: 
        
        # Append the new data to the existing array at this slot with the following NEW SYNTAX
        #dict[existing_key].append(value)
        dict[i.contents[1].text].append(i.contents[5].text.rstrip())
        
    #If borough exists but not the neighbourhood, run the following    
    elif re.search('[a-z]', i.contents[3].text) and re.search('No.*', i.contents[5].text):
        list.append(i.contents[3].text), list.append(i.contents[3].text)
        dict[i.contents[1].text] = list
    else:
        list.append(i.contents[3].text)
        list.append(i.contents[5].text.rstrip())#rstrip gets rid of newline char.
        dict[i.contents[1].text] = list
    
    #Would need to reset the list to empty after each round otherwise each successive iteration will bloat up the key-value
    list = []
    

#print(dict)

print('\nTotal post-codes with borough info:', len(dict))

Total tags extracted:  294 


Total post-codes with borough info: 103


#### 6. Convert to pandas dataframe

In [48]:
#6.1 Import the library
import pandas as pd

In [49]:
#6.2 Convert dictionary to dataframe
data = pd.DataFrame.from_dict(dict, orient= 'index')#index when keys are row labels

In [50]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
M9A,Etobicoke,Islington Avenue,,,,,,,
M5A,Downtown Toronto,Harbourfront,Regent Park,,,,,,
M1N,Scarborough,Birch Cliff,Cliffside West,,,,,,
M5X,Downtown Toronto,First Canadian Place,Underground city,,,,,,
M2J,North York,Fairview,Henry Farm,Oriole,,,,,


In [51]:
#6.3 Assign row name to postcodes
data.index.name = 'PostalCode'
data.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
M9A,Etobicoke,Islington Avenue,,,,,,,
M5A,Downtown Toronto,Harbourfront,Regent Park,,,,,,
M1N,Scarborough,Birch Cliff,Cliffside West,,,,,,
M5X,Downtown Toronto,First Canadian Place,Underground city,,,,,,
M2J,North York,Fairview,Henry Farm,Oriole,,,,,


In [52]:
#6.4 Reset index
data.reset_index(inplace= True)
data.head()

Unnamed: 0,PostalCode,0,1,2,3,4,5,6,7,8
0,M9A,Etobicoke,Islington Avenue,,,,,,,
1,M5A,Downtown Toronto,Harbourfront,Regent Park,,,,,,
2,M1N,Scarborough,Birch Cliff,Cliffside West,,,,,,
3,M5X,Downtown Toronto,First Canadian Place,Underground city,,,,,,
4,M2J,North York,Fairview,Henry Farm,Oriole,,,,,


In [53]:
#6.5 Set column 2 i.e. after Postcode to Borough
data.columns.values[1] = 'Borough'#We need to drill down to the array, accessed with .values
data.head()

Unnamed: 0,PostalCode,Borough,1,2,3,4,5,6,7,8
0,M9A,Etobicoke,Islington Avenue,,,,,,,
1,M5A,Downtown Toronto,Harbourfront,Regent Park,,,,,,
2,M1N,Scarborough,Birch Cliff,Cliffside West,,,,,,
3,M5X,Downtown Toronto,First Canadian Place,Underground city,,,,,,
4,M2J,North York,Fairview,Henry Farm,Oriole,,,,,


In [54]:
#6.6 Convert columns to list form so as to remove hidden characters that interfer with column extraction by name 
data.columns = data.columns.tolist()
data[['Borough']].head() #data[[5]] for the column numbered 5

Unnamed: 0,Borough
0,Etobicoke
1,Downtown Toronto
2,Scarborough
3,Downtown Toronto
4,North York


In [55]:
#6.7 Remove None values
## It is vital to try this in the beginning when only specific cells have None
### It becomes a bit complex if we try to remove None after merging, as each cell then has several values 

#data_sorted = data.apply(sorted,key=pd.isnull)
#data_sorted.head()
data_clean = data[~pd.isnull(data).all(1)].fillna('')
data_clean.head()

Unnamed: 0,PostalCode,Borough,1,2,3,4,5,6,7,8
0,M9A,Etobicoke,Islington Avenue,,,,,,,
1,M5A,Downtown Toronto,Harbourfront,Regent Park,,,,,,
2,M1N,Scarborough,Birch Cliff,Cliffside West,,,,,,
3,M5X,Downtown Toronto,First Canadian Place,Underground city,,,,,,
4,M2J,North York,Fairview,Henry Farm,Oriole,,,,,


In [56]:
#6.8 Concatenate the neighbourhood columns
## Obtain column position after which concatenation is to take place
source_col_loc = data_clean.columns.get_loc('Borough') # column position starts from 0
source_col_loc

#Create new column that will merge all neighbourhoods
data_clean['Neighbourhood'] = data_clean.iloc[:,source_col_loc+1:source_col_loc+8].apply(
   lambda x: ",".join(x.astype(str)), axis=1)
data_clean.head()

Unnamed: 0,PostalCode,Borough,1,2,3,4,5,6,7,8,Neighbourhood
0,M9A,Etobicoke,Islington Avenue,,,,,,,,"Islington Avenue,,,,,,"
1,M5A,Downtown Toronto,Harbourfront,Regent Park,,,,,,,"Harbourfront,Regent Park,,,,,"
2,M1N,Scarborough,Birch Cliff,Cliffside West,,,,,,,"Birch Cliff,Cliffside West,,,,,"
3,M5X,Downtown Toronto,First Canadian Place,Underground city,,,,,,,"First Canadian Place,Underground city,,,,,"
4,M2J,North York,Fairview,Henry Farm,Oriole,,,,,,"Fairview,Henry Farm,Oriole,,,,"


In [57]:
#6.9 Drop columns 1 through 8
import numpy as np
cols_to_remove = np.arange(1,9)
#print(cols_to_remove, '\n')

#Drop
data_clean.drop(cols_to_remove, axis=1, inplace = True)
#data_clean.head()

In [58]:
data_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M9A,Etobicoke,"Islington Avenue,,,,,,"
1,M5A,Downtown Toronto,"Harbourfront,Regent Park,,,,,"
2,M1N,Scarborough,"Birch Cliff,Cliffside West,,,,,"
3,M5X,Downtown Toronto,"First Canadian Place,Underground city,,,,,"
4,M2J,North York,"Fairview,Henry Farm,Oriole,,,,"


In [59]:
data_clean[data_clean['PostalCode'] == 'M5A']

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M5A,Downtown Toronto,"Harbourfront,Regent Park,,,,,"


In [44]:
data_clean.shape

(103, 3)