## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import requests
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error (4xx/5xx) into an exception right here
# instead of letting an error page be parsed later as if it were the table.
post_codes_canada = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
post_codes_canada.raise_for_status()
print(post_codes_canada) # Response [200] means we successfully got response from the website for our request

<Response [200]>


In [2]:
# Preview the raw HTML: only the first 500 characters, the full page is far too long to print
html_preview = post_codes_canada.text[:500]
print(html_preview)


<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"528632b9-cea7-47f1-b2


#### We need to process this using BeautifulSoup
* This is a dynamic process

* It needs to be monitored in each step so we get what we require

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Parse the downloaded HTML with the lxml parser so the page can be searched by tag
soup_post_codes_canada = BeautifulSoup(
    markup=post_codes_canada.text,
    features='lxml',
)

#### Now we need to visit the website and check what we need, so that we can find it in the soup
* If you are using Chrome as your browser, right-click and select 'View page source'
* Press Ctrl+F and find 'Postal Code' in the tbody
* We do not need to scrape the headers 'Postal Code', 'Borough', 'Neighborhood', only the contents of these columns

In [13]:
# The first value we need is 'M1A' -- search for it (Ctrl+F) in the page source.
# Walking outwards from 'M1A', the enclosing tags are:
#   1. <td>    (closest to 'M1A')
#   2. <tr>
#   3. <tbody>
# So these are the tags to look for in the soup.
# Once reviewed, it is better to leave the exploratory call below commented out:

# soup_post_codes_canada.find('tbody')

# Next, find the first <tr> tag inside the <tbody> tag
first_tbody = soup_post_codes_canada.find('tbody')
first_tbody.find('tr')
# This is the header row; it turns out we don't need it

<tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>

In [32]:
# Look at all the <tr> tags inside <tbody>.
# The full result is long, so only the first three rows are displayed.
table_rows = soup_post_codes_canada.find('tbody').find_all('tr')
table_rows[:3]
# The data we need starts at the second <tr> tag

[<tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighborhood
 </th></tr>,
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>,
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>]

In [15]:
# Items in the second <tr> tag (index 1)
second_row = soup_post_codes_canada.find('tbody').find_all('tr')[1]
second_row

<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>

In [16]:
# Similarly, adding .find_all('td') on top of this returns every <td> cell of
# that <tr> (whereas .find('td') would return only the first one)
second_row = soup_post_codes_canada.find('tbody').find_all('tr')[1]
second_row.find_all('td')

[<td>M1A
 </td>,
 <td>Not assigned
 </td>,
 <td>Not assigned
 </td>]

In [28]:
# Print the first <td> tag of the row as-is; the following cells extract its text
first_cell = soup_post_codes_canada.find('tbody').find_all('tr')[1].find_all('td')[0]
print(first_cell)


<td>M1A
</td>


In [29]:
# The raw text of the cell still carries the trailing newline
first_cell = soup_post_codes_canada.find('tbody').find_all('tr')[1].find_all('td')[0]
first_cell.get_text()

'M1A\n'

In [30]:
# Dropping the last character removes the trailing newline
first_cell = soup_post_codes_canada.find('tbody').find_all('tr')[1].find_all('td')[0]
first_cell.text[:-1]

'M1A'

In [33]:
# Print the three elements of the first data row and confirm them on the website
# Website is: 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
first_row_cells = soup_post_codes_canada.find('tbody').find_all('tr')[1].find_all('td')
for column_index in range(3):
    print(first_row_cells[column_index].text[:-1])

M1A
Not assigned
Not assigned


In [47]:
# Check the last row and confirm it against the website.
# The rows are looked up once; the row count is still printed because it is
# useful to know how many elements there are.
table_rows = soup_post_codes_canada.find('tbody').find_all('tr')
print(len(table_rows))
print('*******')
# We know we do not need the first row (or 0th row)
print('First row is:',table_rows[1])
print('*******')
# Index -1 always addresses the last row; a hard-coded position raises
# IndexError (or picks the wrong row) as soon as the page's table changes size.
print('Last row is:',table_rows[-1])

181
*******
First row is: <tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
*******
Last row is: <tr>
<td>M9Z
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>


In [36]:
# Print the three elements of the last row.
# Index -1 is used rather than a hard-coded row position so the cell keeps
# working (and keeps pointing at the last row) if the table changes size.
last_row_cells = soup_post_codes_canada.find('tbody').find_all('tr')[-1].find_all('td')
for column_index in range(3):
    print(last_row_cells[column_index].text[0:-1])

M9Z
Not assigned
Not assigned


In [37]:
import pandas as pd
import numpy as np

In [50]:
# Every table row except the header row (index 0).
# The cleaning loop in the next cell iterates over `all_records`, so the rows
# must be bound to that name for the notebook to run on a fresh kernel.
all_records = soup_post_codes_canada.find('tbody').find_all('tr')[1:]

In [75]:
# Collect (postal code, borough, neighborhood) for every row with an assigned borough.
# Rows whose Borough is 'Not assigned' are ignored; when only the Neighborhood is
# 'Not assigned', the Borough name is used as the Neighborhood.
all_records_postcode = []
for row in all_records:
    # Read the row's cells once. strip() removes the surrounding whitespace
    # (including the trailing newline) without chopping a real character off a
    # cell that happens not to end in a newline.
    cells = [cell.text.strip() for cell in row.find_all('td')]
    borough = cells[1]
    if borough == 'Not assigned':
        continue
    postalcode = cells[0]
    neighborhood = borough if cells[2] == 'Not assigned' else cells[2]
    all_records_postcode.append((postalcode, borough, neighborhood))

In [76]:
# Display the first five collected records so they can be confirmed against the website
# (the last five are displayed in the next cell)
all_records_postcode[:5]

[('M3A', 'North York', 'Parkwoods'),
 ('M4A', 'North York', 'Victoria Village'),
 ('M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'),
 ('M6A', 'North York', 'Lawrence Manor, Lawrence Heights'),
 ('M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government")]

In [72]:
# Display the last five collected records so they can be confirmed against the website
all_records_postcode[-5:]

[('M8X', 'Etobicoke', 'The Kingsway, Montgomery Road, Old Mill North'),
 ('M4Y', 'Downtown Toronto', 'Church and Wellesley'),
 ('M7Y',
  'East Toronto',
  'Business reply mail Processing Centre, South Central Letter Processing Plant Toronto'),
 ('M8Y',
  'Etobicoke',
  "Old Mill South, King's Mill Park, Sunnylea, Humber Bay, Mimico NE, The Queensway East, Royal York South East, Kingsway Park South East"),
 ('M8Z',
  'Etobicoke',
  'Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West')]

In [77]:
# Turn the list of (postal code, borough, neighborhood) tuples into a dataframe,
# one column per tuple position
df_postcode = pd.DataFrame(
    data=all_records_postcode,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

In [78]:
# Preview the first 12 rows of the dataframe
df_postcode.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [79]:
# (rows, columns) of the dataframe: one row per kept postal code, three columns
df_postcode.shape

(103, 3)