# <b>Segmenting and Clustering Neighborhoods in Toronto</b>

## Importing required libraries and fetching the page

In [1]:
# Standard-library module used to open URLs
import urllib.request

# BeautifulSoup parses HTML and XML documents into a navigable tree
from bs4 import BeautifulSoup

# The web page to scrape: Wikipedia's list of Canadian postal codes starting with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Open the URL inside a `with` block so the HTTP response is closed as soon as
# parsing is finished instead of staying open for the life of the kernel.
# The parser ("lxml") is named explicitly so BeautifulSoup does not have to pick one.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

# Uncomment to print the parsed document with readable indentation:
#print(soup.prettify())

In [2]:
# Display the <title> element of the parsed page
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [3]:
# Display only the text contained in the <title> element
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

## Finding and Storing Table

In [4]:
# Collect every <table> element found in the parsed page
tbl=soup.find_all("table")
#tbl
# NOTE(review): the original note says this list holds a single table at index 0 — confirm against the live page

## Extracting Table data using 'td' 

In [5]:
# Take the first table found and gather all of its <td> cells
data= tbl[0].find_all('td')
#data

In [6]:
# Number of <td> cells collected from the table
l=len(data)
l

180

## Extract text and remove unnecessary data

In [7]:
# Pull the text out of each of the first `l` table cells, trimming
# surrounding whitespace such as "\n"
a = [data[idx].text.strip() for idx in range(l)]
a
#len(a)

['M1ANot assigned',
 'M2ANot assigned',
 'M3ANorth York(Parkwoods)',
 'M4ANorth York(Victoria Village)',
 'M5ADowntown Toronto(Regent Park / Harbourfront)',
 'M6ANorth York(Lawrence Manor / Lawrence Heights)',
 "M7AQueen's Park / Ontario Provincial Government",
 'M8ANot assigned',
 'M9AEtobicoke(Islington Avenue)',
 'M1BScarborough(Malvern / Rouge)',
 'M2BNot assigned',
 'M3BNorth York(Don Mills)North',
 'M4BEast York(Parkview Hill / Woodbine Gardens)',
 'M5BDowntown Toronto(Garden District, Ryerson)',
 'M6BNorth York(Glencairn)',
 'M7BNot assigned',
 'M8BNot assigned',
 'M9BEtobicoke(West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale)',
 'M1CScarborough(Rouge Hill / Port Union / Highland Creek)',
 'M2CNot assigned',
 'M3CNorth York(Don Mills)South(Flemingdon Park)',
 'M4CEast York(Woodbine Heights)',
 'M5CDowntown Toronto(St. James Town)',
 'M6CYork(Humewood-Cedarvale)',
 'M7CNot assigned',
 'M8CNot assigned',
 'M9CEtobicoke(Eringate / Bloordale Gardens / Old B

## Removing 'Not assigned' data groups

In [8]:
# Keep only the cells that describe an assigned postal code.
# Every element of `a` is tested, including the last one: a `range(l-1)`
# bound would drop the final element without ever checking it.
l = len(a)
b = [entry for entry in a if not entry.endswith('Not assigned')]
b
#len(b)

['M3ANorth York(Parkwoods)',
 'M4ANorth York(Victoria Village)',
 'M5ADowntown Toronto(Regent Park / Harbourfront)',
 'M6ANorth York(Lawrence Manor / Lawrence Heights)',
 "M7AQueen's Park / Ontario Provincial Government",
 'M9AEtobicoke(Islington Avenue)',
 'M1BScarborough(Malvern / Rouge)',
 'M3BNorth York(Don Mills)North',
 'M4BEast York(Parkview Hill / Woodbine Gardens)',
 'M5BDowntown Toronto(Garden District, Ryerson)',
 'M6BNorth York(Glencairn)',
 'M9BEtobicoke(West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale)',
 'M1CScarborough(Rouge Hill / Port Union / Highland Creek)',
 'M3CNorth York(Don Mills)South(Flemingdon Park)',
 'M4CEast York(Woodbine Heights)',
 'M5CDowntown Toronto(St. James Town)',
 'M6CYork(Humewood-Cedarvale)',
 'M9CEtobicoke(Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood)',
 'M1EScarborough(Guildwood / Morningside / West Hill)',
 'M4EEast Toronto(The Beaches)',
 'M5EDowntown Toronto(Berczy Park)',
 'M6EYork(Caledonia-Fa

## Extracting Postal codes 

(as per the available table)

In [9]:
# The first three characters of each entry are the postal code; the rest
# of the string is kept in `d` for the following cells.
postalcode = [entry[:3] for entry in b]
d = [entry[3:] for entry in b]
#d
#len(postalcode)
#len(d)
# print all postal codes on a single line
for code in postalcode:
    print(code, end=" ")

M3A M4A M5A M6A M7A M9A M1B M3B M4B M5B M6B M9B M1C M3C M4C M5C M6C M9C M1E M4E M5E M6E M1G M4G M5G M6G M1H M2H M3H M4H M5H M6H M1J M2J M3J M4J M5J M6J M1K M2K M3K M4K M5K M6K M1L M2L M3L M4L M5L M6L M9L M1M M2M M3M M4M M5M M6M M9M M1N M2N M3N M4N M5N M6N M9N M1P M2P M4P M5P M6P M9P M1R M2R M4R M5R M6R M7R M9R M1S M4S M5S M6S M1T M4T M5T M1V M4V M5V M8V M9V M1W M4W M5W M8W M9W M1X M4X M5X M8X M4Y M7Y M8Y M8Z 

## Extracting boroughs and neighbourhoods

#### 1) BOROUGHS

In [10]:
# Split every string in `d` at each "(" so the text before the first "("
# lands at position 0 of the resulting list
e = [text.split('(') for text in d]
#e
#len(e)

In [11]:
# The borough is the piece before the first "(" of each entry
bo = [parts[0] for parts in e]

#len(bo)
#bo
# print all boroughs on a single line
for name in bo:
    print(name, end=", ")

North York, North York, Downtown Toronto, North York, Queen's Park / Ontario Provincial Government, Etobicoke, Scarborough, North York, East York, Downtown Toronto, North York, Etobicoke, Scarborough, North York, East York, Downtown Toronto, York, Etobicoke, Scarborough, East Toronto, Downtown Toronto, York, Scarborough, East York, Downtown Toronto, Downtown Toronto, Scarborough, North York, North York, East York, Downtown Toronto, West Toronto, Scarborough, North York, North York, East YorkEast Toronto, Downtown Toronto, West Toronto, Scarborough, North York, North York, East Toronto, Downtown Toronto, West Toronto, Scarborough, North York, North York, East Toronto, Downtown Toronto, North York, North York, Scarborough, North York, North York, East Toronto, North York, York, North York, Scarborough, North York, North York, Central Toronto, Central Toronto, York, York, Scarborough, North York, Central Toronto, Central Toronto, West Toronto, Etobicoke, Scarborough, North York, Central T

#### 2) NEIGHBOURHOODS

(The entry at index 4, "Queen's Park / Ontario Provincial Government", has no neighbourhood in parentheses, so it is handled separately below.)

In [12]:
# The entry at index 4 ("Queen's Park / Ontario Provincial Government") has no
# "(...)" part, so its split result has no element 1 and must be left out here.
# It is skipped by index rather than deleted from `e`: `e` keeps its full
# length, and re-running this cell does not remove a further entry each time.
d2 = [parts[1] for idx, parts in enumerate(e) if idx != 4]
#d2
#len(d2)

In [13]:
# Put the neighbourhood text for the entry at index 4 back in, since d2 does
# not contain it. d3 is an outer list wrapping the single combined list.
d3 = [d2[:4] + ['Queen\'s Park / Ontario Provincial Government)'] + d2[4:]]
p = len(d3[0])
# d4 is a flat copy of the one list held inside d3
d4 = [d3[0][idx] for idx in range(p)]
#d4
#len(d4)

In [14]:
# Remove every ")" left over from the "borough(neighbourhood)" format
nh = [text.replace(')', '') for text in d4]
#nh
#len(nh)
# print all neighbourhoods on a single line
for name in nh:
    print(name, end=",   ")

Parkwoods,   Victoria Village,   Regent Park / Harbourfront,   Lawrence Manor / Lawrence Heights,   Queen's Park / Ontario Provincial Government,   Islington Avenue,   Malvern / Rouge,   Don MillsNorth,   Parkview Hill / Woodbine Gardens,   Garden District, Ryerson,   Glencairn,   West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale,   Rouge Hill / Port Union / Highland Creek,   Don MillsSouth,   Woodbine Heights,   St. James Town,   Humewood-Cedarvale,   Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood,   Guildwood / Morningside / West Hill,   The Beaches,   Berczy Park,   Caledonia-Fairbanks,   Woburn,   Leaside,   Central Bay Street,   Christie,   Cedarbrae,   Hillcrest Village,   Bathurst Manor / Wilson Heights / Downsview North,   Thorncliffe Park,   Richmond / Adelaide / King,   Dufferin / Dovercourt Village,   Scarborough Village,   Fairview / Henry Farm / Oriole,   Northwood Park / York University,   The Danforth  East,   Harbourfront East 

# CREATING THE DATAFRAME

In [15]:
import numpy as np
import pandas as pd

# Column-name -> values mapping used to build the frame. It gets its own name
# so that `data` (the list of <td> cells collected earlier) is not overwritten,
# which would break re-running the earlier extraction cells.
df_columns = {'PostalCode': postalcode,
              'Borough': bo,
              'Neighbourhood': nh}
df = pd.DataFrame(df_columns)
df


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Park / Ontario Provincial Government,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don MillsNorth
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## SIZE

In [16]:
# (number of rows, number of columns) of the assembled DataFrame
df.shape

(103, 3)