# Segmenting and Clustering Neighborhoods in Toronto

## Using lxml.html library

### Import Libraries

In [17]:
import requests
import lxml.html as lh
import pandas as pd

### Scrape Table Cells

In [18]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page. The timeout stops the cell from hanging forever on a
# stalled connection.
page = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page as if it
# were the postal-code table.
page.raise_for_status()
# Parse the raw bytes into an lxml element tree.
doc = lh.fromstring(page.content)
# Every <tr> element in the document, in document order (all tables, not
# only the postal-code one).
tr_elements = doc.xpath('//tr')


In [31]:
# Expected row width: the number of cells in the header row. This is taken
# from tr_elements directly because a list-comprehension variable does not
# exist outside the comprehension in Python 3.
table_size = len(tr_elements[0])
# Cell count of each of the first 12 rows. Kept as the last expression so
# the notebook actually displays it.
[len(row) for row in tr_elements[:12]]

### Parse Table Header

In [20]:
# Collect every <tr> of the parsed document again.
tr_elements = doc.xpath('//tr')
# col becomes a list of (header text, values list) pairs, one per header cell.
col = []
i = 0
# Walk the cells of the first row, numbering them from 1 for the printout.
for i, header_cell in enumerate(tr_elements[0], start=1):
    name = header_cell.text_content()
    print('%d: %s' % (i, name))
    # Each header starts with an empty list that a later cell fills.
    col.append((name, []))

1: Postcode
2: Borough
3: Neighbourhood



### Creating Pandas DataFrame

In [32]:
# The first <tr> is the header, so the data starts at the second row.

# Empty the column lists first so that re-running this cell does not append
# the same rows a second time.
for _, values in col:
    values.clear()

# A data row must have exactly one cell per header column; anything else
# could not be stored in col anyway.
n_cols = len(col)

for T in tr_elements[1:]:
    # tr_elements holds the rows of every table on the page. The first row
    # whose width differs from the header ends the table being read.
    if len(T) != n_cols:
        break

    # i is the column index of the current cell.
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # The first column is always kept as text; in the other columns a
        # purely numeric cell is stored as an int.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the original text.
                pass
        # Append the value to the list of the i'th column.
        col[i][1].append(data)

In [33]:
# Number of values collected per column; all entries should be equal.
[len(values) for _, values in col]

[289, 289, 289]

In [90]:
# Header titles are the raw text of the header cells and can carry a newline
# (the last one is 'Neighbourhood\n'). Remove it so the column names are as
# clean as the values.
Dict = {title.replace('\n', ''): column for (title, column) in col}
df = pd.DataFrame(Dict)

# Remove the newlines left inside the cell values as well.
df = df.replace('\n', '', regex=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Using BeautifulSoup library

In [44]:
import requests
import pandas as pd

In [45]:
# Download the page. The timeout stops the cell from hanging forever on a
# stalled connection.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Stop here on a 4xx/5xx answer instead of handing an error page to the parser.
response.raise_for_status()
# NOTE: despite its name, `url` holds the HTML text of the page, not an address.
url = response.text

In [46]:
from bs4 import BeautifulSoup

# `url` holds the downloaded HTML text; parse it with the lxml backend.
soup = BeautifulSoup(markup=url, features='lxml')

In [47]:
# First <table> whose class attribute is exactly 'wikitable sortable'.
My_table = soup.find(name='table', attrs={'class': 'wikitable sortable'})

In [48]:
# All <tr> rows of the selected table (header row included). find_all is the
# current name of the deprecated findAll alias.
links = My_table.find_all('tr')

In [84]:
# Rows of the table found with BeautifulSoup (header row first).
tr_elements = links
# Number of <tr> rows in the table, header included.
print(len(tr_elements))

# Build a fresh column store from this table's own header row so the result
# comes from the BeautifulSoup parse and not from values left behind by an
# earlier cell. col is a list of (header text, values list) pairs.
col = [(cell.get_text(), [])
       for cell in tr_elements[0].find_all(['th', 'td'], recursive=False)]

# Expected row width: one cell per header column.
table_size = len(col)

# The first row is the header, so the data starts at the second row.
for row in tr_elements[1:]:
    # Only the direct <td>/<th> children are the cells of this row.
    cells = row.find_all(['td', 'th'], recursive=False)

    # A row with a different width does not belong to this table layout;
    # stop reading there.
    if len(cells) != table_size:
        break

    # i is the column index of the current cell.
    for i, cell in enumerate(cells):
        data = cell.get_text()
        # The first column is always kept as text; in the other columns a
        # purely numeric cell is stored as an int.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the original text.
                pass
        # Append the value to the list of the i'th column.
        col[i][1].append(data)


290


In [87]:
# Map each header title to its list of collected values.
Dict = dict(col)

# Build the frame, drop the newline from the 'Neighbourhood\n' column name,
# then remove the newlines left inside the cell values.
df = (
    pd.DataFrame(Dict)
    .rename(columns={'Neighbourhood\n': 'Neighbourhood'})
    .replace('\n', '', regex=True)
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
