# Segmenting and Clustering Neighborhoods in Toronto part 1

Extracting the  data of Toronto neighborhoods from Wikipedia.The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood.Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import lxml.html as lh

In [2]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#Create a handle, page, to handle the contents of the website
page = requests.get(url)

#Store the contents of the website under doc
doc = lh.fromstring(page.content)

#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [3]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [4]:
# Parse the first row as our header
tr_elements = doc.xpath('//tr')

#Create empty list
col=[]
i=0

#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postal code
"
2:"Borough
"
3:"Neighborhood
"


In [5]:

#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [6]:
# Check the length of each column. Ideally, they should all be the same
[len(C) for (title,C) in col]

[181, 181, 181]

In [7]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [8]:
df.head(14)

Unnamed: 0,Postal code\n,Borough\n,Neighborhood\n
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n
5,M6A\n,North York\n,Lawrence Manor / Lawrence Heights\n
6,M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government\n
7,M8A\n,Not assigned\n,\n
8,M9A\n,Etobicoke\n,Islington Avenue\n
9,M1B\n,Scarborough\n,Malvern / Rouge\n


In [9]:
#Cleaning the messy string in the Borough column
df = df.replace('\n',' ', regex=True)
df.head()

Unnamed: 0,Postal code\n,Borough\n,Neighborhood\n
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [10]:
#Dropping all cells with a borough that is Not assigned
df.drop(df.index[df['Borough\n'] == 'Not assigned'], inplace = True)

# Reset the index and dropping the previous index
df = df.reset_index(drop=True)

df.head(10)

Unnamed: 0,Postal code\n,Borough\n,Neighborhood\n
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


In [11]:
df = df.groupby(['Postal code\n', 'Borough\n'])['Neighborhood\n'].apply(','.join).reset_index()
df.columns = ['Postal code\n','Borough\n','Neighborhood\n']
df.head(10)

Unnamed: 0,Postal code\n,Borough\n,Neighborhood\n
0,,Canadian postal codes,
1,M1A,Not assigned,
2,M1B,Scarborough,Malvern / Rouge
3,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
4,M1E,Scarborough,Guildwood / Morningside / West Hill
5,M1G,Scarborough,Woburn
6,M1H,Scarborough,Cedarbrae
7,M1J,Scarborough,Scarborough Village
8,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
9,M1L,Scarborough,Golden Mile / Clairlea / Oakridge


In [12]:
#Removing any space in the start of the string
df['Neighborhood\n'] = df['Neighborhood\n'].str.strip()

In [13]:
#Assigning Borough values to the Neignbourhood where vlaue is "Not assigned"¶
df.loc[df['Neighborhood\n'] == 'Not assigned', 'Neighborhood\n'] = df['Borough\n']

In [14]:
df.shape

(181, 3)

In [16]:
df.to_csv(r'df_first.csv')