# Clustering of Toronto neighbourhoods

Importing relevant packages and scraping the Wikipedia page. With prettify we can identify the parts of the page that refer to the table and see how they are delimited.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(url)

In [None]:
soup = BeautifulSoup(r.content, 'html5lib') 
#print(soup.prettify())

In the next cell we isolate the table for the rest, to better understand what we need to do to isolate the information we need.

In [5]:
My_table = soup.find('table',{'class':'wikitable sortable'})
My_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In the next cell we extract the table and put it into a database.

In [6]:
table_df = pd.DataFrame(columns=["Code", "Borough", "Neighbourhood"])

for tr in soup.findAll('tr')[2:]:
    tds = tr.find_all('td')
    #print("Code: %s, Borough: %s, Neighbourhood: %s" % \
    #      (tds[0].text, tds[1].text, tds[2].text))
    table_df = table_df.append({
    "Code": tds[0].text,
    "Borough":  tds[1].text,
    "Neighbourhood": tds[2].text
      }, ignore_index=True)
    if tds[0].text == "M9Z":
        break
    #print("Code: %s, Borough: %s, Neighbourhood: %s" % \
     #     (tds[0].text, tds[1].text, tds[2].text))

The next command is to remove the \n line breaks.

In [7]:
table_df = table_df.replace('\n','', regex=True)
table_df.head()

Unnamed: 0,Code,Borough,Neighbourhood
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park


The next cell extracts all the rows with a defined value for "Borough", leaving the ones with "Not assigned" value out. 

In [8]:
table_df=table_df[(table_df[['Borough']]!='Not assigned').all(axis=1)]

In [9]:
table_df

Unnamed: 0,Code,Borough,Neighbourhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Not assigned
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern


Next cell is to check that I can extract values from the dataframe.

In [10]:
table_df.loc[table_df['Code'] == "M1B", "Borough"].values[0]

'Scarborough'

In this next cell the neighbourhoods of the table are grouped together by zone code, and separated by comma, as instructed.

In [11]:
table_s =table_df.groupby("Code")["Neighbourhood"].apply(list)
table_s[:]
table_grouped = table_df
table_grouped_df = pd.DataFrame(table_s)
table_grouped_df["Neighbourhood"]=table_grouped_df["Neighbourhood"].apply(lambda x: ",".join(x) if isinstance(x, list) else x)
#table_grouped_df=table_grouped_df.replace('','', regex=True)

table_grouped_df

Unnamed: 0_level_0,Neighbourhood
Code,Unnamed: 1_level_1
M1B,"Rouge,Malvern"
M1C,"Highland Creek,Rouge Hill,Port Union"
M1E,"Guildwood,Morningside,West Hill"
M1G,Woburn
M1H,Cedarbrae
M1J,Scarborough Village
M1K,"East Birchmount Park,Ionview,Kennedy Park"
M1L,"Clairlea,Golden Mile,Oakridge"
M1M,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,"Birch Cliff,Cliffside West"


We then need to reintroduce the "Borough" column in the dataframe, and swap them to put them in the correct order.

In [12]:
table_final = table_grouped_df
Borough_l = [table_df.loc[table_df['Code'] == cc, "Borough"].values[0] for cc in table_final.index]
Borough_l
table_final["Borough"]=Borough_l
#table_final.head()
#(lambda row: row.Cost -(row.Cost * 0.1), axis = 1) 

table_final

Unnamed: 0_level_0,Neighbourhood,Borough
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,"Rouge,Malvern",Scarborough
M1C,"Highland Creek,Rouge Hill,Port Union",Scarborough
M1E,"Guildwood,Morningside,West Hill",Scarborough
M1G,Woburn,Scarborough
M1H,Cedarbrae,Scarborough
M1J,Scarborough Village,Scarborough
M1K,"East Birchmount Park,Ionview,Kennedy Park",Scarborough
M1L,"Clairlea,Golden Mile,Oakridge",Scarborough
M1M,"Cliffcrest,Cliffside,Scarborough Village West",Scarborough
M1N,"Birch Cliff,Cliffside West",Scarborough


In [13]:
columnsTitles=["Borough","Neighbourhood"]
table_final=table_final.reindex(columns=columnsTitles)

table_final

Unnamed: 0_level_0,Borough,Neighbourhood
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


Last operation on the table is to copy the value from "Borough" on the "neighbourhood" field when "Not assigned".

In [14]:
table_final_b=table_final
for xx in range(len(table_final_b.Neighbourhood)):
    if table_final_b.Neighbourhood[xx]=="Not assigned":
        table_final_b.Neighbourhood[xx]=table_final_b.Borough[xx]
    #table_final.Neighbourhood == "Not assigned").sum()

In [15]:
table_final_b

Unnamed: 0_level_0,Borough,Neighbourhood
Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


In [17]:
table_final_b.shape

(103, 2)

Note that the zone code is used as index.

In [25]:
aaa= table_final_b.loc["M1G"]
aaa

Borough          Scarborough
Neighbourhood         Woburn
Name: M1G, dtype: object