# ACNiT: Analysing and Clustering Neighbourhoods in Toronto
### Part 1: Scraping and cleaning data

In [2]:
import urllib.request
import pandas as pd
import numpy as np

In [5]:
pip install BeautifulSoup4


Note: you may need to restart the kernel to use updated packages.


In [6]:
from bs4 import BeautifulSoup

## 1) Scraping the data from the Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [7]:
# Address of the Wikipedia page that holds the postal-code table to scrape.
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [8]:
# Query the website and store the returned HTML in the variable 'page'.
# The body is read into bytes inside a context manager so that the response
# is always closed, and so that 'page' can be parsed again on a re-run (a raw
# response object can only be consumed once). The timeout stops the cell from
# hanging indefinitely if the server does not answer.
with urllib.request.urlopen(wiki, timeout=30) as response:
    page = response.read()

In [9]:
# Parse the HTML in the 'page' variable and store it in Beautiful Soup format.
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed, and so bs4 does not warn that no
# parser was specified. 'html.parser' ships with the Python standard library.
soup = BeautifulSoup(page, 'html.parser')

In [7]:
# Optional sanity check: uncomment to inspect the parsed page and the located table
#print(soup.prettify())
#right_table=soup.find('table', class_= "wikitable sortable")
#right_table

### 1-1) Extracting the right information from the soup and forming it into an array

In [10]:
# Extract the table from the parsed page: header names go into 'columns',
# the row contents go into 'data'.

# Find the table in the web page. Fail with a clear message if it is not
# there, instead of an AttributeError on None further down.
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the page")

# Names of the table's columns, taken from its header cells.
columns = [th.get_text(strip=True) for th in right_table.find_all('th')]

# Table contents as a list of 3-tuples. Rows that do not have exactly three
# data cells (the header row, malformed rows) are skipped by the length check,
# so no blanket exception handler is needed to tolerate them.
data = []
for row in right_table.find_all('tr'):
    cols = row.find_all('td')
    if len(cols) == 3:
        data.append(tuple(cell.text.strip() for cell in cols))

# Convert the rows to an array and report how many were collected.
data = np.asarray(data)
print(len(data))

288


### 1-2) Creating the dataframe from the array and renaming the columns and the index:

In [11]:
# Build the dataframe from the scraped rows and label its columns with the
# header names collected from the table.
df_data = pd.DataFrame(data)
df_data.columns = columns

# Name the index 'index'.
df_data.index.name = 'index'

# Preview the first ten rows.
df_data.head(10)


Unnamed: 0_level_0,Postcode,Borough,Neighbourhood
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### 1-3) Cleaning the data in the dataframe

In [12]:
# Keep only the rows whose Borough is assigned, then renumber the rows from 0.
df_data = (
    df_data[df_data['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
df_data.head(10)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [13]:
# Collapse rows that share a postcode into a single row.

def _join_values(values):
    """Concatenate the values of one group into a ' , '-separated string."""
    return " , ".join(values)

# Both columns are combined with the same rule; the two names are aliases.
combine_data_neigh = _join_values
combine_data_Boro = _join_values

df_combinedData = (
    df_data
    .groupby('Postcode')
    .agg({'Borough': combine_data_Boro, 'Neighbourhood': combine_data_neigh})
    .reset_index()
)
df_combinedData.head(20)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,"Scarborough , Scarborough","Rouge , Malvern"
1,M1C,"Scarborough , Scarborough , Scarborough","Highland Creek , Rouge Hill , Port Union"
2,M1E,"Scarborough , Scarborough , Scarborough","Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,"Scarborough , Scarborough , Scarborough","East Birchmount Park , Ionview , Kennedy Park"
7,M1L,"Scarborough , Scarborough , Scarborough","Clairlea , Golden Mile , Oakridge"
8,M1M,"Scarborough , Scarborough , Scarborough","Cliffcrest , Cliffside , Scarborough Village West"
9,M1N,"Scarborough , Scarborough","Birch Cliff , Cliffside West"


In [14]:
# Clean the Borough column: after aggregation a postcode with several rows
# holds its borough repeated ("X , X , X"); keep only the first occurrence.
# The surrounding whitespace is stripped as well, because splitting
# "X , X" on ',' leaves "X " with a trailing space, which would make the
# borough of a multi-row postcode compare unequal to the same borough on a
# single-row postcode.
df_combinedData['Borough'] = (
    df_combinedData['Borough']
    .str.split(',')
    .str[0]
    .str.strip()
)
df_combinedData.tail(30)

Unnamed: 0,Postcode,Borough,Neighbourhood
73,M6C,York,Humewood-Cedarvale
74,M6E,York,Caledonia-Fairbanks
75,M6G,Downtown Toronto,Christie
76,M6H,West Toronto,"Dovercourt Village , Dufferin"
77,M6J,West Toronto,"Little Portugal , Trinity"
78,M6K,West Toronto,"Brockton , Exhibition Place , Parkdale Village"
79,M6L,North York,"Downsview , North Park , Upwood Park"
80,M6M,York,"Del Ray , Keelesdale , Mount Dennis , Silverthorn"
81,M6N,York,"The Junction North , Runnymede"
82,M6P,West Toronto,"High Park , The Junction South"


In [15]:
# A postcode whose Neighbourhood is 'Not assigned' takes the name of its
# Borough instead. Done as one label-aligned update so it does not rely on
# the row labels being 0..n-1, as the mixed iloc/at row loop did.
neighbourhood_missing = df_combinedData['Neighbourhood'] == 'Not assigned'
df_combinedData.loc[neighbourhood_missing, 'Neighbourhood'] = (
    df_combinedData.loc[neighbourhood_missing, 'Borough']
)
df_combinedData.tail(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
83,M6R,West Toronto,"Parkdale , Roncesvalles"
84,M6S,West Toronto,"Runnymede , Swansea"
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
88,M8V,Etobicoke,"Humber Bay Shores , Mimico South , New Toronto"
89,M8W,Etobicoke,"Alderwood , Long Branch"
90,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
91,M8Y,Etobicoke,"Humber Bay , King's Mill Park , Kingsway Park ..."
92,M8Z,Etobicoke,"Kingsway Park South West , Mimico NW , The Que..."


In [16]:
# Dimensions of the combined dataframe as (rows, columns).
df_combinedData.shape

(103, 3)

# Part 2) Add coordinates to the dataframe

### 2-1) Retrieve the geographical coordinates of each postal code from the provided link

In [17]:

# Load the geospatial CSV from the provided link into a dataframe.
# The file's own header row supplies the column names.
filename = "http://cocl.us/Geospatial_data"
geographical_cordination = pd.read_csv(filename)
geographical_cordination.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 2-2) Add coordinates to the dataframe

In [18]:
# Add coordinates to the dataframe by looking each postcode up in the
# geospatial table, rather than copying the columns across by row position.
# Positional assignment is only correct while both frames have the same rows
# in the same order; a key lookup stays correct if either one is re-sorted,
# filtered, or has extra rows.
# A postcode with no entry in the geospatial table gets NaN coordinates.
coords_by_postcode = geographical_cordination.set_index('Postal Code')
df_combinedData['Latitude'] = df_combinedData['Postcode'].map(coords_by_postcode['Latitude'])
df_combinedData['Longitude'] = df_combinedData['Postcode'].map(coords_by_postcode['Longitude'])
df_combinedData.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [19]:
# Dimensions of the dataframe after the coordinate columns were added, as (rows, columns).
df_combinedData.shape

(103, 5)