<h1><center>Segmenting and Clustering Neighborhoods in Toronto - Part 1</center></h1>

## import libraries

In [1]:
import pandas as pd 
import numpy as np 
from bs4 import BeautifulSoup
import requests

## scraping table from wikipedia

### get the page from wikipedia and parse it into an object

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' # page that holds the postal code table

# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

html_content = response.text # raw HTML of the page
wiki = BeautifulSoup(html_content, 'lxml') # parsed document, bound to 'wiki'

### keep only the table part of the page

In [3]:
# Locate the first <table> element whose class is "wikitable".
postal_table = wiki.find('table', class_='wikitable')

# Collect every <tr> under the table body: the header row plus all data rows.
postal_table_data = postal_table.tbody.find_all('tr')

### get table header as a list of column names

In [4]:
# The first <tr> carries the <th> header cells. For each one keep only the text,
# turn embedded newlines into spaces and trim whitespace on both sides.
header_cells = postal_table_data[0].find_all('th')
headings = [cell.text.replace('\n', ' ').strip() for cell in header_cells]

print(headings)

['Postal Code', 'Borough', 'Neighborhood']


### get table body

In [5]:
# Skip the header row, then turn every remaining <tr> into a list of its cleaned
# <td> texts (newlines replaced by spaces, surrounding whitespace trimmed).
body = [
    [cell.text.replace('\n', ' ').strip() for cell in table_row.find_all('td')]
    for table_row in postal_table_data[1:]
]

trt_raw = pd.DataFrame(body, columns=headings) # one dataframe row per table row
trt_raw.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## cleaning dataframe

### deal with borough column and prepare neighborhood column for further processing

In [6]:
# Chain the cleaning steps so each one returns a new frame; assigning a column
# directly on a boolean-filtered frame can raise SettingWithCopyWarning.
trt_raw = (
    trt_raw[trt_raw['Borough'] != 'Not assigned'] # drop rows whose borough is 'Not assigned'
    # Split on commas together with any whitespace around them, so that
    # 'Regent Park, Harbourfront' becomes ['Regent Park', 'Harbourfront'] and the
    # second name does not keep a leading space.
    .assign(Neighborhood=lambda frame: frame['Neighborhood'].str.split(r'\s*,\s*'))
    .reset_index(drop=True) # renumber the remaining rows from 0 after the filter
)
trt_raw.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,[Parkwoods]
1,M4A,North York,[Victoria Village]
2,M5A,Downtown Toronto,"[Regent Park, Harbourfront]"
3,M6A,North York,"[Lawrence Manor, Lawrence Heights]"
4,M7A,Downtown Toronto,"[Queen's Park, Ontario Provincial Government]"


### processing neighborhood column

In [7]:
trt_list = [] # flattened rows: one [postal code, borough, neighborhood] entry per neighborhood

# Walk the three columns in parallel, one source row per iteration.
for code, borough, hoods in zip(trt_raw['Postal Code'], trt_raw['Borough'], trt_raw['Neighborhood']):
    if len(hoods) > 1:
        # several neighborhoods share this postal code: emit one row for each of them
        trt_list.extend([code, borough, hood] for hood in hoods)
    elif hoods[0] == 'Not assigned':
        # a lone 'Not assigned' neighborhood falls back to the borough name
        trt_list.append([code, borough, borough])
    else:
        # exactly one neighborhood in this row
        trt_list.append([code, borough, hoods[0]])

toronto = pd.DataFrame(trt_list, columns=headings) # assemble the flattened rows into a dataframe

## final dataframe

### let's see what the final dataframe looks like

In [8]:
# Preview the first rows of the flattened frame (one row per postal code / neighborhood pair).
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Manor


In [9]:
# (number of rows, number of columns) of the final frame
toronto.shape

(217, 3)