<h1 align='center'>Segmenting and Clustering Neighborhoods in Toronto - Part 1</h1>

## Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
from bs4 import BeautifulSoup
import requests

## Scraping Table From Wikipedia

#### Get the page from Wikipedia and parse it into an object

In [2]:
# Wikipedia page listing the postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the HTML into a BeautifulSoup object named 'wiki' (uses the lxml parser)
wiki = BeautifulSoup(html_content, 'lxml')

#### Keep only the table part of the page

In [3]:
# Find the first table whose class attribute includes 'wikitable'
postal_table = wiki.find('table', attrs={'class': 'wikitable'})
if postal_table is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise ValueError("no table with class 'wikitable' was found on the page")

# .tbody is None when the markup has no explicit <tbody>; in that case search
# the table element itself.
rows_container = postal_table.tbody if postal_table.tbody is not None else postal_table

# All <tr> tags: the header row followed by every data row
postal_table_data = rows_container.find_all('tr')

#### get table header as a list of column names

In [4]:
# Column names come from the <th> cells of the table's first row. Newlines
# inside a cell are turned into spaces and surrounding whitespace is trimmed.
header_row = postal_table_data[0]
headings = [cell.text.replace('\n', ' ').strip() for cell in header_row.find_all('th')]

print(headings)

['Postal Code', 'Borough', 'Neighborhood']


#### get table body

In [5]:
# Build one list of cleaned cell texts per table row. postal_table_data[0] is
# the header row (already used for the column names), so start at the second <tr>.
body = []
for tr in postal_table_data[1:]:
    # Text of every <td> in this row: newlines become spaces, edges are trimmed
    cells = [td.text.replace('\n', ' ').strip() for td in tr.find_all('td')]
    if cells:
        # Rows without any <td> would otherwise become all-None rows in the frame
        body.append(cells)

# Assemble the rows into a dataframe, using the scraped headings as column names
toronto = pd.DataFrame(body, columns=headings)
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Cleaning Dataframe

In [6]:
# Drop rows whose borough is 'Not assigned' and renumber the index from 0.
# Chaining reset_index (rather than inplace=True) yields a fresh frame, so the
# assignment below does not write into a slice of the original.
toronto = toronto[toronto['Borough'] != 'Not assigned'].reset_index(drop=True)

# Where a borough is present but the neighborhood is 'Not assigned',
# the neighborhood takes the borough's name (done for all such rows at once).
unassigned = toronto['Neighborhood'] == 'Not assigned'
toronto.loc[unassigned, 'Neighborhood'] = toronto.loc[unassigned, 'Borough']

## Final Dataframe

#### Let's see what the final dataframe looks like

In [7]:
# Display the first rows of the cleaned dataframe
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# Export the dataframe to 'Toronto.csv' for use in another notebook;
# the index is not written, and the column names go in as the header row.
toronto.to_csv('Toronto.csv', index=False, header=True)

In [9]:
# Report how many rows the dataframe holds
print('number of rows:', len(toronto))

number of rows: 103
