# Segmenting and Clustering Neighborhoods in Toronto

In [None]:
!pip install lxml
!pip install bs4

import requests
import pandas as pd
from bs4 import BeautifulSoup

## Step 1: Importing and cleaning the Toronto neighborhood data

### 1. Scraping the Wikipedia page with the BeautifulSoup package

In [328]:
# Download the Wikipedia page named in the URL below and locate the postal-code table.
# NOTE: `website_url` holds the page's HTML text, not a URL; the name is kept
# unchanged so any other cell that refers to it keeps working.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # requests never times out by default; avoid hanging the kernel on a stalled connection
)
response.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
website_url = response.text

soup = BeautifulSoup(website_url, 'lxml')
My_table = soup.find('table', {'class': 'wikitable sortable'})

# `find` returns None when nothing matches; stop with a clear message rather
# than letting a later cell fail with an AttributeError on None.
if My_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

### 2. Loading the HTML table into a pandas DataFrame - canada

In [329]:
# Walk the table rows once and collect the text of the first three cells of
# each row. The first <tr> is skipped (assumed to be the header row).
PostalCode = []
Borough = []
Neighbourhood = []
for row in My_table.find_all('tr')[1:]:
    cells = row.find_all('td')
    if len(cells) < 3:
        # Not a three-column data row; indexing it would raise IndexError.
        continue
    PostalCode.append(cells[0].text)
    Borough.append(cells[1].text)
    Neighbourhood.append(cells[2].text)

canada = pd.DataFrame({'PostalCode': PostalCode, 'Borough': Borough, 'Neighborhood': Neighbourhood})

# Remove newlines and surrounding whitespace from every column.
# `regex=False` makes the replacement a plain substring match, so it behaves
# the same regardless of the pandas version's default for `regex`.
for column in canada.columns:
    canada[column] = canada[column].str.replace('\n', '', regex=False).str.strip()

print(canada.head())
print('\n')
print(canada.describe())
print('\n')
print(canada['Borough'].value_counts())
print('\n')
print("Dataset Shape: ", canada.shape)

  PostalCode           Borough      Neighborhood
0        M1A      Not assigned      Not assigned
1        M2A      Not assigned      Not assigned
2        M3A        North York         Parkwoods
3        M4A        North York  Victoria Village
4        M5A  Downtown Toronto      Harbourfront


       PostalCode       Borough  Neighborhood
count         288           288           288
unique        180            12           209
top           M9V  Not assigned  Not assigned
freq            8            77            78


Not assigned        77
Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64


Dataset Shape:  (288, 3)


### 3. Cleaning data:
####    1) Delete the rows whose borough is Not assigned.
####    2) Combine the rows that share a postal code into one row, with the neighborhoods separated by commas.
####    3) If a row has a borough but a Not assigned neighborhood, set the neighborhood to the borough.

In [327]:
# 1) Drop the rows whose borough is 'Not assigned'.
canada = canada[canada['Borough'] != 'Not assigned'].reset_index(drop=True)

# 3) Where the neighborhood is 'Not assigned', use the borough name instead.
#    This runs before the merge below so the placeholder is replaced row by
#    row; after merging, a joined string would no longer equal 'Not assigned'.
missing_neighborhood = canada['Neighborhood'] == 'Not assigned'
canada.loc[missing_neighborhood, 'Neighborhood'] = canada.loc[missing_neighborhood, 'Borough']

# 2) Merge rows that share a postal code into one row: keep the first borough
#    seen for that code and join the neighborhoods with ', '.
#    sort=False keeps the postal codes in order of first appearance.
canada = (
    canada
    .groupby('PostalCode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

# Each postal code must now appear exactly once.
assert canada['PostalCode'].is_unique

print(canada.head())
print('\n')
print(canada.describe())
print('\n')
print("Dataset Shape: ", canada.shape)

  PostalCode           Borough                      Neighborhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Harbourfront, Regent Park
3        M6A        North York  Lawrence Heights, Lawrence Manor
4        M7A      Queen's Park                      Queen's Park


       PostalCode     Borough Neighborhood
count         103         103          103
unique        103          11          103
top           M4G  North York    Northwest
freq            1          24            1


Dataset Shape:  (103, 3)
