# Segmenting and Clustering Neighborhoods in Toronto
### This is the first notebook of this project: it scrapes the Toronto postal code table from Wikipedia and cleans it into a DataFrame.

In [82]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

#### Fetch the Wikipedia page with requests and parse it with Beautiful Soup

In [83]:
# Download the Wikipedia list of "M" postal codes and parse the HTML.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests has no default timeout; without one a stalled connection would
# block this cell forever.
html=requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of silently parsing an error page.
html.raise_for_status()
soup=BeautifulSoup(html.content, 'html.parser')

#### Collect the text of every table cell into a list with Beautiful Soup's find_all

In [84]:
# Flatten the first <table> on the page into one list holding the stripped
# text of each <td>, in document order.
table=[cell.text.strip() for cell in soup.table.tbody.find_all('td')]
# Preview the first twelve entries.
table[:12]

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village']

#### Dataframe Creation

In [85]:
# `table` is a flat list in which each table row contributed three
# consecutive entries (postal code, borough, neighborhood), so taking every
# third item from offsets 0, 1 and 2 rebuilds the three columns.
# Guard the stride assumption up front so a layout change on the page fails
# with a clear message.
assert len(table) % 3 == 0, 'expected three cells per table row'
# The mapping is not named `dict`: that would shadow the builtin for every
# later cell in the kernel.
columns={'PostalCode':table[0::3], 'Borough':table[1::3], 'Neighborhood':table[2::3]}
df=pd.DataFrame(columns)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


#### Drop the rows with 'Not assigned' Boroughs

In [86]:
# Keep only the rows whose Borough is not the 'Not assigned' placeholder,
# then renumber the surviving rows from 0.
has_borough=df['Borough'].ne('Not assigned')
toronto_df=df.loc[has_borough].reset_index(drop=True)
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Replace 'Not assigned' Neighborhoods with the name of their Borough

In [87]:
# Where the neighborhood is the 'Not assigned' placeholder, fall back to the
# borough name of the same row; every other neighborhood is kept as is.
# This is done column-wise with Series.mask rather than a row-wise
# apply(axis=1): it avoids a Python call per row and also works on an empty
# frame, where apply(axis=1) returns a DataFrame instead of a Series.
unnamed=toronto_df['Neighborhood'].eq('Not assigned')
toronto_df['Neighborhood']=toronto_df['Neighborhood'].mask(unnamed, toronto_df['Borough'])
toronto_df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Group the neighborhoods by Postal Code and merge the dataframes

In [88]:
# Collapse rows that share a postal code into a single row.
# One (PostalCode, Borough) pair per postal code; drop_duplicates keeps the
# first row it sees for each code.
toronto_df_2=toronto_df.loc[:, ['PostalCode', 'Borough']].drop_duplicates(subset='PostalCode')
# All neighborhood names of a postal code joined into one comma-separated
# string; the result is indexed by PostalCode.
toronto_df_grouped=toronto_df.groupby(['PostalCode'])
toronto_df_1=toronto_df_grouped['Neighborhood'].apply(', '.join).to_frame()
# Attach the joined names to the borough rows; 'PostalCode' matches the
# column on the left against the index level of the same name on the right.
toronto_df=toronto_df_2.merge(toronto_df_1, how='left', on='PostalCode')
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Get the number of rows in the Dataframe

In [90]:
# DataFrame.shape is a (number of rows, number of columns) tuple.
toronto_df.shape

(103, 3)