# Segmenting and clustering neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Web Scraping from Wikipedia Page
Load the list of postal codes into a dataframe.

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
List_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on a hung connection or an HTTP error response instead of
# silently parsing whatever body came back.
response = requests.get(List_url, timeout=30)
response.raise_for_status()
source = response.text

# The page is HTML, so parse it with an HTML parser rather than the XML one.
soup = BeautifulSoup(source, 'html.parser')

# First <table> element in the document; the following cells read its rows.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {List_url}")

In [3]:
# Schema of the scraped data: postal code, borough and neighborhood.
column_names = ['Postalcode', 'Borough', 'Neighborhood']

# Start from an empty frame that carries only that schema (no rows yet).
df = pd.DataFrame(data=None, index=None, columns=column_names)

In [4]:
# Collect the stripped text of the <td> cells of every table row. Only rows
# with exactly three cells (postal code, borough, neighborhood) are kept;
# rows with any other number of <td> cells are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == 3:
        rows.append(row_data)

# Build the frame once from the collected rows instead of appending to it
# row by row; re-running this cell therefore rebuilds `df` rather than
# adding the same rows a second time.
df = pd.DataFrame(rows, columns=column_names)

In [5]:
# Preview the first six scraped rows
df.head(n=6)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights


### Data Cleaning
Remove rows whose Borough is "Not assigned".<br/>
If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.<br/>
Combine rows that share a postal code into a single row, with their neighborhoods joined into one comma-separated string.

In [6]:
# Keep only the rows whose borough is something other than 'Not assigned'
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [7]:
# A row whose neighborhood is 'Not assigned' takes its borough's name as the
# neighborhood. `assign` returns a new frame, so the frame previously bound
# to `df` is not modified in place.
is_unassigned = df['Neighborhood'] == 'Not assigned'
df = df.assign(Neighborhood=df['Neighborhood'].mask(is_unassigned, df['Borough']))

# Show any rows that still have no neighborhood name
df[df['Neighborhood']=='Not assigned']

Unnamed: 0,Postalcode,Borough,Neighborhood


<font size="3"> The dataframe returned by the above expression is empty, so we can conclude that the cleaned dataframe does not contain any 'Not assigned' neighborhood.</font>

In [19]:
# Gather the neighborhood names of each postal code and join them into one
# comma-separated string per postal code
neighborhoods_by_code = df.groupby('Postalcode')['Neighborhood']
temp_df = neighborhoods_by_code.apply(', '.join)
temp_df

Postalcode
M1B                                       Rouge, Malvern
M1C               Highland Creek, Rouge Hill, Port Union
M1E                    Guildwood, Morningside, West Hill
M1G                                               Woburn
M1H                                            Cedarbrae
                             ...                        
M9N                                               Weston
M9P                                            Westmount
M9R    Kingsview Village, Martin Grove Gardens, Richv...
M9V    Albion Gardens, Beaumond Heights, Humbergate, ...
M9W                                            Northwest
Name: Neighborhood, Length: 103, dtype: object

In [20]:
# Move the Postalcode index back into an ordinary column
temp_df = temp_df.reset_index()
temp_df

Unnamed: 0,Postalcode,Neighborhood
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae
...,...,...
98,M9N,Weston
99,M9P,Westmount
100,M9R,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [22]:
# Name the combined column 'Neighborhood_joined' so it does not clash with
# the 'Neighborhood' column of df
temp_df = temp_df.rename(columns={'Neighborhood': 'Neighborhood_joined'})
temp_df

Unnamed: 0,Postalcode,Neighborhood_joined
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae
...,...,...
98,M9N,Weston
99,M9P,Westmount
100,M9R,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [23]:
# Join on Postalcode so each row of df carries both its own Neighborhood and
# the combined Neighborhood_joined string, side by side for comparison
df_merge = df.merge(temp_df, on='Postalcode')
df_merge

Unnamed: 0,Postalcode,Borough,Neighborhood,Neighborhood_joined
0,M3A,North York,Parkwoods,Parkwoods
1,M4A,North York,Victoria Village,Victoria Village
2,M5A,Downtown Toronto,Harbourfront,Harbourfront
3,M6A,North York,Lawrence Heights,"Lawrence Heights, Lawrence Manor"
4,M6A,North York,Lawrence Manor,"Lawrence Heights, Lawrence Manor"
...,...,...,...,...
205,M8Z,Etobicoke,Kingsway Park South West,"Kingsway Park South West, Mimico NW, The Queen..."
206,M8Z,Etobicoke,Mimico NW,"Kingsway Park South West, Mimico NW, The Queen..."
207,M8Z,Etobicoke,The Queensway West,"Kingsway Park South West, Mimico NW, The Queen..."
208,M8Z,Etobicoke,Royal York South West,"Kingsway Park South West, Mimico NW, The Queen..."


In [24]:
# Remove the per-row Neighborhood column; rows that share a postal code
# then become exact duplicates of one another
df_merge = df_merge.drop(columns=['Neighborhood'])
df_merge

Unnamed: 0,Postalcode,Borough,Neighborhood_joined
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
...,...,...,...
205,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."
206,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."
207,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."
208,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."


In [26]:
# Collapse the repeated rows, keeping the first occurrence of each
df_merge = df_merge.drop_duplicates()
df_merge

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,Queen's Park
...,...,...,...
192,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
195,M4Y,Downtown Toronto,Church and Wellesley
196,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
197,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


In [25]:
# Rename the combined column from 'Neighborhood_joined' back to 'Neighborhood'
df_merge = df_merge.rename(columns={'Neighborhood_joined': 'Neighborhood'})

In [27]:
# Preview the first fifteen rows of the cleaned frame
df_merge.head(n=15)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,"Rouge, Malvern"
9,M3B,North York,Don Mills North
10,M4B,East York,"Woodbine Gardens, Parkview Hill"
12,M5B,Downtown Toronto,"Ryerson, Garden District"


In [28]:
# After the neighborhoods were combined, every postal code should occupy
# exactly one row; stop here if that invariant does not hold.
assert df_merge['Postalcode'].is_unique, "duplicate postal codes remain in df_merge"

# (rows, columns) of the cleaned frame
df_merge.shape

(103, 3)