## Segmentation and Clustering Neighborhoods in Toronto -- Part 1

In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# read the webpage from the wiki
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url)
#Process and convert html data
data = response.text
soup = BeautifulSoup(data,'html.parser')
wiki_table=soup.find('table')
#develop dataframe
df = pd.read_html(str(wiki_table))[0]
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7AQueen's Park(Ontario Provincial Government),M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned


In [3]:
# convert the above table to list
my_list = df.values.tolist()

In [4]:
# create a new list called my_new_list -- convert multiple lists to a single list
my_new_list = []
# Next we want to iterate over the outer list
for sub_list in my_list:
    # Now go over each item of the sublist
    for item in sub_list:
        # append it to our new list
        my_new_list.append(item)

In [5]:
# check my_new_list
print(my_new_list)

['M1ANot assigned', 'M2ANot assigned', 'M3ANorth York(Parkwoods)', 'M4ANorth York(Victoria Village)', 'M5ADowntown Toronto(Regent Park / Harbourfront)', 'M6ANorth York(Lawrence Manor / Lawrence Heights)', "M7AQueen's Park(Ontario Provincial Government)", 'M8ANot assigned', 'M9AEtobicoke(Islington Avenue)', 'M1BScarborough(Malvern / Rouge)', 'M2BNot assigned', 'M3BNorth York(Don Mills)North', 'M4BEast York(Parkview Hill / Woodbine Gardens)', 'M5BDowntown Toronto(Garden District, Ryerson)', 'M6BNorth York(Glencairn)', 'M7BNot assigned', 'M8BNot assigned', 'M9BEtobicoke(West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale)', 'M1CScarborough(Rouge Hill / Port Union / Highland Creek)', 'M2CNot assigned', 'M3CNorth York(Don Mills)South(Flemingdon Park)', 'M4CEast York(Woodbine Heights)', 'M5CDowntown Toronto(St. James Town)', 'M6CYork(Humewood-Cedarvale)', 'M7CNot assigned', 'M8CNot assigned', 'M9CEtobicoke(Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Woo

In [6]:
# split postalcode and Borough
def split_var(s, n=3):
    frist_N = s[:n] # select the first 3 characters from left
    after_N = s[n:] # select the rest of characters
    return frist_N, after_N

In [7]:
# create postalcode list and Borough list
Postalcode_list = []
Borough_list = []
for i in my_new_list:
    Postalcode_value, Borough_value = split_var(i, n=3)
    
    Postalcode_list.append(Postalcode_value)
    Borough_list.append(Borough_value)

In [8]:
# split Borough and Neigbourhood
def split_var2(s):
    if s == "Not assigned":
        value_1 = s
        value_3 = s
    else:    
        first_N = s.split('(') # split value by '('
        value_1 = first_N[0]
        value_2 = first_N[1].replace(')', ' ') # replace ')' with blank
        value_3 = value_2.replace(' /', ',') # replace '/' with ','
    return value_1, value_3

In [9]:
# create Borough list and Neighborhood list
Borough_2_list= []
Neighborhood_list= []
for i in Borough_list:
    Borough_2_value, Neighborhood_value = split_var2(i)
    
    Borough_2_list.append(Borough_2_value)
    Neighborhood_list.append(Neighborhood_value)

In [10]:
df_clean = pd.DataFrame()
df_clean['Postalcode'] = Postalcode_list
df_clean['Borough'] = Borough_2_list
df_clean['Neighborhood'] = Neighborhood_list

In [11]:
# check the clean dataset shape
print(df_clean.shape)

# check the clean dataset
df_clean.head(12)

(180, 3)


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


In [12]:
# remove not assigned from Borough
df_clean_final = df_clean[df_clean['Borough']!= 'Not assigned']

In [13]:
# check the final clean dataset 
df_clean_final.head(12)

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills North
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [14]:
# check the final clean dataset shape
df_clean_final.shape

(103, 3)

In [15]:
# check number of unique Postalcode
print('Number of Postalcode in the table is: ' + str(df_clean_final.shape[0]))
print('Number of Unique Postalcode value is: ' + str(df_clean_final.Postalcode.nunique()))

Number of Postalcode in the table is: 103
Number of Unique Postalcode value is: 103


In [16]:
# save the clean dataset
df_clean_final.to_csv('capstone_part1.csv')
print('Saved Successfully!')

Saved Successfully!
