## Segmenting and Clustering Neighborhoods in Toronto

 ###### Code to Scrape Wikipedia page

In [470]:
# Import Libraries
from io import StringIO

import pandas as pd
import requests

In [471]:
# Wikipedia page listing the Toronto ("M") postal codes
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps this cell from hanging indefinitely on a stalled connection
file = requests.get(url, timeout=30)
# Fail loudly here on an HTTP error instead of parsing an error page further down
file.raise_for_status()

In [489]:
# Display the Response object returned by requests.get(); <Response [200]> means the request succeeded.
# The page's HTML itself is available as file.text.
file

<Response [200]>

##### Use the pandas `read_html()` function to parse the HTML tables on the page into a list of pandas DataFrames

In [499]:
# Parse every HTML table on the page into its own DataFrame.
# The markup is wrapped in StringIO because passing a literal HTML string
# directly to read_html is deprecated in recent pandas releases.
file_df = pd.read_html(StringIO(file.text))

In [500]:
# Number of tables that read_html found on the page
len(file_df)

3

In [501]:
# Confirm the container type returned by read_html
type(file_df)

list

##### Use the first element of the list, which contains the required data.

In [566]:
# The postal-code table is the first DataFrame in the parsed list
toronto_df = file_df[0]
# Preview the first five rows, then report (rows, columns)
print(toronto_df.iloc[:5])
toronto_df.shape


  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


(288, 3)

##### Keep only the rows that have an assigned 'Borough'; filter out rows where it is 'Not assigned'

In [567]:
# Keep only the rows whose Borough is something other than the 'Not assigned' placeholder
toronto_df_filtered = toronto_df.loc[toronto_df['Borough'].ne('Not assigned')]

In [569]:
# Preview the first ten remaining rows, then report the filtered (rows, columns)
print(toronto_df_filtered.head(10))
toronto_df_filtered.shape

   Postcode           Borough     Neighbourhood
2       M3A        North York         Parkwoods
3       M4A        North York  Victoria Village
4       M5A  Downtown Toronto      Harbourfront
5       M5A  Downtown Toronto       Regent Park
6       M6A        North York  Lawrence Heights
7       M6A        North York    Lawrence Manor
8       M7A      Queen's Park      Not assigned
10      M9A         Etobicoke  Islington Avenue
11      M1B       Scarborough             Rouge
12      M1B       Scarborough           Malvern


(211, 3)

##### Where the Neighbourhood is 'Not assigned', replace it with the name of the Borough.


In [576]:
# Work on an independent copy: `toronto_df_filtered[:]` only slices the filtered
# frame, so assigning through .loc could raise SettingWithCopyWarning and, depending
# on the pandas version, write through to toronto_df_filtered as well.
toronto_df_neigh_replaced = toronto_df_filtered.copy()
# Rows whose Neighbourhood is still the 'Not assigned' placeholder
cond = toronto_df_neigh_replaced['Neighbourhood'] == 'Not assigned'
# Fill those rows with their own Borough name
toronto_df_neigh_replaced.loc[cond, 'Neighbourhood'] = toronto_df_neigh_replaced.loc[cond, 'Borough']
print(toronto_df_neigh_replaced[0:10])

   Postcode           Borough     Neighbourhood
2       M3A        North York         Parkwoods
3       M4A        North York  Victoria Village
4       M5A  Downtown Toronto      Harbourfront
5       M5A  Downtown Toronto       Regent Park
6       M6A        North York  Lawrence Heights
7       M6A        North York    Lawrence Manor
8       M7A      Queen's Park      Queen's Park
10      M9A         Etobicoke  Islington Avenue
11      M1B       Scarborough             Rouge
12      M1B       Scarborough           Malvern


##### Concatenate Neighbourhoods with the same Postcode and Borough.
##### Use groupby on 'Postcode' and 'Borough' and join each group's Neighbourhood values into a single string (separated by ', ')

In [583]:
# Collapse the rows sharing a Postcode/Borough pair into one entry whose value is
# the comma-separated list of that pair's Neighbourhood names
toronto_final_df = (
    toronto_df_neigh_replaced
    .groupby(["Postcode", "Borough"])["Neighbourhood"]
    .apply(", ".join)
)
print(toronto_final_df.head(20))

Postcode  Borough    
M1B       Scarborough                                       Rouge, Malvern
M1C       Scarborough               Highland Creek, Rouge Hill, Port Union
M1E       Scarborough                    Guildwood, Morningside, West Hill
M1G       Scarborough                                               Woburn
M1H       Scarborough                                            Cedarbrae
M1J       Scarborough                                  Scarborough Village
M1K       Scarborough          East Birchmount Park, Ionview, Kennedy Park
M1L       Scarborough                      Clairlea, Golden Mile, Oakridge
M1M       Scarborough      Cliffcrest, Cliffside, Scarborough Village West
M1N       Scarborough                          Birch Cliff, Cliffside West
M1P       Scarborough    Dorset Park, Scarborough Town Centre, Wexford ...
M1R       Scarborough                                    Maryvale, Wexford
M1S       Scarborough                                            Agincourt
M1T

In [584]:
# The grouped result is a Series indexed by (Postcode, Borough), so shape is a 1-tuple
toronto_final_df.shape

(103,)