# Segmenting and Clustering Neighborhoods in Toronto

In [12]:
import requests
!pip install bs4



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


Collecting lxml
  Downloading https://files.pythonhosted.org/packages/d9/36/e79b8e112fb63b04b72724954ae5519e740982bec84f66e5eb4a353906ef/lxml-4.5.0-cp37-cp37m-win_amd64.whl (3.7MB)
Installing collected packages: lxml
Successfully installed lxml-4.5.0


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


## Part-1

In [13]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# Despite its name, post_codes_ca_url holds the page's HTML text, not a URL.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # requests applies no timeout by default; a stalled connection would hang the cell
)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page later
post_codes_ca_url = response.text

### Parse the HTML of the Wikipedia page

In [34]:
from bs4 import BeautifulSoup

# Build a navigable document tree from the downloaded HTML text using
# Python's built-in 'html.parser' backend.
soup = BeautifulSoup(markup=post_codes_ca_url, features='html.parser')

### Extract the data 

In [40]:
# Walk the first table of the page row by row and fill three parallel lists:
# postal code, borough and neighborhood.
# Reading each <tr> as a unit (instead of counting <td> cells modulo 3) keeps the
# lists aligned even when a row has a different number of cells.
codes_list = []
borough_list = []
neighborhood_list = []

for row in soup.table.find_all('tr'):
    # Keep only the first line of each cell's text, which drops the trailing newline.
    cells = [cell.text.split("\n")[0] for cell in row.find_all('td')]
    if len(cells) != 3:
        # Rows without exactly three <td> cells (e.g. a header row made of <th>)
        # do not describe a postal code record.
        continue

    code, borough, neighborhood = cells
    codes_list.append(code)
    borough_list.append(borough)
    # An empty neighborhood cell is stored as the placeholder "Not assigned".
    neighborhood_list.append(neighborhood or "Not assigned")

### Load the data into a DataFrame

In [78]:
import pandas as pd

# Assemble the three parallel lists into a single table, one column per list.
toronto_columns = {
    "Postalcode": codes_list,
    "Borough": borough_list,
    "Neighborhood": neighborhood_list,
}
toronto_df = pd.DataFrame(data=toronto_columns)
toronto_df.head(n=10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,Malvern / Rouge


### Cleaning data 

In [82]:
# Keep only the rows whose Borough is assigned and renumber them from 0.
# reset_index(drop=True) returns a new frame, so nothing is modified in place on a
# filtered slice of toronto_df (a pattern that can trigger pandas'
# SettingWithCopyWarning), and no temporary 'index' column has to be created and dropped.
toronto_df1 = toronto_df[toronto_df.Borough != 'Not assigned'].reset_index(drop=True)

toronto_df1.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [86]:
# Where a Neighborhood is 'Not assigned', use the row's Borough as its Neighborhood.
# A boolean mask updates every matching row at once and, unlike looping over
# range(n) with .loc[i], does not depend on the index being exactly 0..n-1.
unassigned = toronto_df1["Neighborhood"] == 'Not assigned'
toronto_df1.loc[unassigned, "Neighborhood"] = toronto_df1.loc[unassigned, "Borough"]

toronto_df1.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [93]:
# Combine the neighborhoods that share a postal code (and borough) into a single
# comma-separated string, giving one row per postal code.
toronto_df_Final = (
    toronto_df1
    .groupby(['Postalcode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)
# The scraped text separates neighborhoods with "/"; replace it with ",".
# regex=False makes this a literal replacement whatever the installed pandas
# version uses as its default for `regex`.
toronto_df_Final['Neighborhood'] = toronto_df_Final['Neighborhood'].str.replace("/", ",", regex=False)
toronto_df_Final.head(10)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [90]:
# Show the size of the combined table as (rows, columns).
toronto_df_Final.shape

(103, 3)

## Part-2

In [98]:
# Load the latitude and longitude recorded for each postal code from the published CSV.
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
geo_data_df = pd.read_csv(GEOSPATIAL_CSV_URL)

print(geo_data_df.shape)
geo_data_df.head(n=10)

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [97]:
# Attach Latitude/Longitude to every postal code row. The left join keeps each row of
# toronto_df_Final; the right-hand key column 'Postal Code' is dropped afterwards
# because 'Postalcode' already carries the join key.
merged_df = (
    toronto_df_Final
    .merge(geo_data_df, how='left', left_on='Postalcode', right_on='Postal Code')
    .drop(columns='Postal Code')
)
merged_df.head(n=10)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


In [99]:
# Show the size of the merged table as (rows, columns).
merged_df.shape

(103, 5)