# Segmenting and Clustering Neighborhoods in Toronto

 Importing necessary libraries 

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import folium
from sklearn.cluster import KMeans
import numpy as np

## Part 1 

The data is scraped from "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [6]:
# Accumulator for the raw "<postal code> <borough>(<neighbourhoods>)" strings
# that the next cell extracts from the scraped page.
l = []

# Download the Wikipedia page and parse it into a BeautifulSoup tree.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() prevents silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, features="lxml")

In [7]:
# Collect one "<postal code> <rest of cell text>" string per <td> cell.
# The list is rebuilt from scratch here so that re-running this cell never
# appends the same entries twice.
table = soup.find_all('td')
l = []
for cell in table:
    paragraph = cell.find('p')
    if paragraph is None:
        # No <p> child: there is no text to extract from this cell.
        # (The original relied on a bare `except` to skip these.)
        continue
    text = paragraph.text
    if 'Not assigned' in text:
        # Entries marked "Not assigned" are deliberately left out.
        continue
    # The first three characters are the postal code; separate them from
    # the remainder with a space.
    l.append(text[0:3] + ' ' + text[3:])

In [8]:
# Peek at the first ten scraped entries.
l[:10]

['M3A North York(Parkwoods)\n',
 'M4A North York(Victoria Village)\n',
 'M5A Downtown Toronto(Regent Park / Harbourfront)\n',
 'M6A North York(Lawrence Manor / Lawrence Heights)\n',
 "M7A Queen's Park(Ontario Provincial Government)\n",
 'M9A Etobicoke(Islington Avenue)\n',
 'M1B Scarborough(Malvern / Rouge)\n',
 'M3B North York(Don Mills)North\n',
 'M4B East York(Parkview Hill / Woodbine Gardens)\n',
 'M5B Downtown Toronto(Garden District, Ryerson)\n']

In [9]:
# Parse each scraped string into (PostalCode, Borough, Neighborhood) and build
# the DataFrame in a single step.
#
# Each entry looks like "M5B Downtown Toronto(Garden District, Ryerson)\n":
#   * characters 0-2          -> postal code
#   * text up to the first "(" -> borough (after dropping the postal code)
#   * text between the first "(" and the following ")" -> neighbourhood(s)
#
# Splitting on the parentheses (rather than on commas) keeps neighbourhood
# lists that themselves contain commas intact.
records = []
for entry in l:
    head, _, rest = entry.partition('(')
    # If there is no "(", `rest` is empty and the neighbourhood is left blank
    # instead of raising an IndexError.
    inside, _, _ = rest.partition(')')
    records.append({
        'PostalCode': entry[0:3].strip(),
        'Borough': head[3:].strip(),
        'Neighborhood': inside.strip(),
    })

# Collecting the rows first and constructing the frame once avoids
# DataFrame.append, which was removed in pandas 2.0 and is quadratic in a loop.
df = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighborhood"])

In [10]:
# Sanity check: (number of rows, number of columns) of the parsed table.
print(df.shape)

(103, 3)


## Part 2 


In [13]:
# Read the coordinates CSV, align its key column name with `df`,
# and join the two tables on the postal code.
c_df = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)
result = df.merge(c_df, on='PostalCode')
result.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


## Part 3 

In [15]:
# Keep only the rows whose borough name contains the literal text "Toronto".
is_toronto = result['Borough'].str.contains('Toronto', regex=False)
result = result.loc[is_toronto]
# Clustering only needs the coordinates, so work on an independent copy of them.
cluster_df = result.loc[:, ['Latitude', 'Longitude']].copy()

As there are 5 boroughs whose names contain "Toronto", we use 5 cluster centers.

In [17]:
# Fit k-means on the coordinates only. Selecting the two columns explicitly
# keeps a previously added 'Label' column out of the features if this cell
# is run more than once.
coords = cluster_df[['Latitude', 'Longitude']]
# A fixed random_state makes the cluster assignment reproducible across runs.
k_means = KMeans(init="k-means++", n_clusters=5, n_init=12, random_state=0).fit(coords)
labels = k_means.labels_.tolist()
cluster_df['Label'] = labels

# Attach the labels to `result` by position: `cluster_df` was taken from
# `result` row-for-row, so the i-th label belongs to the i-th row. Joining on
# the float coordinates instead would duplicate rows whenever two postal codes
# share a location, and would produce Label_x/Label_y columns on a re-run.
result = (
    result
    .drop(columns='Label', errors='ignore')
    .reset_index(drop=True)
    .assign(Label=labels)
)

Visualizing the clusters using folium 

In [18]:
# Create the base map centred on the coordinates used for Toronto.
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One colour per cluster label.
rainbow = ['red', 'blue', 'green', 'purple', 'orange']

# Add one circle marker per postal code, coloured by its cluster label.
for lat, lon, neighbourhood, cluster in zip(result['Latitude'], result['Longitude'],
                                            result['Neighborhood'], result['Label']):
    # Labels start at 0, so index the palette directly; the previous
    # `cluster - 1` only worked because index -1 wraps to the last colour.
    # The modulo keeps the lookup valid if more clusters than colours are used.
    colour = rainbow[cluster % len(rainbow)]
    # parse_html=True escapes the scraped text so characters such as
    # apostrophes or angle brackets cannot break the popup markup.
    label = folium.Popup(str(cluster) + "-" + neighbourhood, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colour,
        fill=True,
        fill_color=colour,
        fill_opacity=0.7).add_to(map_clusters)

In [19]:
# Render the interactive map as the cell output.
map_clusters