# Capstone Project: Segmentation and Clustering of Toronto Neighborhood


### Part 1: Webscraping & Preparing Data

In [130]:
#Importing required libraries
import pandas as pd
from bs4 import BeautifulSoup # this module helps in web scrapping
import requests # this module helps us to download the webpage

In [134]:
#Download data from Webpage & Webscraping
torontowk = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(torontowk,'html')

In [135]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    #print(row.span.text)
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

df=pd.DataFrame(table_contents)


In [138]:
df.shape # We have 103 rows and 3 columns

(103, 3)

In [139]:
df['Borough'].unique() #Returns unique values of 'Boroughs'

array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'East YorkEast Toronto', 'Central Toronto',
       'MississaugaCanada Post Gateway Processing Centre',
       'Downtown TorontoStn A PO Boxes25 The Esplanade',
       'EtobicokeNorthwest',
       'East TorontoBusiness reply mail Processing Centre969 Eastern'],
      dtype=object)

In [140]:
df[df.PostalCode=='M5A'] #this shows that is has worked that we combine 2 neighborhoods in 1 row (separated with ,) if it is the same postal code

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [141]:
df[df.Neighborhood=='Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [142]:
df.shape #Check

(103, 3)

In [143]:
df.head(10) #Check

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


The data is succesfully prepared. We have 103 rows and 3 columns. Lets move on with adding geocodes.

### Part 2: Adding Geocodes

In [None]:
from csv import reader

In [146]:
#import csv file as geocoder is not reliable
file_path = (r"C:\Users\vanes\Desktop\Zukunft\Business\Data Science\Coursera Capstone\long.csv")
latlong_data = pd.read_csv(file_path)

In [148]:
#merge the dataframes with unique postal code
df_geo=pd.merge(df,latlong_data, right_on='Postal Code', left_on='PostalCode', how='left')

In [149]:
df_geo # check our merged dataframe

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,M7A,43.662301,-79.389494
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M8X,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.665860,-79.383160
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,M7Y,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M8Y,43.636258,-79.498509


In [151]:
df_geo.shape # we still have 103 rows, but as we merge the data we now have 6 rows. We currently have 2* postal code column.

(103, 6)

In [152]:
#We only extract boroughs that inlcude the word "Toronto"
df_geo_toro=df_geo.loc[(df_geo['Borough'].str.contains('toronto|Toronto'))] #case sensitive

In [154]:
#Install Folium for Data Visualization//Maps
!pip install folium
import folium #folium makes it easy to visualize data that’s been manipulated in Python on an interactive leaflet map. It enables both the binding of data to a map for choropleth visualizations as well as passing rich vector/raster/HTML visualizations as markers on the map.

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [161]:
toromap = folium.Map(location=[43.651070,-79.347015],zoom_start=13) #Map of Toronto & Level of Zoom

for lat,lng,Borough,Neighborhood in zip(df_geo_toro['Latitude'],df_geo_toro['Longitude'],df_geo_toro['Borough'],df_geo_toro['Neighborhood']):
    label = '{}, {}'.format(Neighborhood, Borough) #labels include Neigborhood and Borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker( #version of dot
    [lat,lng],
    radius=5, #size of dot
    popup=label,
    color='blue', #color of dot
    fill=True,
    fill_color='#3186cc', #inner color
    fill_opacity=0.7, #size of outer circle
    parse_html=False).add_to(toromap)
toromap 

### Part 3: Clustering the Neighborhoods

In [163]:
#Import modules: KMeans
from sklearn.cluster import KMeans

In [164]:
#We set the number of clusters at 5
kclusters = 5


In [166]:
toro_clusters = df_geo_toro.drop(['PostalCode', 'Postal Code', 'Borough', 'Neighborhood'], 1)

In [169]:
# We run the kmeans clustering
kmeans =KMeans(n_clusters=kclusters, random_state=0).fit(toro_clusters)

In [170]:
# We check the cluster labels tht are generated for each row of the df
kmeans.labels_[0:10]
df_geo_toro['Cluster Labels']=kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_geo_toro['Cluster Labels']=kmeans.labels_


In [171]:
kmeans.labels_ #check array

array([0, 0, 0, 3, 0, 0, 2, 0, 4, 3, 0, 2, 3, 0, 2, 3, 0, 3, 1, 1, 1, 1,
       4, 1, 2, 4, 1, 2, 4, 1, 2, 1, 0, 0, 0, 0, 0, 0, 3])

In [172]:
#Import further modules
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [173]:
#Create a map
clustermap = folium.Map(location=[43.651070,-79.347015], zoom_start=13)
#clustermap

In [177]:
# we set the colors for the map
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range (kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len (ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [182]:
# we add the markers to the map
markers_col = []
for lat, lon, poi, cluster in zip(df_geo_toro['Latitude'], df_geo_toro['Longitude'], df_geo_toro['Neighborhood'], df_geo_toro['Cluster Labels']):
        label = folium.Popup(str(poi) + 'Cluster' + str(cluster), parse_html=True)
        folium.CircleMarker(
        [lat, lon], radius=5, popup=label, color=rainbow[cluster-1], fill=True, fill_color=rainbow[cluster-1], fill_opacity=0.7).add_to(clustermap)

In [183]:
clustermap #show map

Clustering was done based on the values of Lattitude and Longitude and it was only done for the neighborhoods trhat contain Toronto in the Borough name. With the help of K means we clustered the data into 5 clusters based on their location. 