# Segmenting and Clustering Neighborhoods in Toronto


#### This project scrapes the Wikipedia page listing the postal codes of Canada, then cleans and processes the data for clustering. The clustering is carried out with K-Means, and the clusters are plotted using the Folium library. The boroughs whose names contain 'Toronto' are first plotted, then clustered and plotted again.



In [None]:
!pip install beautifulsoup4
!pip install lxml
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 


from IPython.display import display_html
import pandas as pd
import numpy as np
    
# function for transforming a JSON file into a pandas DataFrame
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Folium installed')
print('Libraries imported.')

In [None]:
# Download the Wikipedia list of Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
url = response.text  # NOTE: despite its name, `url` holds the page's HTML text

# Parse the page and keep its first <table> element as an HTML string.
soup = BeautifulSoup(url, 'lxml')
tab = str(soup.table)

# Render the raw table inline (display_html is imported in the setup cell).
display_html(tab, raw=True)

## Now the HTML table is converted into a pandas DataFrame

In [None]:
# Recent pandas versions deprecate passing literal HTML text to read_html;
# wrapping the markup in a file-like StringIO works on old and new versions.
from io import StringIO

df = pd.read_html(StringIO(tab))  # read_html returns a list of DataFrames
df1 = df[0]  # the postal-code table is the first (and only expected) entry
df1.head()

### NOW CLEANING THE DATA AND PROCESSING IT!!

In [None]:
# Drop the postal codes that have not been assigned to any borough.
df2 = df1.loc[df1['Borough'] != 'Not assigned']

# Collapse rows that share a postal code and borough into a single row,
# joining the remaining text values with ', '. sort=False keeps the groups
# in first-seen order. The index is then flattened back into columns and the
# key column renamed to 'Postalcode'.
df3 = (
    df2.groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
    .rename(columns={'Postal Code': 'Postalcode'})
)

# Where the neighbourhood is still 'Not assigned', fall back to the borough name.
unnamed = df3['Neighborhood'] == 'Not assigned'
df3['Neighborhood'] = np.where(unnamed, df3['Borough'], df3['Neighborhood'])

df3

In [None]:
# Shape of the cleaned data frame, as a (rows, columns) tuple
df3.shape

### IMPORTING THE CSV FILE CONTAINING LATITUDE AND LONGITUDE OF NEIGHBORHOOD IN TORONTO

In [None]:
# Load the geospatial CSV and rename its 'Postal Code' column to 'Postalcode'
# so that it uses the same key name as df3.
lat_lon = pd.read_csv('https://cocl.us/Geospatial_data').rename(
    columns={'Postal Code': 'Postalcode'}
)
lat_lon.head()


### MERGING BOTH THE TABLES

In [None]:

# Attach the coordinates to each postal-code row by joining on the shared
# 'Postalcode' key (inner join, the pandas default).
df4 = df3.merge(lat_lon, on='Postalcode')
df4.head()

### Getting all the rows of the data frame whose Borough contains 'Toronto'.

In [None]:
# Keep only the rows whose borough name contains the literal substring
# 'Toronto' (regex=False: plain substring match, not a pattern).
in_toronto = df4['Borough'].str.contains('Toronto', regex=False)
df5 = df4.loc[in_toronto]
df5

### Using KMeans clustering for the clustering of the neighborhoods

In [None]:
k = 5  # number of clusters

# Remove labels left behind by a previous run of this cell: otherwise they
# would be fed to KMeans as a feature and insert() below would fail on the
# duplicate column. drop() returns a new frame, so the insert further down
# works on an independent object rather than on a slice of df4.
df5 = df5.drop(columns='Cluster Labels', errors='ignore')

# Cluster on whatever remains once the identifying text columns are removed.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 no longer accepts.
toronto_clustering = df5.drop(columns=['Postalcode', 'Borough', 'Neighborhood'])
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

# Store each row's cluster label as the first column.
df5.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
# Show the Toronto rows together with their assigned cluster labels
df5

In [None]:
# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# set color scheme for the clusters: k evenly spaced colours taken from the
# rainbow colormap, converted to hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map: one circle per neighbourhood, coloured by cluster
for lat, lon, poi, cluster in zip(df5['Latitude'], df5['Longitude'], df5['Neighborhood'], df5['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the palette directly
    # (the previous `cluster-1` relied on negative-index wrap-around for cluster 0).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# last expression of the cell: renders the map inline
map_clusters