# This notebook explores, segments and clusters the neighbourhoods of Toronto.

### The postal code data has been scraped from the Wikipedia page and then loaded into a Pandas dataframe. The Pandas, Urllib, Lxml and BeautifulSoup packages have been used for this purpose.

### The scraped data in the dataframe has then been cleaned as per the requirements given in the assignment. All the requisites have been met.

In [3]:
# Importing all the required libraries
!conda install -c conda-forge bs4 --yes
!pip3 install lxml
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import lxml
print("Libraries Imported !")

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/79/37/d420b7fdc9a550bd29b8cfeacff3b38502d9600b09d7dfae9a69e623b891/lxml-4.5.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 5.3MB/s eta 0:00:01     |█████▏                          | 901kB 5.3MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.2
Libraries Imported !


In [4]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list with one dataframe per HTML table on the page;
# the first table is the one with the Postal Codes, Boroughs and Neighbourhoods
tables = pd.read_html(url)
df = tables[0]

# Keep only the records whose Borough has been assigned and renumber the
# remaining rows from 0 (the old index is discarded)
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)

In [5]:
# Sanity check: list any remaining records whose Neighbourhood is 'Not assigned'
df.loc[df['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [6]:
# Preview the first 5 rows of the cleaned dataframe
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Report the size of the dataframe; df.shape is (rows, columns)
print("There are {} rows and {} columns in the dataframe !".format(*df.shape))

There are 103 rows and 3 columns in the dataframe !


### Now we read the geographical data with Latitudes and Longitudes and align it with each Borough and Neighbourhood.

### Pandas read_csv() function has been used to extract the .csv data and store in the Dataframe.

In [8]:
# Load the geographical CSV (coordinates per postal code) straight from its URL
geo_url = 'https://cocl.us/Geospatial_data'
geo_df = pd.read_csv(geo_url)

### We merge the obtained dataframe with the original web-scraped dataframe by using the Pandas merge() function. This results in a new dataframe with the Postal Code, Borough, Neighbourhood, Latitude and Longitude visible in a single row.

In [9]:
# Join the coordinates onto the web-scraped dataframe using the shared
# 'Postal Code' column (default inner join).
# geo_df is rebound to the merged result, so this cell is meant to run once
# right after the CSV has been loaded.
geo_df = df.merge(geo_df, on='Postal Code')

# Show the merged dataframe
geo_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [10]:
# Report the size of the merged dataframe; geo_df.shape is (rows, columns)
print("There are {} rows and {} columns in the dataframe !".format(*geo_df.shape))

There are 103 rows and 5 columns in the dataframe !


### Now we select ONLY the records containing the word 'Toronto' in the Borough Name.

In [120]:
# Keep the records whose Borough name contains 'Toronto' and renumber the
# remaining rows from 0 (the old index is discarded)
is_toronto = geo_df['Borough'].str.contains("Toronto")
tor_geo_df = geo_df[is_toronto].reset_index(drop=True)
print("There are {} rows and {} columns in the tor_geo_df dataframe !".format(*tor_geo_df.shape))

There are 39 rows and 5 columns in the tor_geo_df dataframe !


### We will be using the KMeans clustering algorithm to cluster the neighbourhoods. We will be using the Matplotlib `cm` and `colors` modules to generate the colour coding for the clusters.

In [121]:
# Installing and importing Folium package for map creation
!conda install -c conda-forge folium=0.5.0 --yes
import folium

# Importing KMeans for clustering the neighbourhoods
from sklearn.cluster import KMeans
import numpy as np

# Importing Matplotlib and associated plotting modules for color-coding the clustered neighbourhoods
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported !')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported !


### We define the latitude and longitude of Toronto using values looked up on the Internet.

In [122]:
# Latitude and longitude of Toronto, used as the centre point of the maps
tor_lat, tor_lon = 43.6532, -79.3832

### Now, we display the map of Toronto with the different neighbourhoods.

In [123]:
# Base map centred on the Toronto coordinates
map_toronto = folium.Map(location=[tor_lat, tor_lon], zoom_start=12)

# Styling shared by every neighbourhood marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row, with the Neighbourhood text as its popup
coordinates = zip(tor_geo_df['Latitude'], tor_geo_df['Longitude'])
for (lat, lng), name in zip(coordinates, tor_geo_df['Neighbourhood']):
    folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(name, parse_html=True),
        **marker_style
    ).add_to(map_toronto)

# Render the map of Toronto with the different neighbourhoods
map_toronto

### We use One-Hot encoding to generate dummy values for demarcating which Neighbourhood comes under which Borough.

In [124]:
# One-hot encode the Borough column: one indicator column per borough,
# named after the borough itself (empty prefix and separator)
tor_onehot = pd.get_dummies(tor_geo_df[['Borough']], prefix="", prefix_sep="")

# Attach the Neighbourhood column so each encoded row can be identified
tor_onehot['Neighbourhood'] = tor_geo_df['Neighbourhood']

# Move the Neighbourhood column (currently last) to the first position
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

# Feature matrix for clustering: the indicator columns only.
# `columns=` is used because passing the axis positionally, as in
# drop('Neighbourhood', 1), was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
tor_grouped_clustering = tor_onehot.drop(columns='Neighbourhood')

### Now we apply KMeans clustering algorithm and cluster the neighbourhoods into 4 clusters.

In [125]:
# Number of clusters to form
num_clusters = 4

# Fit KMeans on the one-hot encoded boroughs.
# random_state is fixed so that the cluster numbering is the same on every
# run; without it the label assigned to each group can change between runs,
# which would invalidate any prose that refers to clusters by number.
k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12, random_state=0)
k_means.fit(tor_grouped_clustering)
labels = k_means.labels_

# Printing the cluster labels (one per row of tor_grouped_clustering)
print('Clustering Labels are : ',labels)

Clustering Labels are :  [1 1 1 1 0 1 1 1 1 3 1 3 0 1 3 0 1 0 2 2 2 2 3 2 2 3 2 1 3 2 1 2 1 1 1 1 1
 1 0]


### Now we simply add the Cluster Labels into the original dataframe as the last column.

In [126]:
# Add the clustering labels to the dataframe as the 'Cluster Labels' column.
# Plain column assignment appends the column at the end on the first run and
# overwrites it on later runs, so the cell can be re-executed safely
# (DataFrame.insert raises ValueError when the column already exists).
tor_geo_df['Cluster Labels'] = k_means.labels_

In [127]:
# Display the Toronto dataframe, which now includes the 'Cluster Labels' column
tor_geo_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,1
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,1
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,3


### Finally, we will be using Folium library to visualize the different clusters on the map of Toronto.

In [138]:
# Create map of Toronto based on the defined Latitude and Longitude
map_tor_clusters = folium.Map(location=[tor_lat, tor_lon], zoom_start=12)

# One colour per cluster, sampled at evenly spaced points of the rainbow colormap.
# NOTE(review): `** 10` raises every RGBA channel to the 10th power before the
# hex conversion, which changes the sampled colours; kept unchanged here —
# confirm that this is intentional.
colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
rainbow = [colors.rgb2hex(rgba ** 10) for rgba in colors_array]

# Add one marker per neighbourhood, coloured by its cluster
for lat, lon, poi, cluster in zip(tor_geo_df['Latitude'], tor_geo_df['Longitude'], tor_geo_df['Neighbourhood'], tor_geo_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Outline and fill use the same palette entry so that a marker never
    # shows the colour of a different cluster on its border
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_tor_clusters)

# Displaying the Map
map_tor_clusters

### From the map, we can see how the neighbourhoods have been clustered based on the Boroughs under which they fall. We observe that 'The Annex, North Midtown' and 'Rosedale' fall near the border between clusters 1 and 2.

### This is the end of the notebook.