## 1. Import the necessary libraries

In [1]:
# library to handle data in a vectorized manner
import numpy as np

# library for data analsysis
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Machine Learning and visualization packages
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


## 2. Load the dataset and conduct an exploratory analysis

#### Description: The dataset containing postal codes of Canada starting with 'M' was sourced from Wikipedia. The dataframe was populated after looping through all tags within the table tag of the corresponding Wikipedia page. Rows containing 'Not assigned' and invalid values, if any, in the 'Borough' column were removed. The table was sorted by 'Postal Code' and multiple neigborhoods for the same borough were ensured to be separated by a comma.

### Access the webpage and extract the data from the table tag on the page, into a list

In [2]:
source= requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup= BeautifulSoup(source, 'xml')
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="UTF-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XoufggpAMM8AAvvJOwcAAABL","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":949497198,"wgRevisionId":949497198,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Pos

In [3]:
tb1= soup.find('table',{'class':'wikitable'})
list_can= []

for i in tb1.find_all('tr'):
  list_can.append([j.text.strip() for j in (i.find_all('td'))])

### Populate the dataframe from the list, clean and sort it

In [4]:
df= pd.DataFrame(list_can, columns= ['Postal Code','Borough','Neighborhood'], index= None)
df.count()

Postal Code     180
Borough         180
Neighborhood    180
dtype: int64

In [5]:
df= df[df.Borough!= 'Not assigned']
df=df.iloc[1:]

df= df.sort_values(by= 'Postal Code')
df= df.set_index('Postal Code')
df.reset_index(inplace= True)

df['Neighborhood']= df['Neighborhood'].apply(lambda x: x.replace('/', ','))

## The dataframe displaying postal codes of Canada starting with M, its boroughs and neighborhoods

In [6]:
pd.set_option('display.max_rows', 103)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [7]:
print('Shape of the dataframe:', df.shape)

Shape of the dataframe: (103, 3)


## 3. Determine the locations of the neighborhoods

#### Description: The csv file was used to read off the geographical coordinates of the neighborhoods. The file was read into a dataframe and added to the first one. The index on the second dataframe was dropped before adding it to the first one. A casual check was done to determine the borough with the most neighborhoods.

### Read the csv file, drop the index on the resulting dataframe and add the coordinates to the main dataframe

In [8]:
df1= pd.read_csv('http://cocl.us/Geospatial_data')

In [9]:
df1.reset_index(drop= True, inplace= True)

In [10]:
df['Latitude']= df1['Latitude'].values
df['Longitude']= df1['Longitude'].values

### A casual check on the number of postal codes was done for every borough

In [11]:
df.groupby('Borough').count()

Unnamed: 0_level_0,Postal Code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


### Observations: Initially, there were 180 rows, and after cleanup, the number of rows dropped to 103. 'North York' had the most number of postal codes with 24, followed by 'Downtown Toronto' with 19 and 'Scarborough' with 17.

## The dataframe including geographical coordinates for the postal codes

In [12]:
pd.set_option('display.max_rows', 103)
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


In [13]:
print('Shape of the dataframe:', df.shape)

Shape of the dataframe: (103, 5)


## 4. Analyzing the boroughs of Toronto

#### Description: A dataframe containing boroughs with Toronto in their names was created and a map was created using that dataframe.

### Create the dataframe

In [14]:
toronto_group= df[df['Borough'].str.contains('Toronto')]
toronto_group.count()

Postal Code     39
Borough         39
Neighborhood    39
Latitude        39
Longitude       39
dtype: int64

### Create the map

In [15]:
map_toronto = folium.Map(location=[43.6532,-79.3832],zoom_start=10) 

for lat,lng,borough,neighborhood in zip(toronto_group['Latitude'],toronto_group['Longitude'],toronto_group['Borough'],toronto_group['Neighborhood']):
    label = folium.Popup('borough' + 'neighborhood', parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='red',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)
map_toronto

## 5. Use K-Means Clustering for visualizing the neighborhood clusters

#### Description: The number of clusters used was 5 and the centroid initialization was random. The labels were created and inserted into the 'Cluster Labels' column that was added to the dataframe from the previous step. A map was generated using data from the resulting dataframe.

### Set the number of clusters to 5 and create the dataframe that contains the mapping of every neighborhood in that dataframe to a cluster label

In [16]:
# Set the number of clusters to 5
k = 5

#Retain only the coordinate columns for the clustering
toronto_group_clustering = toronto_group.drop(['Postal Code','Borough','Neighborhood'], axis= 1)

#Run the k-Means clustering
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_group_clustering)
toronto_group.insert(0,'Cluster Labels',kmeans.labels_,True)

toronto_group

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,4,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
42,4,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572
43,4,M4M,East Toronto,Studio District,43.659526,-79.340923
44,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,2,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316
49,2,M4V,Central Toronto,"Summerhill West , Rathnelly , South Hill , For...",43.686412,-79.400049


### Create the map after K-Means Clustering

In [17]:
# create map
map_clusters = folium.Map(location=[43.6532,-79.3832],zoom_start=10)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighborhood, cluster in zip(toronto_group['Latitude'], toronto_group['Longitude'], toronto_group['Neighborhood'], toronto_group['Cluster Labels']):
    label = folium.Popup(str(neighborhood) + 'Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Observations: Cluster sizes of 5- 8 were tried, and the neighborhoods seemed to fit visually the best in 5 clusters.