## Libraries for data analysis & web scraping

In [70]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

print('Libraries imported.')

Libraries imported.


In [73]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


In [72]:
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


## Access the website and read the page

In [74]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status() stops
# the run on an HTTP error instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# The document is HTML, so it is parsed with Python's built-in HTML parser rather
# than Beautiful Soup's XML parser.
soup = BeautifulSoup(source, 'html.parser')

## Find the data inside the HTML 'table' tag

In [75]:
# Locate the first <table> element that carries the "wikitable" CSS class.
tb1 = soup.find('table', {'class': 'wikitable'})

# find() returns None when nothing matches; fail here with a clear message rather
# than with an AttributeError when the rows are read from the table.
if tb1 is None:
    raise ValueError("No table with class 'wikitable' was found on the page")

# Accumulator for the scraped rows: one list of cell strings per table row.
list_can = []

## Convert the 'td' cells of each 'tr' row to text and add them to the list

In [76]:
# Build the row list from scratch (instead of appending to the existing list) so
# that re-running this cell does not duplicate every row.
# Each <tr> becomes one list holding the stripped text of its <td> cells; a row
# without any <td> cell yields an empty list, exactly as before.
list_can = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in tb1.find_all('tr')
]

## Create a pandas dataframe with the 3 columns

In [77]:
# Turn the scraped rows into a three-column table (default integer index).
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data=list_can, columns=column_names)

# Number of non-null entries in each column.
df.count()

PostalCode      180
Borough         180
Neighborhood    180
dtype: int64

## Remove the rows whose 'Borough' is 'Not assigned' and the empty header row (all None values)

In [78]:
# Keep only the rows that have a real borough:
#   * rows whose Borough is missing (the table row without any 'td' cell) are dropped
#     by value rather than by position, so the result does not depend on that row
#     being first, and re-running the cell does not remove another valid row;
#   * rows explicitly marked 'Not assigned' are dropped as before.
has_borough = df['Borough'].notna() & (df['Borough'] != 'Not assigned')
df = df[has_borough]

## Sort the table and reset the index

In [79]:
# Order the rows by postal code and renumber them 0..n-1; 'PostalCode' stays the
# first column.
df = df.sort_values(by='PostalCode').reset_index(drop=True)

## Replace '/' with ',' in the last column

In [80]:
# Swap every '/' separator in the neighborhood names for a ','.
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)

## The dataframe displaying the Canadian postal codes starting with M, with their boroughs and neighborhoods

In [81]:
# Display the cleaned table (PostalCode, Borough, Neighborhood).
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [82]:
# Report the (rows, columns) size of the cleaned table.
print('Shape of the dataframe: {}'.format(df.shape))

Shape of the dataframe: (103, 3)


## Access and read the geographical coordinates off the csv file

In [83]:
# Location of the CSV file holding the geographical coordinates.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'

# Load the coordinates into a second dataframe.
df1 = pd.read_csv(GEOSPATIAL_CSV_URL)

## Drop the index on the second dataframe before adding it to the main one

In [84]:
# Give the coordinates table a fresh 0..n-1 index, discarding the old one.
df1 = df1.reset_index(drop=True)

## Add the Latitude and Longitude from the second dataframe to the main one

In [85]:
# Attach the coordinates by matching on the postal code instead of by row position,
# so a difference in row order between the two tables can never pair a postal code
# with another code's coordinates.
# NOTE(review): assumes the CSV key column is named 'Postal Code' (or already
# 'PostalCode'); a different name raises a KeyError here — confirm against the file.
geo = df1.rename(columns={'Postal Code': 'PostalCode'})[['PostalCode', 'Latitude', 'Longitude']]

# Dropping any coordinate columns left from a previous run keeps the cell re-runnable;
# validate= makes pandas raise if a postal code is duplicated on either side.
df = (
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
      .merge(geo, on='PostalCode', how='left', validate='one_to_one')
)

# A left join leaves NaN where no match was found; stop early if that happened.
assert df[['Latitude', 'Longitude']].notna().all().all(), 'postal code without coordinates'

## A casual check on the number of postal codes in every borough

In [91]:
# Count the non-null entries of every column within each borough.
rows_by_borough = df.groupby(by='Borough')
rows_by_borough.count()

Unnamed: 0_level_0,PostalCode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


## Observations: Initially, there were 180 rows; after cleanup, the number of rows dropped to 103. North York had the largest number of postal codes with 24, followed by Downtown Toronto with 19 and Scarborough with 17.

## The dataframe including geographical coordinates for the postal codes

In [92]:
# Let pandas print every row of the table. The limit is taken from the table itself
# rather than a hard-coded row count, so the display stays complete if the number of
# postal codes changes.
pd.set_option('display.max_rows', len(df))

# Display the table including the Latitude/Longitude columns.
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


In [93]:
# Report the (rows, columns) size of the table after adding the coordinates.
print('Shape of the dataframe: {}'.format(df.shape))

Shape of the dataframe: (103, 5)


## Isolate the boroughs of Toronto and analyze them

In [95]:
# Select the rows whose borough name contains 'Toronto'.
is_toronto = df['Borough'].str.contains('Toronto')
toronto_group = df.loc[is_toronto]

# Number of non-null entries in each column of the selection.
toronto_group.count()

PostalCode      39
Borough         39
Neighborhood    39
Latitude        39
Longitude       39
dtype: int64

## Create the map

In [103]:
# Base map centred on the coordinates [43.6532, -79.3832].
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# Add one circle marker per row of the Toronto selection.
for lat, lng, borough, neighborhood in zip(
    toronto_group['Latitude'],
    toronto_group['Longitude'],
    toronto_group['Borough'],
    toronto_group['Neighborhood'],
):
    # Build the popup from the loop variables; the quoted words 'neighborhood' and
    # 'borough' would give every marker the same fixed text "neighborhoodborough".
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_toronto)

# Render the map as the cell output.
map_toronto

## Use K-Means clustering: set the number of clusters and then run the process

In [101]:
# Number of clusters for K-Means (also used when colouring the cluster map).
k = 5

# Start from a frame without a 'Cluster Labels' column so that re-running this cell
# does not add a second one. drop() returns a new frame, so the labels inserted
# below are not written into a selection of `df`.
toronto_group = toronto_group.drop(columns='Cluster Labels', errors='ignore')

# Feature matrix for the clustering.
# NOTE(review): the original referenced `toronto_group_cluster` without defining it
# anywhere in the visible notebook; the coordinates are assumed to be the intended
# features — confirm.
toronto_group_cluster = toronto_group[['Latitude', 'Longitude']]

# `k` replaces the undefined name `kclusters`, which raised a NameError on a fresh
# kernel. random_state=0 is kept from the original call.
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_group_cluster)

# Put the cluster label of every row in the first column.
toronto_group.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_group

Unnamed: 0,Cluster Labels,PostalCode,...,Latitude,Longitude
37,4,M4E,...,43.676357,-79.293031
41,4,M4K,...,43.679557,-79.352188
42,4,M4L,...,43.668999,-79.315572
43,4,M4M,...,43.659526,-79.340923
44,1,M4N,...,43.72802,-79.38879
45,1,M4P,...,43.712751,-79.390197
46,1,M4R,...,43.715383,-79.405678
47,1,M4S,...,43.704324,-79.38879
48,1,M4T,...,43.689574,-79.38316
49,1,M4V,...,43.686412,-79.400049


## Create the map after the K-Means Clustering

In [102]:
# Base map centred on the coordinates [43.6532, -79.3832].
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# One hex colour per cluster, sampled at k evenly spaced points of matplotlib's
# rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per row, coloured by its cluster label.
for lat, lon, neighborhood, cluster in zip(
    toronto_group['Latitude'],
    toronto_group['Longitude'],
    toronto_group['Neighborhood'],
    toronto_group['Cluster Labels'],
):
    # Spaces around 'Cluster' keep the neighborhood name and the label readable.
    label = folium.Popup('{} Cluster {}'.format(neighborhood, cluster), parse_html=True)

    # The displayed labels start at 0, so they index the colour list directly;
    # `cluster - 1` only worked for label 0 through negative-index wraparound.
    cluster_color = rainbow[cluster]

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Render the map as the cell output.
map_clusters