<h1>Append df data with longitude and latitude</h1>

In [1]:
#install Beautiful Soup and all the other packages
!pip install beautifulsoup4

!pip install lxml

import requests
from urllib.request import urlopen

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

#make a function that will pull in the site and parse.

def getWikipediaPage(site):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    site : str
        URL of the page to fetch (anything accepted by ``urlopen``).

    Returns
    -------
    BeautifulSoup
        The page content parsed with the ``lxml`` parser.
    """
    # Read the body inside a context manager so the HTTP response is closed
    # as soon as it has been consumed instead of being left open.
    with urlopen(site) as response:
        html = response.read()
    return BeautifulSoup(html, 'lxml')

#Now bring in the wiki page.

wiki = getWikipediaPage('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

#The table I need has the class "wikitable sortable", so pull the <tr> rows of that table.

table = wiki.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
rows = table.find_all('tr')


#Collect the text of every <td> cell, one list per table row.
#Rows without any <td> cells (e.g. a header row built from <th> cells) are skipped,
#otherwise they would end up as empty records in the DataFrame.

data = []

for row in rows:
    cells = row.find_all('td')
    if not cells:
        continue
    data.append([td.text.strip('\n') for td in cells])

#Read it into a dataframe

df = pd.DataFrame(data)

#Name the three scraped columns

df.columns = ['Postal Code','Borough','Neighborhood']

#Drop empty rows and rows where the Borough is not assigned.

df_dropp = df.dropna()  #Removes rows that contain missing values (e.g. empty rows)

#Filter the cleaned frame (df_dropp), not the raw df, so the dropna() above actually takes effect.
df_final = df_dropp[~df_dropp['Borough'].isin(['Not assigned'])]

#Load the latitude/longitude lookup table and attach it to each postal code.
#Only postal codes present in both frames are kept (inner join).
url = 'https://cocl.us/Geospatial_data'
df_longlat = pd.read_csv(url)
df_merged = df_final.merge(df_longlat, how='inner', on='Postal Code')
df_merged.head()



Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


<h2>Clustering Toronto Data</h2>

In [5]:
!pip install folium
import folium # map rendering library

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 14.7MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [6]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [10]:
#Cluster the neighborhoods into 4 clusters (I chose 4 arbitrarily)
kclusters = 4

#KMeans computes distances, so it can only be fitted on numeric features.
#Select the coordinate columns explicitly (rather than dropping the string columns)
#so the feature set stays the same even if more columns are added to df_merged later.
df_merged_clusters = df_merged[['Latitude', 'Longitude']]

#random_state is fixed so the cluster assignment is reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_merged_clusters)


In [12]:
#Adding the clusters back into my original DF.
#DataFrame.insert raises ValueError when the column already exists, so remove a
#label column left over from a previous run to keep this cell safe to re-run.
if 'Cluster Labels' in df_merged.columns:
    df_merged = df_merged.drop(columns='Cluster Labels')
df_merged.insert(0, 'Cluster Labels', kmeans.labels_)
df_merged.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,3,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
2,0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,2,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,0,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


In [18]:
#Let's see this broken down on the map

# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster,
# converted to hex strings so folium can use them
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels run from 0 to kclusters-1, so they index the colour list directly
    # (indexing with cluster-1 only worked because label 0 wrapped around to the last colour).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

<h2>Observations</h2>

<p>It looks like there is higher densi