# Install libraries

In [1]:
!pip install bs4
!pip install requests
!pip install lxml
!pip install html5lib

from bs4 import BeautifulSoup
import requests
import pandas as pd

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


# Extract table from webpage

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

my_table = BeautifulSoup(source , features = "lxml").find('table').prettify()


my_table = pd.read_html(my_table)
my_table = pd.DataFrame(my_table[0], columns = ['Postcode', 'Borough', 'Neighbourhood'])
my_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Drop 'Not assigned' Boroughs

In [3]:
borough_not_assigned = my_table[my_table['Borough'] == "Not assigned"].index
my_table.drop(borough_not_assigned, inplace = True)
my_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


# Concatonate Neighbourhoods to same postcode

In [4]:
my_table_concat = my_table.groupby(['Postcode', 'Borough'], as_index = False).agg({'Neighbourhood': ','.join})
my_table_concat.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Find Neighbourhoods that are 'Not assigned' and change

In [5]:
neigh_not_assigned = my_table_concat[my_table_concat['Neighbourhood'] == "Not assigned"].index
print(neigh_not_assigned)                                #Find positions where Neighbour is not assigned
print(my_table_concat.loc[[93]])                         #Check
my_table_concat.ix[93, 'Neighbourhood'] = "Queen's Park" #Change manually
print(my_table_concat.loc[[93]])                         #Check to see if it updated

Int64Index([93], dtype='int64')
   Postcode       Borough Neighbourhood
93      M9A  Queen's Park  Not assigned
   Postcode       Borough Neighbourhood
93      M9A  Queen's Park  Queen's Park


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


# Print size

In [6]:
my_table_concat.size

309

# Download CSV of postcodes

In [7]:
postcodes = pd.read_csv("http://cocl.us/Geospatial_data")
postcodes.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merge tables 

In [8]:
merged_table = pd.merge(left=my_table_concat, right=postcodes, left_on='Postcode', right_on='Postal Code')
merged_table.drop(["Postal Code"], axis = 1, inplace = True)
merged_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Clustering

In [9]:
!pip install sklearn
!pip install folium

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/76/03/bb/589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0
Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 13.6MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed b

In [17]:
from sklearn.cluster import KMeans
import numpy as np
locations = np.array(postcodes[["Latitude", "Longitude"]])
kmeans = KMeans(n_clusters = 6, random_state = 7).fit(locations)
centres = pd.DataFrame(kmeans.cluster_centers_, columns = ["Latitude","Longitude"])
print(centres)
import folium
# instantiate a feature group for the incidents in the dataframe
incidents = folium.map.FeatureGroup()

# loop through the 100 crimes and add each to the incidents feature group
for lat, lng, in zip(centres.Latitude, centres.Longitude):
    incidents.add_child(
        folium.vector_layers.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6
        )
    )

# add incidents to map
toronto_map = folium.Map(location = [43.6529,-79.3849], zoom_start = 10)
toronto_map.add_child(incidents)

    Latitude  Longitude
0  43.661512 -79.395144
1  43.776575 -79.242084
2  43.727425 -79.500709
3  43.700474 -79.320872
4  43.653629 -79.531249
5  43.754850 -79.396386


NameError: name 'centres' is not defined