### Toronto Neighbourhoods

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Wikipedia page listing the "M" postal codes; its tables are scraped below.
WikiWebsite = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html hands back a list holding one DataFrame per table found on the page.
Data = pd.read_html(io=WikiWebsite)

In [3]:
# Display everything the scrape returned (a list of DataFrames) to see which table we need.
Data

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z      Not assigned   
 176         M6Z      Not assigned   
 177         M7Z      Not assigned   
 178         M8Z         Etobicoke   
 179         M9Z      Not assigned   
 
                                          Neighbourhood  
 0                                         Not assigned  
 1                                         Not assigned  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                            Regent Park, Harbourfront  
 ..                                                 ...  
 175                                       Not assigned  
 176                                       Not assigned  
 177                

Notice that the scrape returned all three tables present on the Wikipedia page; we are only interested in the first one, since it is our data table.

In [4]:
PostData = Data[0] # Keep only the first scraped table: it holds the postal-code data we need

In [5]:
# Discard every row whose borough is the placeholder 'Not assigned', then
# renumber the remaining rows from 0 without keeping the old index as a column.
PostData = (
    PostData
    .loc[PostData['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
PostData.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Creating a pivot table to see how many repetitions we have.

In [6]:
# Count how many rows are recorded under each postal code;
# a count above 1 would reveal a duplicated code.
Pivot = pd.pivot_table(PostData, index=['Postal Code'], aggfunc='size')
Pivot

Postal Code
M1B    1
M1C    1
M1E    1
M1G    1
M1H    1
      ..
M9N    1
M9P    1
M9R    1
M9V    1
M9W    1
Length: 103, dtype: int64

OK, there are no duplicates in the data and not a single NaN value: every postal code appears exactly once, so we are good to go.

Now we match these data against the available geospatial CSV to obtain the latitude and longitude of each postal code, looking each code up in that table.

In [7]:
# Attach a latitude and a longitude to every row of PostData, using the
# geospatial CSV as a lookup table keyed by postal code.
url = 'https://cocl.us/Geospatial_data'
Geo = pd.read_csv(url, sep=',')

# Index the coordinates by postal code so each lookup is a direct key access
# rather than a scan of the whole table. Only the first row per code is kept,
# because the lookup below needs a unique index.
coords_by_code = Geo.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Fail loudly, naming the offending codes, when a postal code has no coordinates,
# instead of leaving NaN values or misaligned rows behind.
unmatched = PostData.loc[~PostData['Postal Code'].isin(coords_by_code.index), 'Postal Code']
if not unmatched.empty:
    raise ValueError(
        'No coordinates found for postal codes: {}'.format(', '.join(unmatched.astype(str)))
    )

# The lookups are aligned on PostData's own rows, so row order is preserved and
# re-running the cell simply overwrites the two columns with the same values.
PostData['Lat'] = PostData['Postal Code'].map(coords_by_code['Latitude'])
PostData['Long'] = PostData['Postal Code'].map(coords_by_code['Longitude'])
PostData.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Lat,Long
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [8]:
#------------------------------To import the libraries necessary to show and cluster our data--------------------------------
import json # Handling JSON files
import requests # Library to handle requests
# json_normalize transforms a JSON file into a pandas dataframe. It is exposed at
# the top level of pandas since version 1.0; the pandas.io.json location is
# deprecated, so it is only used as a fallback for older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium as fl # Map rendering library

Let's take a look at how this can be represented on a map.

In [9]:
# Toronto's coordinates, used to centre the map
lat = 43.651070
long = -79.347015
# Creating the city map
map_toronto = fl.Map(location=[lat, long], zoom_start=10)
# Iterate over all the neighbourhoods in PostData and add each one to the map as
# a circle marker. The loop uses its own variable names so that lat/long still
# hold the city-centre coordinates once this cell has run.
marker_rows = zip(
    PostData["Lat"],
    PostData["Long"],
    PostData["Neighbourhood"],
    PostData["Borough"],
)
for point_lat, point_long, Neigh, Borough in marker_rows:
    # Popup text shown when a marker is clicked: "<neighbourhood>,<borough>"
    label = '{},{}'.format(Neigh, Borough)
    label = fl.Popup(label, parse_html=True)
    fl.CircleMarker(
        [point_lat, point_long],
        radius=10,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # CircleMarker one; presumably ignored here - confirm before removing.
        parse_html=False,
    ).add_to(map_toronto)
map_toronto