# Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Download and Explore Dataset</a>

2. <a href="#item2">Explore Neighborhoods in Toronto</a>

3. <a href="#item3">Analyze Each Neighborhood</a>

4. <a href="#item4">Cluster Neighborhoods</a>

5. <a href="#item5">Examine Clusters</a>    
</font>
</div>

Before we get the data and start exploring it, let's download all the dependencies that we will need.

In [70]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!pip install folium
import folium # map rendering library

#!pip install beautifulsoup4
from bs4 import BeautifulSoup
import urllib
import requests
import re

#!pip install geocoder
import geocoder # import geocoder

print('Libraries imported.')

Libraries imported.


<a id='item1'></a>

## 1. Download and Explore Dataset

#### (i) download the data

In [71]:
# specify the url
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
        
r = requests.get(url)
HCE = BeautifulSoup(r.content)
type(HCE)

htmlpage = urllib.request.urlopen(url)

lst = []
for line in htmlpage:
    line = line.rstrip()
    if re.search('table class', line.decode('utf-8')) :
        lst.append(line)
#print(lst)

table=HCE.find('table', {'class', 'wikitable sortable'})
type(table)

headers= [header.text.lstrip('\n').strip() for header in table.find_all('th')]
print(headers)

rows = []
for row in table.find_all('tr'):
    rows.append([val.text.lstrip('\n').strip() for val in row.find_all('td')])

['Postcode', 'Borough', 'Neighbourhood']


#### (ii) transform the data into pandas dataframe

Start by creating an empty dataframe

In [72]:
# define the dataframe columns
column_names = ['Postcode','Borough', 'Neighborhood'] 

# instantiate the dataframe
df = pd.DataFrame(rows, columns=column_names)

df.head(10), df.shape

(  Postcode           Borough      Neighborhood
 0     None              None              None
 1      M1A      Not assigned      Not assigned
 2      M2A      Not assigned      Not assigned
 3      M3A        North York         Parkwoods
 4      M4A        North York  Victoria Village
 5      M5A  Downtown Toronto      Harbourfront
 6      M5A  Downtown Toronto       Regent Park
 7      M6A        North York  Lawrence Heights
 8      M6A        North York    Lawrence Manor
 9      M7A      Queen's Park      Not assigned, (290, 3))

#### (iii)Clean up the data

In [73]:
# drop row with Borough == 'Not assigned'
df = df[df['Borough'] != 'Not assigned']

# If a cell has a borough but a Not assigned neighborhood, the neighborhood will be the same as the borough.
df.at[df['Neighborhood'] == 'Not assigned','Neighborhood'] = df[df['Neighborhood'] == 'Not assigned']['Borough']

df = df[1:-1]

df = df.reset_index(drop=True)

df.head(10), df.shape

(  Postcode           Borough      Neighborhood
 0      M3A        North York         Parkwoods
 1      M4A        North York  Victoria Village
 2      M5A  Downtown Toronto      Harbourfront
 3      M5A  Downtown Toronto       Regent Park
 4      M6A        North York  Lawrence Heights
 5      M6A        North York    Lawrence Manor
 6      M7A      Queen's Park      Queen's Park
 7      M9A         Etobicoke  Islington Avenue
 8      M1B       Scarborough             Rouge
 9      M1B       Scarborough           Malvern, (211, 3))

In [74]:
postcode_data = df['Postcode'].unique()
postcode_data, postcode_data.shape

(array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
        'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
        'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
        'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
        'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
        'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
        'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
        'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
        'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
        'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
        'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
        'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object), (103,))

Merge rows with same postcode

In [75]:
#Then let's loop through the data and fill the dataframe one row at a time.
df.set_index(['Postcode','Borough'],inplace=True)

neighborhoods = df.groupby(level=['Postcode','Borough'], sort=False).agg(','.join)

neighborhoods.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront,Regent Park"
M6A,North York,"Lawrence Heights,Lawrence Manor"
M7A,Queen's Park,Queen's Park
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,"Rouge,Malvern"
M3B,North York,Don Mills North
M4B,East York,"Woodbine Gardens,Parkview Hill"
M5B,Downtown Toronto,"Ryerson,Garden District"


Use 0,1,2,3,... as index

In [76]:
neighborhoods = neighborhoods.reset_index(drop=False)

neighborhoods.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


Finally, Check the number of rows

In [77]:
print('There are {} rows'.format(neighborhoods.shape[0]))

There are 103 rows


#### (iv) Append latitude and longtitude

Fetch the latitude/longtitude data

In [78]:
!wget -q -O 'Geospatial_data.csv' https://cocl.us/Geospatial_data
latlog = pd.read_csv('Geospatial_data.csv')
latlog.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


Instantiate the dataframe

In [79]:
column_names = ['Postcode','Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

df2 = pd.DataFrame(columns=column_names)
df2

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude


Find the latitude and longtitude for each postcode

In [80]:
for index, row in neighborhoods.iterrows():
    postcode = row['Postcode']
    borough = row['Borough']
    neighborhood = row['Neighborhood']
    
    latitude = latlog[latlog['Postal Code'] == postcode]['Latitude']
    longitude = latlog[latlog['Postal Code'] == postcode]['Longitude']
        
    df2 = df2.append({'Postcode': postcode,
                      'Borough': borough,
                      'Neighborhood': neighborhood,
                      'Latitude': latitude.values[0],
                      'Longitude': longitude.values[0]}, ignore_index=True)
df2.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
