# Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Download and Explore Dataset</a>

2. <a href="#item2">Explore Neighborhoods in Toronto</a>

3. <a href="#item3">Analyze Each Neighborhood</a>

4. <a href="#item4">Cluster Neighborhoods</a>

5. <a href="#item5">Examine Clusters</a>    
</font>
</div>

Before we get the data and start exploring it, let's import all the dependencies that we will need.

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Passing None removes pandas' display limits, so frames are shown without
# column or row truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as `pandas.json_normalize`;
# confirm this import path still resolves on the pandas version in use.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!pip install folium
import folium # map rendering library

#!pip install beautifulsoup4
from bs4 import BeautifulSoup # HTML parsing
import urllib
import requests # already imported above; this repeat is a harmless no-op
import re # regular expressions

#!pip install geocoder
import geocoder # import geocoder

print('Libraries imported.')

<a id='item1'></a>

## 1. Download and Explore Dataset

#### (i) download the data

In [None]:
# Page holding the table of postal codes that this notebook scrapes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page once, and fail loudly on an HTTP error instead of silently
# parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
HCE = BeautifulSoup(r.content, 'html.parser')

# Debug aid: the raw lines of the page that mention a table class. Built from
# the response fetched above rather than downloading the page a second time.
lst = [line.rstrip() for line in r.content.splitlines()
       if re.search('table class', line.decode('utf-8'))]
#print(lst)

# The attribute filter must be a dict ({'class': ...}); the previous set literal
# ({'class', ...}) only matched by accident.
table = HCE.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))

# Column titles as shown on the page.
headers = [header.text.strip() for header in table.find_all('th')]
print(headers)

# One list of stripped cell texts per <tr>. A row without <td> cells (such as a
# header row made only of <th>) yields an empty list; it is kept here so that
# the row positions seen by the following cells are unchanged.
rows = [[val.text.strip() for val in row.find_all('td')]
        for row in table.find_all('tr')]

#### (ii) transform the data into pandas dataframe

Start by building a dataframe from the scraped rows

In [None]:
# Labels applied to the values scraped from each table row.
column_names = ['Postcode', 'Borough', 'Neighborhood']

# Load the scraped rows into a dataframe under those labels.
df = pd.DataFrame(data=rows, columns=column_names)

# Preview the first ten rows together with the frame's shape.
(df.head(10), df.shape)

#### (iii) Clean up the data

In [None]:
# Keep only rows whose borough is assigned. The explicit copy makes the
# assignment below act on an independent frame rather than on a slice of the
# unfiltered one (which would trigger SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# If a cell has a borough but a 'Not assigned' neighborhood, the neighborhood
# becomes the borough name. `.loc` is the indexer that accepts a boolean mask
# (`.at` addresses a single scalar cell only); the right-hand side is aligned
# on the index, so only the masked rows are overwritten.
df.loc[df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = df['Borough']

# Drop the first and the last remaining row by position.
# NOTE(review): presumably the first row is the all-empty entry produced by the
# table's header row - confirm, and confirm that the last row is really meant to
# be discarded as well. Re-running this cell trims two more rows each time.
df = df.iloc[1:-1]

# Renumber the surviving rows 0..n-1.
df = df.reset_index(drop=True)

df.head(10), df.shape

In [None]:
# Distinct postcodes found in the cleaned frame, shown with their count.
postcode_data = pd.unique(df['Postcode'])
(postcode_data, postcode_data.shape)

Merge rows with same postcode

In [None]:
# Collapse the frame to one row per (Postcode, Borough) pair, joining the
# neighborhood names of each pair with commas. Grouping on the columns directly
# (instead of first moving them into the index with inplace=True) leaves `df`
# untouched, so this cell can be re-run safely. sort=False keeps the groups in
# order of first appearance.
neighborhoods = (
    df.groupby(['Postcode', 'Borough'], sort=False)[['Neighborhood']]
      .agg(','.join)
)

neighborhoods.head(10)

Use 0,1,2,3,... as index

In [None]:
# Turn the current index back into ordinary columns, leaving a default
# 0, 1, 2, ... index in its place.
neighborhoods = neighborhoods.reset_index()

neighborhoods.head(10)

Finally, check the number of rows

In [None]:
# Report how many rows the merged frame holds.
print('There are {} rows'.format(len(neighborhoods)))

#### (iv) Append latitude and longitude

Fetch the latitude/longitude data

In [None]:
# Download the coordinates CSV with `requests` instead of shelling out to wget:
# this works on any platform and raises on an HTTP error, whereas `wget -q`
# can fail silently and leave an unusable file behind.
geo_response = requests.get('https://cocl.us/Geospatial_data', timeout=30)
geo_response.raise_for_status()

# Keep a local copy under the same file name as before.
with open('Geospatial_data.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)

latlog = pd.read_csv('Geospatial_data.csv')
latlog.head()

Instantiate the dataframe

In [None]:
# Target layout: the three neighborhood columns followed by the coordinates.
column_names = ['Postcode', 'Borough', 'Neighborhood'] + ['Latitude', 'Longitude']

# Empty frame carrying only that column layout.
df2 = pd.DataFrame(columns=column_names)
df2

Find the latitude and longitude for each postcode

In [None]:
# Attach coordinates to every postcode with a single join instead of growing
# the frame row by row: `DataFrame.append` no longer exists in pandas 2.x, the
# loop was quadratic, and re-running it duplicated rows.
coords = (
    latlog[['Postal Code', 'Latitude', 'Longitude']]
    .drop_duplicates(subset='Postal Code')  # first match per postcode, as the old lookup used
    .rename(columns={'Postal Code': 'Postcode'})
)

# A left join keeps every neighborhood row in its original order; the indicator
# column records which rows found no partner in the coordinates table.
df2 = neighborhoods[['Postcode', 'Borough', 'Neighborhood']].merge(
    coords, on='Postcode', how='left', indicator=True
)

# A postcode without coordinates used to surface as a bare IndexError; report
# the offending postcodes explicitly instead.
unmatched = df2.loc[df2['_merge'] == 'left_only', 'Postcode'].tolist()
if unmatched:
    raise ValueError('No coordinates found for postcodes: {}'.format(unmatched))

df2 = df2.drop(columns='_merge')
df2.head()