# To explore and cluster the neighborhoods in Toronto

### Import necessary libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Passing None removes pandas' display limits, so frames are rendered with
# every column and every row instead of being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): pandas 1.0+ exposes this as `pandas.json_normalize` and deprecates
# the `pandas.io.json` path -- confirm against the installed pandas version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Simple confirmation that every import above succeeded.
print('Libraries imported.')

Libraries imported.


## 1. Download and Explore Dataset

In [2]:
# Download the Wikipedia article listing Canadian postal codes that start with "M".
# The timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error page be
# parsed later as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
website_url = response.text  # despite the name, this holds the page's HTML text

In [3]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML text with the lxml backend so it can be searched by tag.
soup = BeautifulSoup(markup=website_url, features='lxml')

In [4]:
# First <table> in the page whose class attribute is "wikitable sortable";
# find() yields None when no such table exists.
M_table = soup.find('table', attrs={'class': 'wikitable sortable'})



In [5]:
# Column names come from the table's <th> cells. Strip whitespace from every
# header (not only the last one) so a stray newline in any cell cannot end up
# in a column name; this also avoids an IndexError when no <th> is found.
headers = [header.text.strip() for header in M_table.find_all('th')]
headers

['Postcode', 'Borough', 'Neighbourhood']

In [6]:
# One list of stripped cell strings per <tr>; a row with no <td> cells yields [].
rows = [
    [cell.text.encode('utf8').strip().decode("utf-8") for cell in table_row.find_all('td')]
    for table_row in M_table.find_all('tr')
]

In [7]:
# Skip rows[0]: presumably the header <tr>, which has <th> but no <td> cells and
# therefore produced an empty list -- verify by inspecting `rows`.
df = pd.DataFrame(data=rows[1:], columns=headers)
df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 2. Data Preprocessing
### 1. Ignore cells with a borough that is Not assigned.

### 2. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [8]:

# Step 1: turn unassigned boroughs into NaN so that dropna() below discards those rows.
borough = df['Borough'].replace('Not assigned', float('nan'))

# Step 2: where the neighbourhood is unassigned, fall back to the borough of the same row.
# Series.mask aligns on the index, so the substitution is explicitly row-by-row.
# (The previous `if df['Borough'] is not None` guard was always true for a Series,
# and handing a whole Series to Series.replace as the replacement value relied on
# implicit positional behaviour.)
unassigned = df['Neighbourhood'] == 'Not assigned'
neighbourhood = df['Neighbourhood'].mask(unassigned, borough)

# Build the cleaned frame in one expression so re-running the cell gives the same result.
df = df.assign(Borough=borough, Neighbourhood=neighbourhood).dropna()
print(df.shape)
df.head() # Check all the Not assigned entries are removed


(210, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### 3. Grouping by Postcode to remove duplicate entries
#### Additional step: also group by Borough, to understand the postcode distribution and to support clustering

In [9]:
# Collapse to one entry per (Postcode, Borough) pair, keeping each distinct
# neighbourhood name once, in order of first appearance.
df1 = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].unique()

# Move the group keys back into ordinary columns, then flatten each array of
# names into a single comma-separated string.
df_final = df1.reset_index()
df_final['Neighbourhood'] = df_final['Neighbourhood'].map(', '.join)

df_final.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### 4. Shape of final dataset

In [10]:
# (rows, columns) of the grouped frame built in the previous cell.
df_final.shape

(103, 3)