### This notebook will be used to explore, segment, and cluster the neighborhoods in the city of Toronto as practice for the final capstone project

In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib as plt

%matplotlib inline 

In [2]:
#scraping Toronto city data from the Wikipedia page
scrape = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
#pd.read_html returns a list of dataframes; let's see how many there are before exploring them
len(scrape)

3

In [3]:
#preview the first dataframe in the scraped list
scrape[0].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
#preview the second dataframe in the scraped list
scrape[1].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,,Canadian postal codes,,,,,,,,,,,,,,,,
1,NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...,NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...,NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...,,,,,,,,,,,,,,,
2,NL,NS,PE,NB,QC,QC,QC,ON,ON,ON,ON,ON,MB,SK,AB,BC,NU/NT,YT
3,A,B,C,E,G,H,J,K,L,M,N,P,R,S,T,V,X,Y


In [5]:
#preview the third dataframe in the scraped list
scrape[2].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,NL,NS,PE,NB,QC,QC,QC,ON,ON,ON,ON,ON,MB,SK,AB,BC,NU/NT,YT
1,A,B,C,E,G,H,J,K,L,M,N,P,R,S,T,V,X,Y


In [6]:
#the Toronto postcode/borough/neighbourhood table is the first element of the scrape list
#(toronto_data is bound to that same dataframe object; no copy is made here)
toronto_data= scrape[0]
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
#cleaning up data for further analysis
#Steps: drop rows with no Borough, fill unnamed Neighbourhoods with their Borough,
#then merge rows that share a postal code into one comma-separated Neighbourhood row.

#getting rid of Boroughs without designation
#.copy() makes the filtered rows an independent dataframe, so the assignment below writes
#to it directly instead of to a slice (which can raise SettingWithCopyWarning)
toronto_data = (
    toronto_data[toronto_data['Borough'] != 'Not assigned']
    .copy()
    .reset_index(drop=True)
)

#assign Borough name to Neighbourhoods without designation
#a single label-based, vectorized assignment replaces the row loop over positional
#iloc column numbers, so it no longer depends on column order
unassigned_mask = toronto_data['Neighbourhood'] == 'Not assigned'
neigh_notassigned = toronto_data.index[unassigned_mask].to_list() #row labels that get filled in
toronto_data.loc[unassigned_mask, 'Neighbourhood'] = toronto_data.loc[unassigned_mask, 'Borough']

#combining Neighbourhoods with same postal codes into a single Neighbourhood row
unique_postcodes = toronto_data['Postcode'].unique().shape[0] #using later for accuracy check
toronto_data = (
    toronto_data
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

#checking for accuracy: the new dataframe should be the same row size as unique postal codes before combination
#(grouping is by Postcode AND Borough, so a postcode listed under two Boroughs would produce extra rows)
is_accurate = toronto_data.shape[0] == unique_postcodes
print('Is dataframe accurate: {}'.format(is_accurate))
#fail loudly on Restart & Run All instead of only printing False
assert is_accurate, 'expected exactly one row per unique postal code after combining Neighbourhoods'

Is dataframe accurate: True


In [11]:
#displaying the first rows of the cleaned dataframe (one row per postal code)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
#print the (rows, columns) shape of the cleaned dataframe
print(toronto_data.shape)

(103, 3)
