## Libraries for data analysis & web scraping

In [1]:
# Array and dataframe tooling.
import numpy as np
import pandas as pd
# HTML parsing and HTTP download for the scraping step.
from bs4 import BeautifulSoup
import requests

# Visible confirmation that every import above succeeded.
print('Libraries imported.')

Libraries imported.


## Access the website and read the page

In [2]:
# Download the Wikipedia page and parse it into a BeautifulSoup tree.
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
# NOTE(review): 'xml' selects an XML parser for what is an HTML page; it is
# kept unchanged because switching parsers may alter how attributes such as
# `class` are matched below — TODO confirm whether an HTML parser is intended.
soup = BeautifulSoup(source, 'xml')

## Find the HTML 'table' element that holds the data

In [3]:
# Start with an empty row container, then look up the first <table> element
# whose class is 'wikitable'.
list_can = list()
tb1 = soup.find('table', class_='wikitable')

## Convert the 'td' cells of each table row to text and add them to the list

In [4]:
# Build the row list in a single assignment so that re-running this cell
# cannot append the same rows a second time.
# Each <tr> becomes a list of the stripped texts of its <td> cells; a row
# that contains no <td> cells yields an empty list.
list_can = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in tb1.find_all('tr')
]

## Create a pandas dataframe with the 3 columns

In [5]:
# Assemble the scraped rows into a three-column frame, then show the number
# of non-null values in each column.
df = pd.DataFrame(
    data=list_can,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df.count()

PostalCode      180
Borough         180
Neighborhood    180
dtype: int64

## Remove 'Not assigned' from the 'Borough' column

In [6]:
# Keep only the rows whose Borough differs from the 'Not assigned' placeholder.
df = df.loc[df['Borough'].ne('Not assigned')]

## Remove the first row with None values

In [7]:
# Drop rows in which every column is missing (the placeholder left by a
# table row that had no <td> cells). Selecting by content rather than by
# position means re-running this cell can never remove a real data row.
df = df.dropna(how='all')

## Sort the table and reset the index

In [8]:
# Order the rows by postal code. Moving 'PostalCode' into the index and
# straight back out leaves it as a regular column and renumbers the rows
# from 0.
df = (
    df.sort_values(by='PostalCode')
      .set_index('PostalCode')
      .reset_index()
)

## Replace '/' with ',' in the last column

In [9]:
# Swap every '/' for ',' with the vectorised string accessor instead of a
# per-row Python lambda. regex=False makes this a literal substitution, and
# missing values pass through instead of raising AttributeError.
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)

## The dataframe displaying postal codes of Canada starting with M, its boroughs and neighborhoods

In [10]:
# Preview the first 12 rows of the cleaned frame.
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [11]:
# df.shape is a (rows, columns) tuple.
print('Shape of the dataframe:', df.shape)

Shape of the dataframe: (103, 3)


## Access and read the geographical coordinates off the csv file

In [12]:
# Load the geospatial CSV into a second frame; its 'Latitude' and 'Longitude'
# columns are copied into df further down.
# NOTE(review): this is a plain-http short link with no local copy — TODO
# confirm it still resolves, and consider caching the file.
df1= pd.read_csv('http://cocl.us/Geospatial_data')

## Reset the index of the second dataframe before adding its columns to the main one

In [13]:
# Give df1 a fresh 0..n-1 index, discarding the old one rather than keeping
# it as a column.
df1 = df1.reset_index(drop=True)

## Add the Latitude and Longitude from the second dataframe to the main one

In [14]:
# Attach the coordinates by row position: .values strips df1's index, so
# row i of df1 lands on row i of df.
# NOTE(review): this presumes df1 lists the same postal codes in the same
# order as df — TODO confirm, or join on the postal-code column instead.
df = df.assign(
    Latitude=df1['Latitude'].values,
    Longitude=df1['Longitude'].values,
)

## A casual check on the number of postal codes in every borough

In [15]:
# Count the non-null entries of every other column within each borough.
df.groupby('Borough').count()

Unnamed: 0_level_0,PostalCode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


## Observations: Initially, there were 180 rows; after cleanup, the number of rows dropped to 103. North York had the most postal codes (24), followed by Downtown Toronto (19) and Scarborough (17).

## The dataframe including geographical coordinates for the postal codes

In [16]:
# Cap displayed frames at 5 columns, then preview the first 12 rows
# including the coordinate columns.
pd.options.display.max_columns = 5
df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


In [17]:
# df.shape is a (rows, columns) tuple.
print('Shape of the dataframe:', df.shape)

Shape of the dataframe: (103, 5)
