# **WEEK 3 OF THE IBM CAPSTONE**
- THIS NOTEBOOK PARSES THE TORONTO POSTAL CODE WIKIPEDIA PAGE AND MERGES IT WITH GEOSPATIAL COORDINATES

# INSTALL LIBRARIES AND PACKAGES
## OBTAIN HTML DOC FOR TORONTO POSTAL CODE INFORMATION

**Link to Beautiful Soup Doc:**
https://beautiful-soup-4.readthedocs.io/en/latest/#installing-beautiful-soup

In [1]:
! pip install beautifulsoup4
! pip install requests
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 4.6MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/41/e7/3617a4b988ed7744743fb0dbba5aa0a6e3f95a9557b43f8c4740d296b48a/soupsieve-2.2-py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 soupsieve-2.2


In [2]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection, and raise_for_status() stops the notebook here on an HTTP error
# instead of letting later cells parse an error page.
html_doc = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
html_doc.raise_for_status()

# SCRAPE TORONTO WIKIPEDIA PAGE

In [4]:
# Parse the downloaded bytes into a navigable tree. Naming the parser
# explicitly avoids Beautiful Soup's "no parser was explicitly specified"
# warning and keeps the parse independent of which optional parsers
# (lxml, html5lib) happen to be installed in the environment.
soup = bs(html_doc.content, "html.parser")

## A CLEAN VIEW OF THE PAGE CODE

In [5]:
# Render the parsed tree as indented markup so its structure can be inspected.
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YCZ84honME3N@eJk-mqHAQAAAQw","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1004901187,"wgRevisionId":1004901187,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communications in 

## IDENTIFY THE TABLE BY HTML TAGS

In [6]:
# The first <tbody> element on the page is used as the table to parse.
tables = soup.select("tbody")[0]

# The <th> cells of the table's first row supply the column names.
header_row = tables.find("tr")
columns = header_row.find_all("th")
column_names = []
for header_cell in columns:
    column_names.append(str(header_cell.get_text()).strip())
column_names

['Postal Code', 'Borough', 'Neighbourhood']

## LOOP OVER EACH TABLE ROW TO BUILD A NEW DATAFRAME

In [7]:
# Collect the stripped text of every <td> in each table row. A row with no
# <td> cells contributes an empty list, which shows up as a row of None
# values once the frame is built.
rows = tables.find_all("tr")
table_records = []
for table_row in rows:
    cells = table_row.find_all("td")
    table_records.append([str(cell.get_text()).strip() for cell in cells])
df = pd.DataFrame(table_records, columns=column_names)

# CLEAN THE DATA

## VIEW THE FULL DATAFRAME AND ITS SHAPE

In [8]:
# Lift pandas' row and column display limits so frames are shown untruncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Report the dimensions, a plain-text preview, and then the rendered frame.
frame_shape = df.shape
print(frame_shape)
print(df.head())
df

(181, 3)
  Postal Code       Borough     Neighbourhood
0        None          None              None
1         M1A  Not assigned      Not assigned
2         M2A  Not assigned      Not assigned
3         M3A    North York         Parkwoods
4         M4A    North York  Victoria Village


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"


## REMOVE THE POSTAL CODES WITHOUT BOROUGH ASSIGNMENT

In [9]:
# Keep only the rows whose Borough differs from the 'Not assigned' placeholder.
has_borough = df['Borough'] != 'Not assigned'
df_trim = df[has_borough]
df_trim

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,,,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"


## SORT BY POSTAL CODE

In [10]:
# Order the rows by postal code. sort_values() already returns a new
# DataFrame, so the result is used directly without re-wrapping it in
# pd.DataFrame(...).
df_sort = df_trim.sort_values(by='Postal Code', ascending=True)
df_sort

Unnamed: 0,Postal Code,Borough,Neighbourhood
10,M1B,Scarborough,"Malvern, Rouge"
19,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
28,M1E,Scarborough,"Guildwood, Morningside, West Hill"
37,M1G,Scarborough,Woburn
46,M1H,Scarborough,Cedarbrae
55,M1J,Scarborough,Scarborough Village
64,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
73,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
82,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
91,M1N,Scarborough,"Birch Cliff, Cliffside West"


## DROP THE EMPTY ROW

In [12]:
# Remove the all-empty row (shown with index 0 in the frame above).
# Dropping rows whose every value is missing selects that row by content,
# so this cell does not depend on the row still carrying index label 0.
df_sort2 = df_sort.dropna(how='all')
df_sort2

Unnamed: 0,Postal Code,Borough,Neighbourhood
10,M1B,Scarborough,"Malvern, Rouge"
19,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
28,M1E,Scarborough,"Guildwood, Morningside, West Hill"
37,M1G,Scarborough,Woburn
46,M1H,Scarborough,Cedarbrae
55,M1J,Scarborough,Scarborough Village
64,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
73,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
82,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
91,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [13]:
# Dimensions of the cleaned neighbourhood table.
cleaned_shape = df_sort2.shape
print(cleaned_shape)

(103, 3)


## LOAD THE COORDINATES TABLE

In [13]:
# Load the coordinates table from the local CSV. read_csv() already returns
# a DataFrame, so it is used as-is rather than being passed through
# pd.DataFrame() again; both names refer to that same frame.
coords = pd.read_csv('Geospatial_Coordinates.csv')
coord_df = coords
coord_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [15]:
# Dimensions of the coordinates table loaded into coord_df.
print(coord_df.shape)

(103, 3)


## MERGE THE NEIGHBOURHOOD TABLE AND COORD TABLE BY POSTAL CODE

In [14]:
# Join the neighbourhood rows to the coordinate rows that share a postal code.
merged = df_sort2.merge(coord_df, on='Postal Code')

NameError: name 'df_sort2' is not defined

## CONFIRM AND COMPARE

In [17]:
# Print the shapes of both inputs and of the merged result for comparison.
for frame in (df_sort2, coord_df, merged):
    print(frame.shape)

(103, 3)
(103, 3)
(103, 5)


In [18]:
# Show the first and the last rows of the merged table.
merged_top = merged.head()
merged_bottom = merged.tail()
print(merged_top)
print(merged_bottom)

  Postal Code      Borough                           Neighbourhood   Latitude  \
0         M1B  Scarborough                          Malvern, Rouge  43.806686   
1         M1C  Scarborough  Rouge Hill, Port Union, Highland Creek  43.784535   
2         M1E  Scarborough       Guildwood, Morningside, West Hill  43.763573   
3         M1G  Scarborough                                  Woburn  43.770992   
4         M1H  Scarborough                               Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  
    Postal Code    Borough                                      Neighbourhood  \
98          M9N       York                                             Weston   
99          M9P  Etobicoke                                          Westmount   
100         M9R  Etobicoke  Kingsview Village, St. Phillips, Martin Grove ...   
101         M9V  Etobicoke  South Steeles, Silverstone, Humbergate, Jamest...   
102         M9W  E

## SAVE THE RESULTING DATAFRAME FOR USE IN SUPPLEMENTAL NOTEBOOKS

In [20]:
# Write the merged table to a CSV file so other notebooks can load it.
output_csv = 'Neighborhoods and locations.csv'
merged.to_csv(output_csv)