In [1]:
# Authors: Rachel Dunn, Janson Lin, Anouk de Brouwer
# Date created: Sep 12, 2020

In [2]:
# import libraries
from datetime import datetime
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [3]:
# scrape html from website
# The URL path contains the year (2020) and the station number (7965); the page
# title in the output below identifies the station as Comox.
# A timeout is set so an unresponsive server cannot hang the kernel forever,
# and raise_for_status() stops the notebook on an HTTP error instead of
# silently parsing an error page as if it were the tide table.
r = requests.get('https://www.waterlevels.gc.ca/eng/data/table/2020/wlev_sec/7965', timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
print(soup)


<!DOCTYPE html>

<!--[if IE 7]><html lang="en" class="no-js ie7"><![endif]-->
<!--[if IE 8]><html lang="en" class="no-js ie8"><![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
<!--<![endif]-->
<head>
<meta charset="utf-8"/>
<!-- Title begins / Début du titre -->
<title>Comox - Tides, Currents, and Water Levels</title>
<!-- Title ends / Fin du titre -->
<!-- Favicon (optional) begins / Début du favicon (optionnel) -->
<link href="/Content/WET/dist/theme-gcwu-fegc/images/favicon.ico" rel="shortcut icon"/>
<!-- Favicon (optional) ends / Fin du favicon (optionnel) -->
<!-- Meta-data begins / Début des métadonnées -->
<meta content="Tides, Currents, and Water Levels" name="dcterms.description"/>
<meta content="Tides, Currents, and Water Levels" name="description"/>
<meta content="Careers, Environment, Aquaculture, AZMP, COE, ACRDP, Ocean Science, Oceanography, Climate Science" name="keywords"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta con

In [4]:
# the website has a table with data for each month,
# get all tables by looking for <table> elements with the 'width-100' class
# (restricting the search to <table> tags keeps any other element that might
# share this class out of the list; the loop below expects each entry to be a
# table with a <caption>)
tables = soup.find_all('table', class_='width-100')
print(tables[0]) # print the January table

<table class="width-100">
<caption>January 2020</caption>
<thead>
<tr>
<th>Day</th>
<th>
<span class="wrap-none">Time</span>
</th>
<th>Height<br/>(m)</th>
</tr>
</thead>
<tbody>
<tr>
<td class="align-right">1</td>
<td class="align-right">2:59 AM</td>
<td class="align-right">1.8</td>
</tr>
<tr>
<td class="align-right">1</td>
<td class="align-right">10:22 AM</td>
<td class="align-right">4.9</td>
</tr>
<tr>
<td class="align-right">1</td>
<td class="align-right">5:14 PM</td>
<td class="align-right">3.0</td>
</tr>
<tr>
<td class="align-right">1</td>
<td class="align-right">9:30 PM</td>
<td class="align-right">3.5</td>
</tr>
<tr>
<td class="align-right">2</td>
<td class="align-right">3:36 AM</td>
<td class="align-right">2.2</td>
</tr>
<tr>
<td class="align-right">2</td>
<td class="align-right">10:54 AM</td>
<td class="align-right">4.9</td>
</tr>
<tr>
<td class="align-right">2</td>
<td class="align-right">6:09 PM</td>
<td class="align-right">2.7</td>
</tr>
<tr>
<td class="align-right">2</td>


In [5]:
# loop over tables to get the year, month, day, time and tide height
# NOTE(review): `date` and `height` are re-created for every table, so each
# iteration starts from empty lists. Unless they are consumed further down inside
# this loop body (not visible here), only the last table's values remain after
# the loop finishes -- confirm this is intended.
for table in tables:
    
    # get month and year from caption
    # e.g. "January 2020" -> month = "January", year = "2020" (both strings)
    month_year = table.find("caption").text.strip()
    [month,year] = month_year.split()
    
    # get all cells by looking for 'align-right' class
    # (in the table printed above every <td> carries this class, the <th> header cells do not)
    cell = table.find_all(class_="align-right")
    
    # loop over cells in table
    # the cells form one flat list, three per row, so index % 3 selects the column:
    # 0 = day, 1 = time, 2 = height
    date = []
    height = []
    for index in range(len(cell)):
        
        # get day (kept in `d` until the time cell of the same row is reached)
        if ((index % 3) == 0):
            d = cell[index].text.strip()
        
        # get time
        if ((index % 3) == 1):
            t = cell[index].text.strip()
            
            # paste year, month, day and time together, and append to date list
            # e.g. "2020-January-1-2:59 AM"; in the format string %B is the full month
            # name (locale-dependent) and %I / %p parse the 12-hour clock with AM/PM
            ymdt_str = '-'.join([year,month,d,t])
            ymdt = datetime.strptime(ymdt_str,'%Y-%B-%d-%I:%M %p')
            date.append(ymdt)
        
        # get tide height
        # stored as the stripped cell text (a string such as "1.8"); no numeric conversion here
        if ((index % 3) == 2):
            height.append(cell[index].text.strip())