In [2]:
import warnings
warnings.filterwarnings('ignore')

import copy
import pandas as pd 

# Requests sends and recieves HTTP requests.
import requests 

# Beautiful soup parses HTML documents in python.
from bs4 import BeautifulSoup

from pymongo import MongoClient
import pprint

# Step 1: Check out the website in a browser

1: The table and wav is inside of body /html/body/div[3]/div/div/table/tbody/tr[2]/td[4]/a

# Step 2: Send a Get request for the data.

In [3]:
watkins_sounds = "https://cis.whoi.edu/science/B/whalesounds/bestOf.cfm?code=BE3D"

r = requests.get(watkins_sounds)

In [4]:
r # A status code of 200 means that everything went well

<Response [200]>

We can check out the raw hypertext in the content attribute of the request.

In [5]:
r.content

b'\n<!doctype html>\n<html class="no-js" lang="en">\n<head><script type="text/javascript" src="/cf_scripts/scripts/cfform.js"></script>\n<script type="text/javascript" src="/cf_scripts/scripts/masks.js"></script>\n\n    <meta charset="utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1.0" />\n    <title>Watkins Marine Mammal Sound Database</title>\n    <link rel="stylesheet" href="https://www.whoi.edu/cf_headers/watkins/css/foundation.css" />\n    <link rel="stylesheet" href="https://www.whoi.edu/cf_headers/watkins/css/styles.css" />\n    <link rel="stylesheet" href="https://www.whoi.edu/cf_headers/watkins/css/audioplayer.css" />\n    <script src="https://www.whoi.edu/cf_headers/watkins/js/vendor/modernizr.js"></script>\n    <script>\n$(\'getSpeciesCommon\').change(function(){\n    var url = $(this).val();\n    window.location = url;\n});\n\t</script>\n\n<script language="JavaScript">\n<!--\nfunction goTo(pagename) {\n    if ((-1 < pagename.selectedIndex) 

In [6]:
# connect to the hosted MongoDB instance
client = MongoClient('localhost', 27017)

In [7]:
# Create a database
db = client['marine_sounds']

In [8]:
# Create a collection called dolphins

short_finned_pilot_whale = db['marine_sounds']

In [9]:
short_finned_pilot_whale

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'marine_sounds'), 'marine_sounds')

# Step 3: Save all the hypertext into mongo for later use.

In [11]:
short_finned_pilot_whale.insert_one({'html': r.content}) # change name to collections

ServerSelectionTimeoutError: localhost:27017: [Errno 61] Connection refused, Timeout: 30s, Topology Description: <TopologyDescription id: 636c4ff592c6215bdb3038c1, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 61] Connection refused')>]>

# Step 4: Parse the hypertext with BeautifulSoup 

In [12]:
# This is the beautiful part of soup. Parsing the HTML into a python object is effortless.

In [13]:
soup = BeautifulSoup(r.content, "html")

In [14]:
print(soup)


<!DOCTYPE html>

<html class="no-js" lang="en">
<head><script src="/cf_scripts/scripts/cfform.js" type="text/javascript"></script>
<script src="/cf_scripts/scripts/masks.js" type="text/javascript"></script>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Watkins Marine Mammal Sound Database</title>
<link href="https://www.whoi.edu/cf_headers/watkins/css/foundation.css" rel="stylesheet"/>
<link href="https://www.whoi.edu/cf_headers/watkins/css/styles.css" rel="stylesheet"/>
<link href="https://www.whoi.edu/cf_headers/watkins/css/audioplayer.css" rel="stylesheet"/>
<script src="https://www.whoi.edu/cf_headers/watkins/js/vendor/modernizr.js"></script>
<script>
$('getSpeciesCommon').change(function(){
    var url = $(this).val();
    window.location = url;
});
	</script>
<script language="JavaScript">
<!--
function goTo(pagename) {
    if ((-1 < pagename.selectedIndex) &&
        (pagename.options[pagename.selectedIndex].value != "nil

In [15]:
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <script src="/cf_scripts/scripts/cfform.js" type="text/javascript">
  </script>
  <script src="/cf_scripts/scripts/masks.js" type="text/javascript">
  </script>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Watkins Marine Mammal Sound Database
  </title>
  <link href="https://www.whoi.edu/cf_headers/watkins/css/foundation.css" rel="stylesheet"/>
  <link href="https://www.whoi.edu/cf_headers/watkins/css/styles.css" rel="stylesheet"/>
  <link href="https://www.whoi.edu/cf_headers/watkins/css/audioplayer.css" rel="stylesheet"/>
  <script src="https://www.whoi.edu/cf_headers/watkins/js/vendor/modernizr.js">
  </script>
  <script>
   $('getSpeciesCommon').change(function(){
    var url = $(this).val();
    window.location = url;
});
  </script>
  <script language="JavaScript">
   <!--
function goTo(pagename) {
    if ((-1 < pagename.selectedIndex) &&
        (pagename.

In [16]:
print(soup.title)

<title>Watkins Marine Mammal Sound Database</title>


# Step 5: Navigate the data to pull out the table information

Recall the structure of the table we are looking for:

In [17]:
div = soup.find("div", {"class": "database"})

table = div.find("table")

In [18]:
# This returns an iterator over the rows in the table
rows = table.find_all("tr")

In [19]:
import re 

links = [a['href'] for a in soup.find_all('a',href=re.compile('/science.*\.(mp3|wav|ogg|wma)'))]

In [20]:
from pydub import AudioSegment
from pydub.playback import play

# Scrape Audio:

* First create the directory for the class in the data directory.

* Second rename with open(f'data/northern right whale/rightWhale{idx}.wav', 'wb') to the just created path

* The data flow is first download the raw audio data here and them concatenate the audio into the training dataset directory into 30 seconds chucks. 

In [24]:
def download_audio_and_save(lst_of_urls):
    
    for idx, url in enumerate(lst_of_urls):
        doc = requests.get(f'https://whoicf2.whoi.edu{url}')

        with open(f'data/short-finned-pilot-whale/pilotWhale{idx}.wav', 'wb') as f:
            f.write(doc.content)
            print(f"Saved audio file{idx}")

In [25]:
download_audio_and_save(links)

FileNotFoundError: [Errno 2] No such file or directory: 'data/short-finned-pilot-whale/pilotWhale0.wav'