In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
import os
import time
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from pathlib import Path

# Switch for the metadata-scraping cell: the scrape (and the rewrite of
# metadata.csv) only runs when this is True.
RETRIEVE = False

In [20]:
# URL prefix of the per-recording metadata page; retrieve_metadata appends the
# file name (without extension) to it.
base_url = 'https://whoicf2.whoi.edu/science/B/whalesounds/metaData.cfm?RN='

ds_path = 'dataset'
full_ds_path = 'full_dataset'
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell stays idempotent on re-runs.
for directory in (full_ds_path, ds_path):
    os.makedirs(directory, exist_ok=True)

# Metadata fields to extract
col = ['filename', 'CU:', 'SR:', 'CS:']
metadata_df = pd.DataFrame(columns=col)

In [None]:
# Metadata extraction
def retrieve_metadata(folder_path, base_url):
    """Scrape the metadata page of every entry found in one folder.

    For each entry of ``folder_path`` the part of its name before the first
    dot is appended to ``base_url``; the resulting page is fetched and its
    two-cell table rows are scanned for the fields listed in the module-level
    ``col`` (every element except the first).

    Args:
        folder_path: Directory to scan. Its last path component is stored as
            the ``'species'`` of every record (the caller passes one folder
            per species; presumably the folder name is the species name).
        base_url: URL prefix to which each file-name stem is appended.

    Returns:
        A list with one dict per directory entry, holding ``'filename'``,
        ``'species'`` and whichever requested fields the page contained.

    Raises:
        requests.RequestException: If a page cannot be fetched, including
            when the timeout expires.
    """
    # The species is the folder being scanned, not the individual file name.
    species = Path(folder_path).name
    # Loop-invariant: labels of the table rows worth keeping.
    wanted_fields = col[1:]

    records = []
    for entry in os.listdir(folder_path):
        stem = entry.split('.')[0]
        record = {'filename': stem, 'species': species}

        # Fetch the page and parse its table; the timeout keeps one stalled
        # request from hanging the whole scrape.
        response = requests.get(base_url + stem, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Only rows made of exactly a label cell and a value cell are relevant.
        for table_row in soup.find_all('tr'):
            cells = table_row.find_all('td')
            if len(cells) != 2:
                continue
            label = cells[0].get_text(strip=True)
            if label in wanted_fields:
                record[label] = cells[1].get_text(strip=True)
        records.append(record)

    return records


In [None]:
# Save the metadata of every species into a single CSV file
if RETRIEVE:
    # Gather every record first and build the frame once: concatenating inside
    # the loop re-copies the accumulated frame on each iteration.
    all_records = []
    for folder in os.listdir(ds_path):
        folder_path = Path(ds_path, folder)
        # Skip stray files (e.g. .DS_Store): only directories can be scanned.
        if not folder_path.is_dir():
            continue
        all_records.extend(retrieve_metadata(folder_path, base_url))
    metadata_df = pd.concat(
        [metadata_df, pd.DataFrame.from_records(all_records)],
        ignore_index=True,
    )
    # NOTE(review): the following cell reads Path('metadata', 'metadata.csv'),
    # not this location - confirm which path is intended.
    metadata_df.to_csv('metadata.csv', index=False)


In [None]:
# File names are identifiers, so read them as text: letting pandas infer a
# numeric dtype would drop leading zeros and make the slicing below fail.
md_df = pd.read_csv(Path('metadata', 'metadata.csv'), dtype={'filename': str})
# Keep the first five characters of each file name and one row per prefix;
# that prefix is what the master-archive URL is built from further down.
md_df['filename'] = md_df['filename'].str[:5]
md_df = md_df.drop_duplicates(subset=['filename'])
print(md_df.shape)

def check_full_audio(row, ret_list, delay=2, timeout=10):
    """Probe whether the master archive of a recording can be downloaded.

    Requests the ``MasterFiles/<filename>.zip`` URL and appends
    ``[species, filename, available]`` to ``ret_list``, where ``available``
    is True only for an HTTP 200 response. The outcome is also printed.

    Args:
        row: Mapping (e.g. a DataFrame row) providing ``'species'`` and
            ``'filename'``.
        ret_list: List mutated in place; one entry is appended per call.
        delay: Seconds to sleep after the request, spacing out consecutive
            calls.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        None. The result is delivered through ``ret_list``.

    Raises:
        requests.RequestException: Propagated if the request itself fails.
    """
    url = f"https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/MasterFiles/{row['filename']}.zip"
    # stream=True defers the body download; since the body is never read, the
    # context manager is what releases the connection.
    with requests.get(url, stream=True, allow_redirects=True, timeout=timeout) as response:
        available = response.status_code == 200
    # Outer double quotes around inner single quotes: reusing the same quote
    # inside an f-string is a syntax error before Python 3.12.
    print(f"{row['species']} - {row['filename']}: {available}")
    ret_list.append([row['species'], row['filename'], available])
    time.sleep(delay)

# Probe every recording with an explicit loop. check_full_audio is called only
# for its side effect on df_list; DataFrame.apply is meant for transformations,
# would build a throwaway result here, and may invoke the function on a
# placeholder row when the frame is empty.
df_list = []
for _index, row in md_df.iterrows():
    check_full_audio(row, df_list)
df = pd.DataFrame(df_list, columns=['species', 'filename', 'full_audio'])
df.to_csv(Path('metadata','full_audio.csv'), index=False)

In [3]:
# Reload the availability table and keep the rows flagged as downloadable.
df = pd.read_csv(Path('metadata', 'full_audio.csv'))
download_df = df.loc[df['full_audio'].eq(True)]

# Species whose file_count in species_count.csv is above 999.
popular_species = pd.read_csv(Path('metadata', 'species_count.csv'))
popular_species = popular_species.loc[popular_species['file_count'].gt(999)]

# Downloadable recordings that belong to one of those species.
is_popular = download_df['species'].isin(popular_species['species'])
restricted_link = download_df.loc[is_popular]

def link_generator(row):
    """Return a one-element list holding the master-archive URL for ``row``.

    ``row`` must support ``row['filename']``; that value is interpolated into
    the ``MasterFiles/<filename>.zip`` URL.
    """
    return [
        f"https://whoicf2.whoi.edu/science/B/whalesounds/WhaleSounds/MasterFiles/{row['filename']}.zip"
    ]
link_df = restricted_link.apply(link_generator, axis=1)
# Write the links to a text file, one link per line
with open('link_list.txt', 'w') as f:
    f.writelines(f"{url}\n" for links in link_df for url in links)