In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
import os
import time
from tqdm import tqdm
import requests
from pathlib import Path

# Master switch for the scraping step: when True, the download cell walks the
# site and fetches the files; when False, that cell only prints a notice.
DOWNLOAD = True

In [None]:
# Start a Chrome session and open the page that lists the recordings
driver = webdriver.Chrome()
url = 'https://whoicf2.whoi.edu/science/B/whalesounds/fullCuts.cfm'
driver.get(url)

# Species drop-down; the first option is skipped
# (presumably a placeholder entry -- TODO confirm against the page)
species_select = Select(driver.find_element("name", "getSpecies"))
species_options = species_select.options[1:]

# Output folders: downloaded files and derived metadata
ds_path = 'dataset'
md_path = 'metadata'

# exist_ok=True replaces the check-then-create pattern: it is free of the
# race between the existence test and the creation, and matches the way
# the download cell creates its folders
os.makedirs(ds_path, exist_ok=True)
os.makedirs(md_path, exist_ok=True)

In [None]:
# Return the list of entries found in an HTML <select> tag
def list_pages(web_elem, id):
    """Collect the visible text of the options of a ``<select>`` element.

    Args:
        web_elem: Sequence of option elements; only its length is used, to
            decide how many options are read.
        id: Value of the ``name`` attribute used to locate the ``<select>``
            through the module-level ``driver``.

    Returns:
        A list with the whitespace-stripped text of each option, skipping
        the first option of the ``<select>``.
    """
    # The <select> is looked up again for every position instead of reusing
    # the elements in web_elem (presumably to avoid stale element
    # references -- TODO confirm).
    return [
        Select(driver.find_element("name", id)).options[1:][position].text.strip()
        for position in range(len(web_elem))
    ]

In [None]:
if DOWNLOAD:
    # Visible names of every species offered by the species drop-down
    species_name = list_pages(species_options, "getSpecies")
    print(f"Trovate {len(species_name)} specie.")

    for spec in species_name:
        # Select the species
        print(f"Elaborazione specie: {spec}")
        species_select = Select(driver.find_element("name", "getSpecies"))
        species_select.select_by_visible_text(spec)
        # Fixed pause after the selection (presumably lets the page refresh
        # the year drop-down -- TODO confirm)
        time.sleep(1)

        # Create the folder the files are actually written to. It must live
        # under ds_path: creating it in the working directory would make
        # every open() below fail with FileNotFoundError.
        spec_dir = Path(ds_path, spec)
        os.makedirs(spec_dir, exist_ok=True)

        # Build the list of years available for this species
        year_select = Select(driver.find_element("name", "pickYear"))
        year_options = year_select.options[1:]
        year_list = list_pages(year_options, "pickYear")
        print(f"Elaborazione anni per {spec}: {year_list}")

        for year in year_list:
            # Select the year
            year_select = Select(driver.find_element("name", "pickYear"))
            year_select.select_by_visible_text(year)
            time.sleep(2)

            # Collect the download links shown for this species/year
            download_links = driver.find_elements("partial link text", "Download")
            print(f"Anno {year}: trovati {len(download_links)} file.")

            # Download the files
            for link in tqdm(download_links, desc="Download"):
                file_url = link.get_attribute("href")
                if not file_url:
                    # Anchor without an href: nothing to download
                    continue
                file_name = file_url.split('/')[-1]
                file_path = Path(spec_dir, file_name)

                # Skip files fetched by a previous run
                if os.path.exists(file_path):
                    continue

                try:
                    # The timeout keeps a stalled connection from hanging the
                    # whole run; raise_for_status keeps HTTP error pages from
                    # being saved as if they were audio files.
                    response = requests.get(file_url, timeout=60)
                    response.raise_for_status()
                except requests.RequestException as exc:
                    # Report and move on: the file is not created, so a later
                    # run will retry it thanks to the existence check above.
                    tqdm.write(f"Download fallito per {file_url}: {exc}")
                    continue

                with open(file_path, 'wb') as f:
                    f.write(response.content)
else:
    print("Download disabilitato")

In [None]:
# Count the files present for each species
folder_list = []
# Sorted so the CSV rows come out in a stable order across runs
# (os.listdir returns entries in arbitrary order)
for folder in sorted(os.listdir(ds_path)):
    folder_path = Path(ds_path, folder)
    if os.path.isdir(folder_path):
        # Only regular files are counted; nested folders are ignored
        file_count = sum(
            1 for f in os.listdir(folder_path) if os.path.isfile(Path(folder_path, f))
        )
        folder_list.append({'species': folder, 'file_count': file_count})

# Explicit columns keep the header in the CSV even when no species folder
# exists, so the file can still be read back with pd.read_csv
df = pd.DataFrame(folder_list, columns=['species', 'file_count'])
df.to_csv(Path(md_path, 'species_count.csv'), index=False)

In [None]:
# Print the total number of audio files, summed from the per-species CSV
species_df = pd.read_csv(Path(md_path) / 'species_count.csv')
tot_audio = species_df.loc[:, 'file_count'].sum()
print(f"Totale file audio scaricati: {tot_audio}")