In [None]:
import re

import pandas as pd
import requests

def get_species_data(common_name, scientific_name):
    """Collect every xeno-canto recording entry for one species.

    The scientific name is queried first. When that query returns no
    recordings, the common name is queried instead. Every result page of the
    query that is finally used is fetched and the entries are concatenated.

    Args:
        common_name: Common name of the species; only used as the fallback query.
        scientific_name: Scientific name of the species; used as the primary query.

    Returns:
        list: The items of the API's "recordings" arrays across all pages.
        Empty when neither name yields a recording.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection problems or timeouts.
    """
    api = "https://xeno-canto.org/api/2/recordings"

    def fetch_page(query, page):
        # Passing the name through `params` lets requests URL-encode it, so
        # characters such as "&" or "#" can no longer corrupt the query string.
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(api, params={"query": query, "page": page}, timeout=30)
        # Fail loudly on an HTTP error instead of parsing an error page as JSON.
        response.raise_for_status()
        return response.json()

    # First query for the scientific name
    # If nothing is found for the scientific name, then try the common name
    query = scientific_name
    data = fetch_page(query, 1)
    if len(data["recordings"]) == 0:
        print("scientific name not found, so switch to searching for the common name")
        query = common_name
        data = fetch_page(query, 1)

    animal_sounds = list(data["recordings"])

    # Remaining pages are walked with a local counter, so the loop always
    # terminates after numPages requests regardless of what "page" value the
    # server echoes back.
    for page in range(2, int(data["numPages"]) + 1):
        animal_sounds.extend(fetch_page(query, page)["recordings"])

    if len(animal_sounds) == 0:
        print(common_name, scientific_name, "doesn't exist")

    print("Number of recordings found: ", len(animal_sounds))

    return animal_sounds

In [None]:
""" Creates the metadata for all the bird species """

import os

# Folder that receives one metadata workbook per species. Defined once,
# outside the loop, and created up front so to_excel() does not fail on a
# missing directory.
folder_to_save_to = "/Users/wesleywu/Desktop/E4E Research/Acoustic Species Identification Project/Kendall-Frost Marsh Project/KF_local_birds_dataset/bird_recordings_metadata/"
os.makedirs(folder_to_save_to, exist_ok=True)

# For each bird, add all the metadata to an excel sheet
# NOTE(review): `species` is not defined in this notebook excerpt; the loop
# assumes it is a two-column frame of (common name, scientific name) — confirm.
for common_name, scientific_name in species.to_numpy():
    recordings = get_species_data(common_name, scientific_name)

    df = pd.DataFrame(recordings)
    # Number the recordings from 1 instead of 0 in the saved sheet.
    df.index += 1
    excel_path = os.path.join(folder_to_save_to, common_name.replace(" ", "_") + "_metadata.xlsx")
    df.to_excel(excel_path, index=True, index_label="No. Recording")
    print(f"Excel file has been saved at {excel_path}")


In [None]:
def is_valid_time_format(value):
    """Return True when ``value`` is a string of the exact form ``hh:mm:ss``.

    Each of the three fields must consist of exactly two digits. Anything that
    is not a string (for example a missing spreadsheet cell read back as NaN)
    is reported as invalid instead of raising.

    Args:
        value: The candidate duration value.

    Returns:
        bool: Whether ``value`` matches ``dd:dd:dd`` in full.
    """
    if not isinstance(value, str):
        return False
    # fullmatch anchors both ends, so a trailing newline is rejected as well.
    # Uses the module-level `re` import.
    return re.fullmatch(r'\d{2}:\d{2}:\d{2}', value) is not None

# Function to convert to hh:mm:ss format
def to_hms_format(value):
    """Normalise a duration string to zero-padded ``hh:mm:ss``.

    Accepted inputs are plain seconds (``"45"``), ``m:ss`` (``"3:07"``) and
    ``h:mm:ss`` (``"1:02:03"``). Overflowing fields are carried into the next
    unit, so ``"75:30"`` becomes ``"01:15:30"``.

    Args:
        value: The duration to normalise.

    Returns:
        The normalised string. Values that are not strings, or strings that
        are not one to three colon-separated groups of decimal digits, are
        returned unchanged.
    """
    # Missing cells come back from a spreadsheet as non-strings (e.g. NaN);
    # pass them through untouched rather than crashing on a string method.
    if not isinstance(value, str):
        return value

    parts = value.strip().split(":")
    # isdecimal() is False for "" and only accepts characters int() can parse.
    if not 1 <= len(parts) <= 3 or not all(part.isdecimal() for part in parts):
        return value

    # Fold the fields left to right: each step promotes the running total by
    # one unit (x60) before adding the next, smaller, field.
    total_seconds = 0
    for part in parts:
        total_seconds = total_seconds * 60 + int(part)

    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'

In [None]:
import pandas as pd
import os

""" EDA """


def _format_duration(duration):
    """Render a pandas Timedelta as zero-padded hh:mm:ss (hours are not wrapped at 24)."""
    total_seconds = duration.total_seconds()
    hours = int(total_seconds // 3600)
    minutes = int((total_seconds % 3600) // 60)
    seconds = int(total_seconds % 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


counter = 0
folder_path = "/Users/wesleywu/Desktop/E4E Research/Acoustic Species Identification Project/Kendall-Frost Marsh Project/KF_local_birds_dataset/bird_recordings_metadata/"

# One summary dict per species workbook; turned into a DataFrame at the end.
res = []
# Sorted so the rows of the summary come out in the same order on every run.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue

    counter += 1
    print(counter)
    file_path = os.path.join(folder_path, file)
    print(file_path)

    # Every statistic starts as "N/A" and is overwritten once it is computed.
    vals = {
        "species": "N/A",
        "recordings": "N/A",
        "total time": "N/A",
        "avg recording time": "N/A",
        "total countries": "N/A",
        "which countries": "N/A",
        "U.S. recordings": "N/A",
        "CA recordings": "N/A", 
        "SD recordings": "N/A",
        ">= B quality recordings": "N/A"
    }

    df = pd.read_excel(file_path)

    # File names look like "<Common_Name>_metadata.xlsx"; drop the suffix part.
    vals["species"] = " ".join(file.split("_")[:-1])

    rows = len(df)
    vals["recordings"] = rows

    if rows == 0:
        # The metadata cell writes a workbook even when no recordings were
        # found; such a sheet has none of the columns used below, so keep the
        # "N/A" placeholders instead of failing on a missing column (or
        # dividing by zero for the average).
        res.append(vals)
        continue

    # Normalise the length strings first, then parse them as timedeltas.
    durations = pd.to_timedelta(df["length"].apply(to_hms_format))
    total_time = durations.sum()
    vals["total time"] = _format_duration(total_time)
    vals["avg recording time"] = _format_duration(total_time / rows)

    # Sorted so the country list and the per-country columns are deterministic.
    countries = sorted(set(df["cnt"].dropna()))
    vals["total countries"] = len(countries)
    vals["which countries"] = ", ".join(countries)

    vals["U.S. recordings"] = (df["cnt"] == "United States").sum()

    # na=False counts recordings with a missing location as "no match".
    vals["CA recordings"] = df["loc"].str.contains("California", case=False, na=False).sum()
    vals["SD recordings"] = df["loc"].str.contains("San Diego", case=False, na=False).sum()

    vals[">= B quality recordings"] = df["q"].isin(["A", "B"]).sum()

    # One extra column per non-U.S. country; species without recordings in a
    # given country end up with an empty cell in that column.
    for country in countries:
        if country != "United States":
            key = f"{country} recordings"
            vals[key] = (df["cnt"] == country).sum()

    res.append(vals)


new_df = pd.DataFrame(res)
print(" - - - ")
print(new_df)

path_to_save_to = "/Users/wesleywu/Desktop/E4E Research/Acoustic Species Identification Project/Kendall-Frost Marsh Project/KF_local_birds_dataset/EDA/EDA.xlsx"

# Make sure the target folder exists before writing the summary workbook.
os.makedirs(os.path.dirname(path_to_save_to), exist_ok=True)
new_df.to_excel(path_to_save_to, index=False)
print(f"Excel file has been saved at {path_to_save_to}")


In [None]:
""" Downloads all the bird recordings from the bird recordings metadata folder """

import os
import pandas as pd
import requests

folder_path = "/Users/wesleywu/Desktop/E4E Research/Acoustic Species Identification Project/Kendall-Frost Marsh Project/KF_local_birds_dataset/bird_recordings_metadata/"
recording_folder = "/Users/wesleywu/Desktop/E4E Research/Acoustic Species Identification Project/Kendall-Frost Marsh Project/KF_local_birds_dataset/bird_recordings/"

counter = 0
for file in os.listdir(folder_path):
    if not file.endswith(".xlsx"):
        continue

    counter += 1
    file_path = os.path.join(folder_path, file)
    # File names look like "<Common_Name>_metadata.xlsx"; drop the suffix part.
    bird_name = "_".join(file.split("_")[:-1])

    new_folder_path = recording_folder + bird_name + "_recordings"
    # makedirs also creates a missing parent folder and tolerates an existing one.
    os.makedirs(new_folder_path, exist_ok=True)

    df = pd.read_excel(file_path)

    if "file" not in df.columns:
        # Workbooks written for species without any recordings have no columns.
        print(f"No recordings listed in {file_path}")
        continue

    for index, recording_url in enumerate(df["file"], 1):
        f_name = f"{index}_" + bird_name + ".mp3"
        f_path = os.path.join(new_folder_path, f_name)
        # Stream into a temporary file and rename on success, so an interrupted
        # download never leaves a truncated .mp3 under the final name.
        partial_path = f_path + ".part"

        try:
            print(f"Downloading audio file")
            # The context manager releases the streamed connection; the timeout
            # keeps one stalled transfer from blocking the whole run.
            with requests.get(recording_url, stream=True, timeout=10) as response:
                response.raise_for_status()

                with open(partial_path, "wb") as audio_file:
                    for chunk in response.iter_content(chunk_size=8192):
                        audio_file.write(chunk)

            os.replace(partial_path, f_path)

        except Exception as e:
            print(f"Failed to download {recording_url}: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Reported once per species, after all of its recordings were attempted.
    print(f"Downloaded {bird_name} recordings")

print(" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
print("Finished downloading all the bird recordings")

In [None]:
""" Download all the San Diego bird recordings to our NAS Server """

import os
import pandas as pd
import requests
import logging
from tqdm import tqdm

# Only warnings and errors are written to the log file; nothing goes to the
# cell output apart from the progress bars.
log_file = "download_log.txt"
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s [%(levelname)s] - %(message)s",
    handlers=[
        logging.FileHandler(log_file)
    ]
)

folder_path = "/Users/wesleywu/Desktop/E4E Research/Acoustic Species Identification Project/Kendall-Frost Marsh Project/KF_local_birds_dataset/bird_recordings_metadata/"
nas_root = "/Volumes/passive-acoustic-biodiversity/san_diego_xeno_canto"

metadata_files = [f for f in os.listdir(folder_path) if f.endswith(".xlsx")]

for file in tqdm(metadata_files, desc="Overall Progress", position=0):
    file_path = os.path.join(folder_path, file)
    # File names look like "<Common_Name>_metadata.xlsx"; drop the suffix part.
    bird_name = "_".join(file.split("_")[:-1])

    new_folder_path = os.path.join(nas_root, bird_name + "_recordings")

    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        logging.error(f"Failed to read Excel file {file_path}: {e}")
        continue

    if "file" not in df.columns:
        # Workbooks written for species without any recordings have no columns.
        logging.warning(f"No recordings listed in {file_path}")
        continue

    # An existing folder is reused rather than skipped: a species whose
    # download was interrupted is resumed on the next run, and the per-file
    # check below skips whatever is already complete.
    try:
        os.makedirs(new_folder_path, exist_ok=True)
    except Exception as e:
        logging.error(f"Failed to create folder: {new_folder_path} — {e}")
        continue

    for index, recording_url in enumerate(tqdm(df["file"], desc=f"{bird_name}", position=1, leave=False), 1):
        f_name = f"{index}_{bird_name}.mp3"
        f_path = os.path.join(new_folder_path, f_name)

        if os.path.exists(f_path):
            continue

        # Stream into a temporary file and rename on success, so a failed
        # transfer never leaves a truncated .mp3 that a later run would treat
        # as already downloaded.
        # NOTE(review): files truncated by runs of the earlier version of this
        # cell cannot be told apart from complete ones here.
        partial_path = f_path + ".part"

        try:
            with requests.get(recording_url, stream=True, timeout=10) as response:
                response.raise_for_status()

                with open(partial_path, "wb") as audio_file:
                    for chunk in response.iter_content(chunk_size=8192):
                        audio_file.write(chunk)

            os.replace(partial_path, f_path)

        except Exception as e:
            logging.warning(f"Failed to download {recording_url} for {bird_name}: {e}")
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError as cleanup_error:
                logging.warning(f"Failed to remove partial file {partial_path}: {cleanup_error}")

# Logged at WARNING level so the message passes the configured log threshold.
logging.warning("Bird recording download script has completed.")
