In [51]:
import os
import argparse
import subprocess
import pandas as pd
from compute_cost import get_ground_truth_df, total_cost, check_dataframe
from pathlib import Path
import numpy as np

In [57]:
# All data locations are resolved relative to the notebook's working directory.
base = Path.cwd()
dataset_path = base / 'MLPC2025_classification'
audio_dir = base / 'MLPC2025_test' / "audio"
features_dir = base / 'MLPC2025_test' / "audio_features"

# Smoke test: one known test-set sample must have both its audio and its
# feature file, so a wrong working directory fails here instead of deep inside
# a later cell. The error lists exactly which paths are missing.
test = '17730'
missing = [
    path
    for path in (audio_dir / f'{test}.mp3', features_dir / f'{test}.npz')
    if not path.exists()
]
if missing:
    raise FileNotFoundError(
        f"Couldn't access test-set sample {test}; missing: "
        + ", ".join(str(path) for path in missing)
    )

# Same check for one known sample of the labelled dataset
# (feature file plus its label file; no audio is checked here).
test = '14'
missing = [
    path
    for path in (
        dataset_path / 'audio_features' / f'{test}.npz',
        dataset_path / 'labels' / f'{test}_labels.npz',
    )
    if not path.exists()
]
if missing:
    raise FileNotFoundError(
        f"Couldn't access dataset sample {test}; missing: "
        + ", ".join(str(path) for path in missing)
    )

# Per-file metadata of the labelled dataset; the bare last expression renders
# it as the cell output.
metadata = pd.read_csv(dataset_path / 'metadata.csv')
metadata

Unnamed: 0,filename,keywords,freesound_id,sound_link,manufacturer,license,title,description,num_downloads,geotag,start_time_s,end_time_s
0,321771.mp3,"Interior, AMB, Italy, Distant, Speech, Reverb",321771,https://freesound.org/people/Skjor1/sounds/321...,Skjor1,http://creativecommons.org/publicdomain/zero/1.0/,Interior Ambience + Distant Reverberant Speech...,Interior Ambience + Distant Reverberant Speech...,120,,5.200,27.179
1,451371.mp3,"kids, throaty, crowd, India, distant, traffic,...",451371,https://freesound.org/people/kyles/sounds/451371/,kyles,http://creativecommons.org/publicdomain/zero/1.0/,election rally crowd and speech with distant t...,election rally crowd and speech with distant t...,122,,120.800,144.984
2,199414.mp3,"broadcast, speech, radio",199414,https://freesound.org/people/martinimeniscus/s...,martinimeniscus,http://creativecommons.org/publicdomain/zero/1.0/,"Old Radio Speech Background, higher FF125.aif",Background noise for an old radio broadcast sp...,391,,102.003,130.921
3,410952.mp3,"loop2017, atmos, dolby, speech, ableton",410952,https://freesound.org/people/lietoofine/sounds...,lietoofine,https://creativecommons.org/licenses/by/4.0/,dolby atmos speech.wav,dolby atmos speech @Loop2017,193,52.479543 13.500279,31.330,54.021
4,203908.mp3,"dr-40, project, speech, student, italian, reci...",203908,https://freesound.org/people/s9ames/sounds/203...,s9ames,http://creativecommons.org/licenses/by/3.0/,bologna speech Italian2,recorded with a tascam dr-40 in a sound studio...,526,,29.200,45.689
...,...,...,...,...,...,...,...,...,...,...,...,...
8225,505984.mp3,"droplets, several-drops, dripping, liquid-drip...",505984,https://freesound.org/people/Perplessio/sounds...,Perplessio,http://creativecommons.org/publicdomain/zero/1.0/,Droplets underwater (multiplied).wav,"Droplets falling on a water surface, recorded ...",138,,0.000,16.960
8226,428889.mp3,"tin, patter, rain, roof",428889,https://freesound.org/people/moviebuffgavin/so...,moviebuffgavin,http://creativecommons.org/publicdomain/zero/1.0/,Rain Droplets on Tin Roof,The sound of water droplets from trees and suc...,278,,12.400,36.251
8227,575816.mp3,"raining, tone, upstairs, house, hearing, gener...",575816,https://freesound.org/people/Iceofdoom/sounds/...,Iceofdoom,https://creativecommons.org/licenses/by/4.0/,Upstairs Apartment - Rain Ambience (3 minutes)...,My buddy finds solace in listening to the rain...,213,,55.421,80.704
8228,174501.mp3,"atmosphere, blowing, air, whistle, weather, wind",174501,https://freesound.org/people/unfa/sounds/174501/,unfa,http://creativecommons.org/publicdomain/zero/1.0/,Window Wind,It's a sound of wind blowing through a shut (b...,861,,15.200,33.771


In [53]:
# Names of the target classes. The list is used to select the label columns of
# the ground-truth / prediction frames, so the order here fixes the column
# order wherever `df[CLASSES]` is taken.
CLASSES = [
    'Speech',
    'Dog Bark',
    'Rooster Crow',
    'Shout',
    'Lawn Mower',
    'Chainsaw',
    'Jackhammer',
    'Power Drill',
    'Horn Honk',
    'Siren',
]

In [58]:
def split_dataset(dataset_path, train_txt, val_txt,
                  val_size=0.2, random_state=42):
    """
    Split the dataset into train/validation file lists, stratified on labels.

    The file names are read from ``<dataset_path>/metadata.csv`` (not from any
    notebook-level variable), their ground truth is aggregated to one label
    row per file, and a single multilabel-stratified shuffle split is drawn
    at the file level.

    Parameters
    ----------
    dataset_path : str or path-like
        Dataset root; must contain ``metadata.csv`` with a ``filename``
        column. It is also forwarded unchanged to ``get_ground_truth_df``.
    train_txt, val_txt : str or path-like
        Output files; each receives one file name per line
        (newline-separated, no trailing newline).
    val_size : float, default 0.2
        Passed as ``test_size`` to ``MultilabelStratifiedShuffleSplit``.
    random_state : int, default 42
        Seed passed to the splitter.

    Returns
    -------
    (train_files, val_files) : tuple of list
        File names assigned to each side of the split.

    Raises
    ------
    ImportError
        If the ``iterative-stratification`` package is not installed.
    """
    # Check the optional dependency first so a missing package fails fast,
    # before any ground truth is loaded.
    try:
        from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
    except ImportError as exc:
        raise ImportError(
            "Please install iterative-stratification:\n"
            "    pip install iterative-stratification"
        ) from exc

    # Load the file list from the dataset this call was asked to split.
    file_metadata = pd.read_csv(Path(dataset_path) / 'metadata.csv')
    filenames = file_metadata['filename'].unique()

    # Build ground truth for all files
    gt_all = get_ground_truth_df(filenames, dataset_path)

    # Collapse to one row per file: the per-class maximum over that file's rows
    # (presence/absence, assuming the label columns are 0/1 -- TODO confirm).
    file_labels = gt_all.groupby('filename')[CLASSES].max()

    msss = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=val_size, random_state=random_state
    )
    # The index only supplies the sample count; stratification uses the labels.
    train_idx, val_idx = next(msss.split(file_labels.index, file_labels.values))
    train_files = file_labels.index[train_idx].tolist()
    val_files = file_labels.index[val_idx].tolist()

    # Save file lists (explicit encoding so the result is platform-independent)
    with open(train_txt, 'w', encoding='utf-8') as f:
        f.write("\n".join(train_files))
    with open(val_txt, 'w', encoding='utf-8') as f:
        f.write("\n".join(val_files))

    return train_files, val_files


# def compute_cost(predictions_csv, dataset_path, ground_truth_csv):
#     """
#     Runs compute_cost.py and parses the printed Total cost.
#     """
#     result = subprocess.run(
#         ['python', 'compute_cost.py',
#          '--dataset_path', dataset_path,
#          '--ground_truth_csv', ground_truth_csv,
#          '--predictions_csv', predictions_csv],
#         capture_output=True, text=True, check=True
#     )
#     for line in result.stdout.splitlines():
#         if 'Total cost:' in line:
#             return float(line.split(':', 1)[1].strip())
#     raise RuntimeError(f"Cost not found in output:\n{result.stdout}")


def compute_my_cost(predictions_csv, dataset_path, ground_truth_csv):
    """
    Score a predictions CSV against a ground-truth CSV in-process.

    Both files are loaded with pandas, each resulting frame is passed to
    ``check_dataframe`` together with ``dataset_path``, and the first element
    of the pair returned by ``total_cost(predictions, ground_truth)`` is
    returned.
    """
    # Load predictions first, then ground truth.
    predictions, ground_truth = (
        pd.read_csv(csv_path) for csv_path in (predictions_csv, ground_truth_csv)
    )

    # Format validation, in the same order as the frames were loaded.
    for frame in (predictions, ground_truth):
        check_dataframe(frame, dataset_path)

    # total_cost yields a pair; only its first element is reported here.
    overall, _breakdown = total_cost(predictions, ground_truth)
    return overall

In [59]:
# Step 1 -- file-level train/validation split; split_dataset also writes the
# two file lists to train.txt / val.txt.
train_files, val_files = split_dataset(
    dataset_path, train_txt='train.txt', val_txt='val.txt'
)

# Step 2 -- validation ground truth, saved to disk because compute_my_cost
# reads its inputs from CSV files.
gt_val = get_ground_truth_df(val_files, dataset_path)
gt_val.to_csv('ground_truth_val.csv', index=False)

# Step 3 -- baseline "all zero": same rows as the ground truth, every class
# column set to 0.
zero_pred = gt_val.copy()
zero_pred[CLASSES] = 0
zero_pred.to_csv('baseline_zero.csv', index=False)
cost_zero = compute_my_cost(
    'baseline_zero.csv', dataset_path, 'ground_truth_val.csv'
)

# Step 4 -- baseline "prevalence": a class is predicted as 1 everywhere iff its
# mean over the training ground truth is at least 0.5, otherwise 0 everywhere.
# NOTE(review): the recorded run printed the same cost for both baselines,
# which would happen if no class reaches the 0.5 threshold -- confirm.
gt_train = get_ground_truth_df(train_files, dataset_path)
prevalence = gt_train[CLASSES].mean()
prev_pred = gt_val.copy()
for cls in CLASSES:
    is_majority = prevalence[cls] >= 0.5
    prev_pred[cls] = int(is_majority)
prev_pred.to_csv('baseline_prevalence.csv', index=False)
cost_prev = compute_my_cost(
    'baseline_prevalence.csv', dataset_path, 'ground_truth_val.csv'
)

# Step 5 -- one row per baseline with its cost.
summary = pd.DataFrame([
    {'baseline': 'all_zero', 'cost': cost_zero},
    {'baseline': 'prevalence', 'cost': cost_prev},
])
summary.to_csv('baseline_cost.csv', index=False)

# Report every artefact written by this cell.
report_lines = [
    "Generated:",
    " - train.txt & val.txt",
    " - ground_truth_val.csv",
    f" - baseline_zero.csv      (cost: {cost_zero:.2f})",
    f" - baseline_prevalence.csv(cost: {cost_prev:.2f})",
    " - baseline_cost.csv",
]
print("\n".join(report_lines))

Generated:
 - train.txt & val.txt
 - ground_truth_val.csv
 - baseline_zero.csv      (cost: 107.78)
 - baseline_prevalence.csv(cost: 107.78)
 - baseline_cost.csv
