In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List
from matplotlib import cm
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import balanced_accuracy_score
import torchaudio
import torchaudio.transforms as T
from tqdm import tqdm

from copy import deepcopy


import pickle

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
from portable_m2d import PortableM2D as M2D
from transformer_wrapper import BaseModelWrapper


class M2DWrapper(BaseModelWrapper):
    def __init__(self) -> None:
        super().__init__()
        self.m2d = M2D()

    def mel_forward(self, x):
        return self.m2d.to_normalized_feature(x)

    def forward(self, spec):
        return self.m2d.forward_mel(spec)

    def separate_params(self):
        pt_params = [[], [], [], [], [], [], [], [], [], [], [], []]
        for k, p in self.named_parameters():
            if any(['cls_token' in k,
                    'pos_embed' in k,
                    'norm_stats' in k,
                    'patch_embed' in k]):
                pt_params[0].append(p)
            elif 'blocks.0.' in k:
                pt_params[0].append(p)
            elif 'blocks.1.' in k:
                pt_params[1].append(p)
            elif 'blocks.2.' in k:
                pt_params[2].append(p)
            elif 'blocks.3.' in k:
                pt_params[3].append(p)
            elif 'blocks.4.' in k:
                pt_params[4].append(p)
            elif 'blocks.5.' in k:
                pt_params[5].append(p)
            elif 'blocks.6.' in k:
                pt_params[6].append(p)
            elif 'blocks.7.' in k:
                pt_params[7].append(p)
            elif 'blocks.8.' in k:
                pt_params[8].append(p)
            elif 'blocks.9.' in k:
                pt_params[9].append(p)
            elif 'blocks.10.' in k:
                pt_params[10].append(p)
            elif 'blocks.11.' in k:
                pt_params[11].append(p)
            elif 'backbone.norm.weight' in k or 'backbone.norm.bias' in k:
                pt_params[11].append(p)
            else:
                raise ValueError(f"Check separate params for M2D! Unknown key: {k}")
        return list(reversed(pt_params))



In [3]:
import argparse
import os
import math
import numpy as np
import pandas as pd

from typing import Iterable, Dict, List, Optional

CLASSES = ['Speech', 'Shout', 'Chainsaw', 'Jackhammer', 'Lawn Mower', 'Power Drill', 'Dog Bark', 'Rooster Crow', 'Horn Honk', 'Siren']

COST_MATRIX = {
    "Speech":         {"TP": 0,  "FP": 1,  "TN": 0, "FN": 5},
    "Dog Bark":       {"TP": 0,  "FP": 1,  "TN": 0, "FN": 5},
    "Rooster Crow":   {"TP": 0,  "FP": 1,  "TN": 0, "FN": 5},
    "Shout":          {"TP": 0,  "FP": 2,  "TN": 0, "FN": 10},
    "Lawn Mower":     {"TP": 0,  "FP": 3,  "TN": 0, "FN": 15},
    "Chainsaw":       {"TP": 0,  "FP": 3,  "TN": 0, "FN": 15},
    "Jackhammer":     {"TP": 0,  "FP": 3,  "TN": 0, "FN": 15},
    "Power Drill":    {"TP": 0,  "FP": 3,  "TN": 0, "FN": 15},
    "Horn Honk":      {"TP": 0,  "FP": 3,  "TN": 0, "FN": 15},
    "Siren":          {"TP": 0,  "FP": 3,  "TN": 0, "FN": 15},
}

def check_dataframe(data_frame, dataset_path):
    """
    Validates the integrity of a predictions or ground truth DataFrame.

    Parameters:
    ----------
    predictions_df : pandas.DataFrame
        A DataFrame containing model predictions or the ground truth.
        It must include columns:
        - 'filename': Name of the audio file (e.g., "xyz.wav")
        - 'onset': Onset times or frame indices
        - One column for each class in the global `CLASSES` list

    dataset_path : str
        Path to the root of the dataset directory. It must contain a
        subdirectory 'audio_features' with `.npz` files for each audio file.

    Raises:
    ------
    AssertionError:
        If any of the following checks fail:
        - The dataset or audio_features directory doesn't exist
        - The DataFrame is missing required columns
        - Expected feature files are missing
        - Number of predictions doesn't match the number of expected timesteps

    Example:
    -------
    check_dataframe(predicted_df, "MLPC2025_dataset")
    """
    audio_features_path = os.path.join(dataset_path, "audio_features")
    assert os.path.exists(dataset_path), f"Dataset path '{dataset_path}' does not exist."
    assert os.path.exists(audio_features_path), f"Audio features path '{audio_features_path}' does not exist."

    required_columns = set(CLASSES + ["filename", "onset"])
    missing_columns = required_columns - set(data_frame.columns)
    assert not missing_columns, f"Missing columns in predictions_df: {missing_columns}"

    assert ((data_frame["onset"] / 1.2) % 1).apply(lambda x: np.isclose(x, 0, atol=0.1)).all(), "Not all values are divisible by 1.2."
    assert data_frame[CLASSES].isin([0, 1]).all().all(), "Not all predictions are 0 or 1."

    for filename in data_frame["filename"].unique():
        file_id = os.path.splitext(filename)[0]
        feature_file = os.path.join(audio_features_path, f"{file_id}.npz")

        assert os.path.exists(feature_file), f"Feature file '{feature_file}' does not exist."

        embeddings = np.load(feature_file)["embeddings"]
        expected_timesteps = math.ceil(len(embeddings) / 10)
        actual_timesteps = len(data_frame[data_frame["filename"] == filename])

        assert actual_timesteps == expected_timesteps, (
            f"Mismatch in timesteps for '{filename}': expected {expected_timesteps}, found {actual_timesteps}."
        )


def total_cost(predictions_df, ground_truth_df):
    """
    Computes total cost of predictions based on a cost matrix for TP, FP, TN, and FN
    for each class in a multilabel classification problem.

    Parameters:
    ----------
    predictions_df : pandas.DataFrame
        DataFrame containing predicted binary labels (0 or 1) for each class in CLASSES.

    ground_truth_df : pandas.DataFrame
        DataFrame containing ground truth binary labels for each class in CLASSES.

    Returns:
    -------
    total_cost_value : float
        Total cost across all classes and samples.

    metrics_per_class : dict
        Dictionary with TP, FP, TN, FN counts and cost per class.
    """

    # Align rows by filename and onset
    merged = predictions_df.merge(
        ground_truth_df,
        on=["filename", "onset"],
        suffixes=("_pred", "_true"),
        how="inner",
        validate="one_to_one"
    )

    if merged.shape[0] != predictions_df.shape[0]:
        raise ValueError("Mismatch in alignment between prediction and ground truth rows")

    metrics_per_class = {}

    for cls in CLASSES:
        y_pred = predictions_df[cls].astype(int)
        y_true = ground_truth_df[cls].astype(int)

        TP = ((y_pred == 1) & (y_true == 1)).mean()* 50
        FP = ((y_pred == 1) & (y_true == 0)).mean()* 50
        TN = ((y_pred == 0) & (y_true == 0)).mean()* 50
        FN = ((y_pred == 0) & (y_true == 1)).mean()* 50

        cost = (
            COST_MATRIX[cls]["TP"] * TP +
            COST_MATRIX[cls]["FP"] * FP +
            COST_MATRIX[cls]["TN"] * TN +
            COST_MATRIX[cls]["FN"] * FN
        )

        metrics_per_class[cls] = {
            "TP": TP, "FP": FP, "TN": TN, "FN": FN, "cost": cost
        }

    return sum([metrics_per_class[c]["cost"] for c in metrics_per_class]), metrics_per_class


def aggregate_targets(arr: np.ndarray, f: int = 10) -> np.ndarray:
    """
    Aggregates frame-level ground truths into segment-level by taking the max over fixed-size chunks.

    Parameters:
    ----------
    arr : np.ndarray
        Array of shape (N, D) where N is the number of frames, D is number of classes.

    f : int
        Aggregation factor (number of frames per chunk).

    Returns:
    -------
    np.ndarray
        Aggregated labels of shape (ceil(N/f), D)
    """
    N, D = arr.shape
    full_chunks = N // f
    remainder = N % f

    # Aggregate full chunks
    aggregated = arr[:full_chunks * f].reshape(full_chunks, f, D).max(axis=1)

    # Handle leftover frames
    if remainder > 0:
        tail = arr[full_chunks * f:].max(axis=0, keepdims=True)
        aggregated = np.vstack([aggregated, tail])

    return aggregated


def get_ground_truth_df(filenames: Iterable[str], dataset_path: str) -> pd.DataFrame:
    """
    Loads and aggregates ground truth labels for an arbitrary list of files.

    Parameters:
    ----------
    filenames : Iterable[str]
        List or array of filenames (e.g., from a subset of metadata.csv) to process.

    dataset_path : str
        Path to dataset containing the 'labels/' folder with .npz files.

    Returns:
    -------
    pd.DataFrame
        DataFrame with columns: ["filename", "onset"] + CLASSES
    """
    rows = []

    for fname in filenames:
        base = os.path.splitext(fname)[0]
        label_path = os.path.join(dataset_path, 'labels', f"{base}_labels.npz")
        assert os.path.exists(label_path), f"Missing label file: {label_path}"

        y = np.load(label_path)
        class_matrix = np.stack([y[cls].mean(-1) for cls in CLASSES], axis=1)
        aggregated = aggregate_targets(class_matrix)

        for i, row in enumerate(aggregated):
            onset = round(i * 1.2, 1)
            binary_labels = (row > 0).astype(int).tolist()
            rows.append([fname, onset] + binary_labels)

    return pd.DataFrame(data=rows, columns=["filename", "onset"] + CLASSES)


def get_segment_prediction_df(
    predictions: Dict[str, Dict[str, np.ndarray]],
    class_names: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Aggregates frame-level predictions into fixed-length segments for a set of files.

    Parameters:
    ----------
    predictions : Dict[str, Dict[str, np.ndarray]]
        Dictionary mapping each filename to another dictionary of class-wise frame-level predictions.
        Each class prediction is a 1D NumPy array of shape (T,), where T is time.

    class_names : List[str], optional
        List of class names to include in the output. If None, uses keys from the first file's prediction dict.

    Returns:
    -------
    pd.DataFrame
        DataFrame with columns: ["filename", "onset"] + class_names.
        Each row represents a segment and contains aggregated predictions for that segment.
    """
    if class_names is None:
        class_names = list(next(iter(predictions.values())).keys())

    rows = []

    for filename, class_preds in predictions.items():
        # Collect and stack predictions into shape (T, num_classes)
        frame_matrix = np.stack([class_preds[cls] for cls in class_names], axis=1)

        # Aggregate over fixed-length segments
        aggregated = aggregate_targets(frame_matrix, f=10)

        for seg_idx, segment in enumerate(aggregated):
            onset = round(seg_idx * 1.2, 1)
            rows.append([filename, onset] + segment.tolist())

    return pd.DataFrame(rows, columns=["filename", "onset"] + class_names)


In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate the model
model = M2DWrapper()

# Load the original state dict
data = torch.load("M2D_strong_1.pt")

# Create a new OrderedDict with replaced keys
new_state_dict = OrderedDict()

for k, v in data.items():
    new_key = k.replace("model.", "m2d.", 1)  # Replace ONLY the first occurrence
    new_state_dict[new_key] = v

# You can now save or load this into a model
torch.save(new_state_dict, "M2D_strong_1_m2d.pt")

model

M2DWrapper(
  (m2d): PortableM2D(
    (backbone): LocalViT(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=768, out_features=768, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=768, out_featu

In [5]:
# Example outputs

model.eval()

current = os.getcwd()
audio_folder = Path(f'{current}/mlpc2025_dataset/audio')

print("Audio folder exists:", audio_folder.exists())
print("Is directory:", audio_folder.is_dir())

outputs = []
inputs = [torch.randn((1, 1, 80, 208))]
for input in inputs:

    with torch.no_grad():
        output = model(input)

    outputs.append((input, output.cpu()))

print(f"Processed {len(outputs)} files.")
print(outputs)

Audio folder exists: True
Is directory: True
Processed 1 files.
[(tensor([[[[-0.7138,  0.3219,  0.9397,  ..., -2.1659, -1.7354,  0.1474],
          [ 1.3134,  0.9238,  0.3188,  ...,  1.7226, -0.7648, -0.4899],
          [ 1.5341,  1.3754,  0.9947,  ..., -0.5720, -0.0578,  0.7657],
          ...,
          [-0.4178,  1.1254, -0.1166,  ..., -1.8507,  0.3180,  2.2248],
          [ 0.4263,  0.8289,  0.2129,  ..., -0.5466, -0.6865,  0.1803],
          [-1.0919,  0.4353,  0.2634,  ...,  0.2983, -0.1794,  0.6602]]]]), tensor([[[ 0.7724,  0.2534, -0.9733,  ..., -0.1214, -0.3854, -0.5118],
         [-0.6734, -0.1932, -1.0854,  ...,  0.6122, -1.3088, -0.8926],
         [-0.2172, -0.7379, -0.7474,  ..., -0.9279,  1.2254, -1.6635],
         ...,
         [ 0.1901,  0.3431, -2.4453,  ...,  0.6166, -0.3571, -1.9920],
         [-1.0015, -0.5443,  0.7885,  ...,  0.5753,  0.6722,  0.4565],
         [-1.0392,  0.3696, -1.3452,  ..., -0.5419,  1.9614,  0.1452]]]))]


In [6]:
target_sample_rate = 16000

mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=target_sample_rate,
    n_fft=400,
    hop_length=160,
    n_mels=80  # <- set to 80
)

def load_and_process_audio(file_path, duration_sec=2.1):  # about 2.1 seconds
    waveform, sr = torchaudio.load(file_path)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sr != target_sample_rate:
        waveform = T.Resample(sr, target_sample_rate)(waveform)

    num_samples = int(target_sample_rate * duration_sec)
    if waveform.shape[1] > num_samples:
        waveform = waveform[:, :num_samples]
    elif waveform.shape[1] < num_samples:
        pad = num_samples - waveform.shape[1]
        waveform = torch.nn.functional.pad(waveform, (0, pad))

    mel_spec = mel_transform(waveform)
    mel_spec = torchaudio.functional.amplitude_to_DB(mel_spec, multiplier=10, amin=1e-10, db_multiplier=0)

    # Ensure mel_spec has shape (1, 80, 208) — pad or crop time dimension
    if mel_spec.shape[2] > 208:
        mel_spec = mel_spec[:, :, :208]
    elif mel_spec.shape[2] < 208:
        pad_frames = 208 - mel_spec.shape[2]
        mel_spec = torch.nn.functional.pad(mel_spec, (0, pad_frames))

    mel_spec = mel_spec.unsqueeze(0)  # add batch dim
    return mel_spec  # shape (1, 1, 80, 208)




#### NO NEED TO RUN THIS CELL MORE THEN ONCE:

In [None]:
#def generate_output():
outputs = []

max_iter = len(list(audio_folder.glob('*.mp3')))
audio_files = list(audio_folder.glob('*.mp3'))[:max_iter]  # take only first x files

for audio_file in tqdm(audio_files, desc="Processing audio files"):
    mel_spec = load_and_process_audio(str(audio_file), duration_sec=1.2)  # 1 second audio for enough frames
    mel_spec = mel_spec.to(device)

    with torch.no_grad():
        # Forward pass expects mel spectrogram input
        output = model(mel_spec)

    outputs.append((audio_file.name, output.cpu()))
    
# Save
with open('outputs.pkl', 'wb') as f:
    pickle.dump(outputs, f)
print(f"Processed {len(outputs)} files.")

In [8]:
with open('outputs.pkl', 'rb') as f:
    outputs = pickle.load(f)
print(f"Number of outputs: {len(outputs)}")


Number of outputs: 8230


In [9]:


# Step 1: Convert model outputs into frame-wise probabilities
predictions_dict = {}
for filename, logits in outputs:
    probs = torch.sigmoid(logits.squeeze(0))  # Shape: (T, C)
    probs_np = probs.numpy()
    predictions_dict[filename] = {
        CLASSES[i]: probs_np[:, i] for i in range(len(CLASSES))
    }

# Step 2: Create segment-level prediction DataFrame
segment_df = get_segment_prediction_df(predictions_dict)

# Step 3: Load ground truth for selected filenames only
filenames = [f.name for f in audio_files]  # From your earlier loop
labels_path = Path(f'{current}/mlpc2025_dataset')
ground_truth_df = get_ground_truth_df(filenames, labels_path)

# Step 4: Optimize thresholds class-by-class
best_thresholds = {}
threshold_range = np.arange(0.1, 0.91, 0.05)

print("LOSS ON TRAINING DATA\n")

for cls in CLASSES:
    min_cost = float('inf')
    best_thresh = 0.5

    for thresh in threshold_range:
        temp_df = deepcopy(segment_df)

        # Apply threshold only to the current class
        temp_df[cls] = (temp_df[cls] > thresh).astype(int)
        
        # Keep other classes binarized at 0.5
        for other_cls in CLASSES:
            if other_cls != cls:
                temp_df[other_cls] = (temp_df[other_cls] > 0.5).astype(int)

        # Compute metrics
        _, class_metrics = total_cost(temp_df, ground_truth_df)

        # Use only this class's cost for optimization
        cost = class_metrics[cls]['cost']

        if cost < min_cost:
            min_cost = cost
            best_thresh = thresh

    best_thresholds[cls] = best_thresh
    print(f"✅ Best threshold for {cls:15}: {best_thresh:.2f} (Class cost: {min_cost:.2f})")

# Step 5: Apply best thresholds to all classes
final_df = deepcopy(segment_df)
for cls in CLASSES:
    final_df[cls] = (final_df[cls] > best_thresholds[cls]).astype(int)

# Step 6: Compute final cost and detailed metrics
total, class_metrics = total_cost(final_df, ground_truth_df)

# Step 7: Display results
print(f"\n🎯 Total Cost after Threshold Optimization: {total:.2f}")
print("\n📊 Per-Class Metrics:")

for cls, metrics in class_metrics.items():
    # Round metrics for interpretability
    TP = round(metrics['TP'], 1)
    FP = round(metrics['FP'], 1)
    FN = round(metrics['FN'], 1)
    TN = round(metrics.get('TN', 0), 1)  # Optional TN
    cost = round(metrics['cost'], 2)

    print(f"{cls:15} → TP: {TP:>5}, FP: {FP:>5}, FN: {FN:>5}, TN: {TN:>5}, Cost: {cost:6.2f}")


LOSS ON TRAINING DATA

✅ Best threshold for Speech         : 0.90 (Class cost: 3.08)
✅ Best threshold for Shout          : 0.90 (Class cost: 1.04)
✅ Best threshold for Chainsaw       : 0.90 (Class cost: 0.82)
✅ Best threshold for Jackhammer     : 0.90 (Class cost: 0.55)
✅ Best threshold for Lawn Mower     : 0.90 (Class cost: 0.78)
✅ Best threshold for Power Drill    : 0.90 (Class cost: 1.54)
✅ Best threshold for Dog Bark       : 0.90 (Class cost: 0.81)
✅ Best threshold for Rooster Crow   : 0.90 (Class cost: 0.09)
✅ Best threshold for Horn Honk      : 0.90 (Class cost: 1.95)
✅ Best threshold for Siren          : 0.90 (Class cost: 2.08)

🎯 Total Cost after Threshold Optimization: 12.75

📊 Per-Class Metrics:
Speech          → TP:   0.0, FP:   0.2, FN:   0.6, TN:   4.4, Cost:   3.08
Shout           → TP:   0.0, FP:   0.1, FN:   0.1, TN:   5.0, Cost:   1.04
Chainsaw        → TP:   0.0, FP:   0.0, FN:   0.1, TN:   5.2, Cost:   0.82
Jackhammer      → TP:   0.0, FP:   0.0, FN:   0.0, TN:   5.2

#### NO NEED TO RUN THIS CELL MORE THEN ONCE:

In [None]:
#def generate_output():
outputs_test = []

current = os.getcwd()
audio_folder = Path(f'{current}/mlpc2025_test/audio')

max_iter = len(list(audio_folder.glob('*.mp3')))
audio_files = list(audio_folder.glob('*.mp3'))[:max_iter]  # take only first x files

for audio_file in tqdm(audio_files, desc="Processing audio files"):
    mel_spec = load_and_process_audio(str(audio_file), duration_sec=1.2)  # 1 second audio for enough frames
    mel_spec = mel_spec.to(device)

    with torch.no_grad():
        # Forward pass expects mel spectrogram input
        output = model(mel_spec)

    outputs_test.append((audio_file.name, output.cpu()))
    
# Save
with open('outputs_test.pkl', 'wb') as f:
    pickle.dump(outputs_test, f)
print(f"Processed {len(outputs_test)} files.")

In [10]:
with open('outputs_test.pkl', 'rb') as f:
    outputs_test = pickle.load(f)
print(f"Number of outputs: {len(outputs_test)}")


Number of outputs: 2742


In [14]:
# Step 1: Convert model outputs into frame-wise probabilities
predictions_dict_test = {}
for filename, logits in outputs_test:
    probs = torch.sigmoid(logits.squeeze(0))  # Shape: (T, C)
    probs_np = probs.numpy()
    predictions_dict_test[filename] = {
        CLASSES[i]: probs_np[:, i] for i in range(len(CLASSES))
    }

# Step 2: Create segment-level prediction DataFrame
segment_df_test = get_segment_prediction_df(predictions_dict_test)

# Step 3: Load ground truth for selected filenames only
csv_path = f'{current}/mlpc2025_test/ground_truth.csv'
ground_truth_df_test = pd.read_csv(csv_path)

# Step 4: Optimize thresholds class-by-class
best_thresholds_test = {}
threshold_range_test = np.arange(0.1, 0.91, 0.05)

print("LOSS ON TEST DATA\n")

for cls in CLASSES:
    min_cost = float('inf')
    best_thresh = 0.5

    for thresh in threshold_range_test:
        temp_df = deepcopy(segment_df_test)

        # Apply threshold only to the current class
        temp_df[cls] = (temp_df[cls] > thresh).astype(int)
        
        # Keep other classes binarized at 0.5
        for other_cls in CLASSES:
            if other_cls != cls:
                temp_df[other_cls] = (temp_df[other_cls] > 0.5).astype(int)

        # Compute metrics
        _, class_metrics = total_cost(temp_df, ground_truth_df_test)

        # Use only this class's cost for optimization
        cost = class_metrics[cls]['cost']

        if cost < min_cost:
            min_cost = cost
            best_thresh = thresh

    best_thresholds_test[cls] = best_thresh
    print(f"✅ Best threshold for {cls:15}: {best_thresh:.2f} (Class cost: {min_cost:.2f})")

# Step 5: Apply best thresholds to all classes
final_df_test = deepcopy(segment_df_test)
for cls in CLASSES:
    final_df_test[cls] = (final_df_test[cls] > best_thresholds_test[cls]).astype(int)

# Step 6: Compute final cost and detailed metrics
total, class_metrics = total_cost(final_df_test, ground_truth_df_test)

# Step 7: Display results
print(f"\n🎯 Total Cost after Threshold Optimization: {total:.2f}")
print("\n📊 Per-Class Metrics:")

for cls, metrics in class_metrics.items():
    # Round metrics for interpretability
    TP = round(metrics['TP'], 1)
    FP = round(metrics['FP'], 1)
    FN = round(metrics['FN'], 1)
    TN = round(metrics.get('TN', 0), 1)  # Optional TN
    cost = round(metrics['cost'], 2)

    print(f"{cls:15} → TP: {TP:>5}, FP: {FP:>5}, FN: {FN:>5}, TN: {TN:>5}, Cost: {cost:6.2f}")


LOSS ON TEST DATA

✅ Best threshold for Speech         : 0.90 (Class cost: 0.10)
✅ Best threshold for Shout          : 0.90 (Class cost: 0.04)
✅ Best threshold for Chainsaw       : 0.90 (Class cost: 0.05)
✅ Best threshold for Jackhammer     : 0.90 (Class cost: 0.14)
✅ Best threshold for Lawn Mower     : 0.90 (Class cost: 0.05)
✅ Best threshold for Power Drill    : 0.90 (Class cost: 0.14)
✅ Best threshold for Dog Bark       : 0.90 (Class cost: 0.05)
✅ Best threshold for Rooster Crow   : 0.90 (Class cost: 0.01)
✅ Best threshold for Horn Honk      : 0.90 (Class cost: 0.14)
✅ Best threshold for Siren          : 0.90 (Class cost: 0.23)

🎯 Total Cost after Threshold Optimization: 0.94

📊 Per-Class Metrics:
Speech          → TP:   0.0, FP:   0.1, FN:   0.0, TN:   5.2, Cost:   0.10
Shout           → TP:   0.0, FP:   0.0, FN:   0.0, TN:   5.2, Cost:   0.04
Chainsaw        → TP:   0.0, FP:   0.0, FN:   0.0, TN:   5.2, Cost:   0.05
Jackhammer      → TP:   0.0, FP:   0.0, FN:   0.0, TN:   5.2, Cos

### No training required?