In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that files
# stored under /content/gdrive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
import glob
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import numpy as np
from datetime import datetime


from model import CNNWithGAP
# Location of the trained checkpoint on the mounted drive.
MODEL_PATH = "/content/gdrive/MyDrive/best_cnn.pth"
# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the 4-class network and move it to the selected device.
model = CNNWithGAP(n_classes=4)
model = model.to(DEVICE)

# Restore the trained weights when the checkpoint is present; otherwise keep
# the freshly initialised weights and tell the user about it.
if not os.path.exists(MODEL_PATH):
    print(f"Warning: Model file not found at {MODEL_PATH}. Using an untrained model.")
else:
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))

# Switch to inference mode.
model.eval()


# -----------------------------
# Segmentation + Prediction
# -----------------------------
def segment_and_predict(y, sr, model, segment_length=10, hop_length=10, conf_threshold=0.3,
                        class_weights=(1.0, 1.5, 1.5, 1.5)):
    """Slide a fixed-length window over a waveform and classify every window.

    Each window is converted to a 128-band mel spectrogram in dB (relative to
    the window's own maximum), passed through ``model``, and the soft-maxed
    class probabilities are re-weighted by ``class_weights`` and re-normalised.
    The arg-max class is reported when its re-weighted probability reaches
    ``conf_threshold``.

    Only complete windows are analysed: audio shorter than one segment yields
    an empty list, and a trailing remainder shorter than one segment is
    ignored.

    Args:
        y: 1-D audio waveform (the samples themselves, not a file path).
        sr: Sample rate of ``y`` in Hz.
        model: Callable classifier. Input tensors are placed on the device of
            the model's first parameter (CPU if it exposes no parameters).
        segment_length: Window length in seconds.
        hop_length: Step between consecutive window starts, in seconds.
        conf_threshold: Minimum re-weighted probability for a window to be
            kept.
        class_weights: Multiplicative weight per model output class, in output
            order (index ``i`` corresponds to ``category_id`` ``i + 1``). The
            default boosts every class except the first by 1.5.

    Returns:
        A list of dicts, one per retained window, with keys ``category_id``
        (1-based class index), ``start_time``, ``end_time``, ``duration``
        (seconds, rounded to 3 decimals) and ``score`` (float).

    Raises:
        ValueError: If the segment or hop length is not at least one sample.
    """
    seg_samples = int(segment_length * sr)
    hop_samples = int(hop_length * sr)
    # A zero/negative hop or segment would either loop over empty slices or
    # make range() fail with an unrelated message, so reject it up front.
    if seg_samples <= 0 or hop_samples <= 0:
        raise ValueError(
            "segment_length and hop_length must each span at least one sample "
            f"(got {seg_samples} and {hop_samples} samples)"
        )

    # Follow the model's own device instead of relying on a module-level
    # global, so the function works for any model passed in.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cpu")

    results = []
    # Process the audio in segments (complete windows only)
    for start in range(0, len(y) - seg_samples + 1, hop_samples):
        end = start + seg_samples
        seg_y = y[start:end]

        # Feature extraction: Mel spectrogram
        mel = librosa.feature.melspectrogram(y=seg_y, sr=sr, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)

        # Prepare tensor for the model: two leading singleton axes are added
        # (presumably batch and channel -- confirm against the model definition)
        X = torch.tensor(mel_db, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

        # Get model prediction
        with torch.no_grad():
            out = model(X)
            probs = F.softmax(out, dim=1).cpu().numpy()[0]

        # Re-weight the class probabilities, then re-normalise so they sum to 1
        probs = probs * np.asarray(class_weights, dtype=probs.dtype)
        probs = probs / probs.sum()

        best = int(np.argmax(probs))
        pred_id = best + 1  # 1-based indexing for categories
        conf = float(probs[best])

        # Store result if confidence is above the threshold
        if conf >= conf_threshold:
            results.append({
                "category_id": pred_id,
                "start_time": round(start / sr, 3),
                "end_time": round(end / sr, 3),
                "duration": round((end - start) / sr, 3),
                "score": conf
            })
    return results

# -----------------------------
# Build JSON file
# -----------------------------

# Directory that is searched for the test recordings.
TEST_AUDIO_DIR = "/content/gdrive/MyDrive/20251103"

# Name of the JSON file that will be written, and today's date stamp.
output_file = "output.json"
today = datetime.now().strftime("%Y_%m_%d")

# Skeleton of the submission: the audio and annotation lists start empty and
# the category table numbers the four class names from 1.
submission = {
    "audios": [],
    "categories": [
        {"id": category_id, "name": category_name}
        for category_id, category_name in enumerate(
            ("vessel", "marine_animal", "natural_sound", "other_anthropogenic"),
            start=1,
        )
    ],
    "annotations": [],
}

# Recursively find all .wav files in the test directory.
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the audio ids and annotation ids assigned below reproducible between runs.
wav_files = sorted(glob.glob(os.path.join(TEST_AUDIO_DIR, "**", "*.wav"), recursive=True))
if not wav_files:
    print(f"Error: No .wav files found in {TEST_AUDIO_DIR}. Please check the path.")
else:
    print(f"Found {len(wav_files)} wav files to process.")

# Initialize a global counter for unique annotation IDs
annotation_id_counter = 1

# Walk over every discovered file; audio ids are the 1-based position in wav_files.
for audio_id, file_path in enumerate(wav_files, start=1):
    try:
        # Decode the file as mono audio resampled to 48 kHz and measure its length.
        y, sr = librosa.load(file_path, sr=48000, mono=True)
        total_duration = librosa.get_duration(y=y, sr=sr)

        # Register the file itself (the full path is kept for reference).
        submission["audios"].append(dict(
            id=audio_id,
            file_name=os.path.basename(file_path),
            file_path=file_path,
            duration=round(total_duration, 4),
        ))

        # Classify the recording window by window.
        annotations = segment_and_predict(y, sr, model, conf_threshold=0.3)

        # Give each detection a globally unique id and link it to its audio file.
        for ann in annotations:
            ann.update(id=annotation_id_counter, audio_id=audio_id)
            submission["annotations"].append(ann)
            annotation_id_counter += 1

        print(f"[{audio_id}/{len(wav_files)}] Processed {os.path.basename(file_path)} -> Found {len(annotations)} segments.")

    except Exception as e:
        # Best effort: report the failing file and carry on with the next one.
        print(f"Error processing file {file_path}: {e}")

# Serialise the whole submission with 4-space indentation and write it out.
with open(output_file, "w") as f:
    f.write(json.dumps(submission, indent=4))

# Report what was written.
print(f"\n✅ Successfully generated submission file: {output_file}")
print(f"Total audio files processed: {len(submission['audios'])}")
print(f"Total annotations generated: {len(submission['annotations'])}")

Found 60 wav files to process.
[1/60] Processed S2_Test_001.wav -> Found 38 segments.
[2/60] Processed S2_Test_003.wav -> Found 20 segments.
[3/60] Processed S2_Test_002.wav -> Found 24 segments.
[4/60] Processed S2_Test_004.wav -> Found 40 segments.
[5/60] Processed S2_Test_006.wav -> Found 18 segments.
[6/60] Processed S2_Test_005.wav -> Found 50 segments.
[7/60] Processed S2_Test_007.wav -> Found 36 segments.
[8/60] Processed S2_Test_008.wav -> Found 36 segments.
[9/60] Processed S2_Test_009.wav -> Found 50 segments.
[10/60] Processed S2_Test_012.wav -> Found 36 segments.
[11/60] Processed S2_Test_011.wav -> Found 24 segments.
[12/60] Processed S2_Test_014.wav -> Found 9 segments.
[13/60] Processed S2_Test_015.wav -> Found 30 segments.
[14/60] Processed S2_Test_013.wav -> Found 18 segments.
[15/60] Processed S2_Test_010.wav -> Found 20 segments.
[16/60] Processed S2_Test_016.wav -> Found 24 segments.
[17/60] Processed S2_Test_017.wav -> Found 26 segments.
[18/60] Processed S2_Test_0