In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the CSV paths
# used later in this notebook (under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import os

In [3]:
# Source CSV handed to preprocess_csv() as `input_csv`; it must provide the
# "clean name" and "tag" columns that the function reads.
INPUT_CSV = "/content/drive/MyDrive/Mezahim Pizuzim Ve Azakot/cnn14_model/labels_uncleaned_with_predictions.csv"
# Destination CSV handed to preprocess_csv() as `output_csv`; it is written via
# DataFrame.to_csv with columns filename, war_sounds, animals, other.
OUTPUT_CSV = "/content/drive/MyDrive/Mezahim Pizuzim Ve Azakot/cnn14_model/labels_cleaned3.csv"

In [4]:
def _encode_tag(tag):
    """Convert one raw ``tag`` cell into ``[war_sounds, animals, other]`` flags.

    The cell may be a number (``1``, ``1.0``), a numeric string (``"1"``,
    ``"1.0"``) or a comma-separated list of them (``"4,3"``).

    Label meaning: 0 = other, 1 = war sounds, 3 = animals, 4 = planes (merged
    into war sounds). ``other`` is only set when neither of the two specific
    classes is present.

    Returns:
        A list of three 0/1 ints, or ``None`` when the cell is missing,
        contains a token that is not an integer, or contains no recognised
        label (the row should then be skipped).
    """
    if pd.isna(tag):
        return None

    labels = set()
    for token in str(tag).split(","):
        try:
            # Go through float so that "1.0" (what a float64 column yields
            # after str()) is accepted as label 1.
            value = float(token.strip())
        except ValueError:
            return None
        if not value.is_integer():  # also rejects nan / inf
            return None
        labels.add(int(value))

    war_sounds = int(1 in labels or 4 in labels)
    animals = int(3 in labels)
    other = int(0 in labels and not (war_sounds or animals))

    # No recognised class at all (e.g. "2" or "2,5"): treat as invalid rather
    # than emitting an all-zero row.
    if not (war_sounds or animals or other):
        return None
    return [war_sounds, animals, other]


def preprocess_csv(input_csv, output_csv):
    """Collapse raw tags into a 3-class label CSV.

    Reads ``input_csv``, maps each row's ``tag`` to the one-hot / multi-hot
    columns ``war_sounds``, ``animals`` and ``other``, and writes the result
    to ``output_csv`` with the row's ``clean name`` stored as ``filename``.
    Rows whose tag is missing, malformed or contains no recognised label are
    skipped and counted in the printed report.

    Args:
        input_csv: Path of the CSV to read. Must contain the columns
            ``"clean name"`` and ``"tag"``.
        output_csv: Path of the CSV to write (overwritten if it exists).

    Raises:
        FileNotFoundError: ``input_csv`` does not exist.
        ValueError: ``input_csv`` is a zero-byte file.
        KeyError: a required column is missing from ``input_csv``.
        Exception: whatever ``pd.read_csv`` raises for an unparsable file;
            the first lines of the file are printed before re-raising.
    """
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Input CSV not found at: {input_csv}")

    if os.path.getsize(input_csv) == 0:
        raise ValueError(f"Input CSV is empty: {input_csv}")

    try:
        df = pd.read_csv(input_csv)
    except Exception:
        # Show the head of the file to help diagnose the parse failure.
        # errors="replace" keeps a bad encoding from masking the real error,
        # and iterating the handle avoids loading the whole file.
        with open(input_csv, "r", errors="replace") as f:
            print("First 5 lines of the file:")
            for i, line in zip(range(1, 6), f):
                print(f"Line {i}: {line.strip()}")
        raise  # bare raise keeps the original traceback

    missing = [col for col in ("clean name", "tag") if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required column(s): {missing}")

    records = []
    skipped = []
    for name, tag in zip(df["clean name"], df["tag"]):
        encoded = _encode_tag(tag)
        if encoded is None:
            skipped.append(name)
        else:
            records.append([name] + encoded)

    # Output schema: 'filename' plus the three class columns.
    cleaned_df = pd.DataFrame(
        records, columns=["filename", "war_sounds", "animals", "other"]
    )
    cleaned_df.to_csv(output_csv, index=False)

    # Report
    print(f"Saved {len(cleaned_df)} rows to {output_csv}")
    print(f"Skipped {len(skipped)} rows with invalid tags")
    print(f"Sample skipped rows: {skipped[:5]}")

In [5]:
# Run the cleaning step on the configured paths; a summary is printed below.
preprocess_csv(input_csv=INPUT_CSV, output_csv=OUTPUT_CSV)

Saved 1475 rows to /content/drive/MyDrive/Mezahim Pizuzim Ve Azakot/cnn14_model/labels_cleaned3.csv
Skipped 1111 rows with invalid tags
Sample skipped rows: ['12_20240925_084320.WAV', '12_20240925_114240.WAV', '12_20240925_124120.WAV', '12_20240925_214040.WAV', '12_20240925_225400.WAV']
