In [6]:
import os
import pandas as pd

# Build a CSV manifest (one row per .wav file) for the GAS dataset folders
# listed in LABEL_MAP, then print the per-class sample counts.
GAS_ROOT = "dataset/DATASET/"
OUTPUT_CSV = "dataset/DATASET/gas.csv"

# Column order of the manifest. It is passed explicitly to the DataFrame so the
# frame keeps its schema even when the scan finds no .wav files (otherwise the
# value_counts() call at the end raises KeyError on a column-less frame).
GAS_COLUMNS = ["path", "label", "label_name", "source"]

# Folder (relative to GAS_ROOT) -> (numeric label id, human-readable label name).
# Insertion order determines the order in which folders are scanned.
LABEL_MAP = {
    # ===== Animals =====
    "Animals/cat":         (8,  "cat"),
    "Animals/dog":         (7,  "dog"),
    "Animals/elephant":    (9,  "elephant"),
    "Animals/horse":       (10, "horse"),
    "Animals/lion":        (11, "lion"),

    # ===== Birds =====
    "Birds/crow":          (12, "crow"),
    "Birds/parrot":        (13, "parrot"),
    "Birds/peacock":       (14, "peacock"),
    "Birds/sparrow":       (15, "sparrow"),

    # ===== Environment (MILITARY ignored) =====
    "Environment/crowd":    (16, "crowd"),
    "Environment/office":   (17, "office"),
    "Environment/rainfall": (18, "rainfall"),
    "Environment/traffic":  (19, "traffic"),
    "Environment/wind":     (20, "wind"),

    # ===== Vehicles =====
    "Vehicles/airplane":   (21, "airplane"),
    "Vehicles/bicycle":    (22, "bicycle"),
    "Vehicles/bike":       (23, "bike"),
    "Vehicles/bus":        (24, "bus"),
    "Vehicles/car":        (25, "car"),
    # Helicopter uses id 5 rather than an id in the 7-27 range used by the
    # other folders; 5 is the id the MAD label notes in this notebook list for
    # "helicopter", so the two sources share that class.
    "Vehicles/helicopter": (5,  "helicopter"),

    "Vehicles/train":      (26, "train"),
    "Vehicles/truck":      (27, "truck"),
}

rows = []

for rel_path, (label_id, label_name) in LABEL_MAP.items():
    abs_dir = os.path.join(GAS_ROOT, rel_path)

    # A missing class folder is reported and skipped instead of aborting the scan.
    if not os.path.isdir(abs_dir):
        print(f"⚠️ Missing folder: {abs_dir}")
        continue

    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the manifest row order deterministic, which a seeded
    # train/test split of this CSV needs in order to be reproducible.
    for fname in sorted(os.listdir(abs_dir)):
        # Extension check is case-insensitive, so ".WAV" files are kept too.
        if not fname.lower().endswith(".wav"):
            continue

        rows.append({
            # Stored path starts at "DATASET/...", not at GAS_ROOT.
            "path": os.path.join("DATASET", rel_path, fname),
            "label": label_id,
            "label_name": label_name,
            "source": "GAS"
        })

df_gas = pd.DataFrame(rows, columns=GAS_COLUMNS)
df_gas.to_csv(OUTPUT_CSV, index=False)

print(f"✅ GAS CSV created: {len(df_gas)} samples")
print(df_gas["label_name"].value_counts())


✅ GAS CSV created: 22195 samples
label_name
bus           4221
train         2552
office        1376
rainfall      1174
sparrow       1162
wind          1150
traffic       1111
crow          1095
cat           1032
crowd          918
parrot         834
horse          740
airplane       673
bicycle        617
dog            596
elephant       539
bike           537
lion           523
peacock        497
helicopter     353
truck          265
car            230
Name: count, dtype: int64


In [5]:
# Per-class sample counts for the MAD training manifest.
training_MAD_csv = "dataset/MAD_dataset/training.csv"
df_mad = pd.read_csv(training_MAD_csv)
print(f"classes in MAD: {df_mad['label'].nunique()} classes")

# MAD label ids:
#   0 - communications
#   1 - gunshot
#   2 - footsteps
#   3 - shelling
#   4 - vehicle
#   5 - helicopter   (marked MERGED: the GAS LABEL_MAP also maps helicopter to 5)
#   6 - fighter

# Summing a boolean mask counts the rows carrying a given label id.
mad_labels = df_mad["label"]
communications_samples = int((mad_labels == 0).sum())
gunshot_samples = int((mad_labels == 1).sum())
footsteps_samples = int((mad_labels == 2).sum())
shelling_samples = int((mad_labels == 3).sum())
vehicle_samples = int((mad_labels == 4).sum())
helicopter_samples = int((mad_labels == 5).sum())
fighter_samples = int((mad_labels == 6).sum())

# One line per class, in label-id order.
print("\n".join(
    f"{class_name} samples: {class_count}"
    for class_name, class_count in (
        ("communications", communications_samples),
        ("gunshot", gunshot_samples),
        ("footsteps", footsteps_samples),
        ("shelling", shelling_samples),
        ("vehicle", vehicle_samples),
        ("helicopter", helicopter_samples),
        ("fighter", fighter_samples),
    )
))

total_mad_samples = len(df_mad)
print(f"total MAD samples: {total_mad_samples}")


classes in MAD: 7 classes
communications samples: 774
gunshot samples: 1293
footsteps samples: 773
shelling samples: 883
vehicle samples: 910
helicopter samples: 934
fighter samples: 862
total MAD samples: 6429


In [7]:
# Pool the MAD training and test manifests to report whole-dataset class counts.
df_training_mad = pd.read_csv("dataset/MAD_dataset/training.csv")
df_test_mad = pd.read_csv("dataset/MAD_dataset/test.csv")
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each file's own index.
df_combined_mad = pd.concat([df_training_mad, df_test_mad], ignore_index=True)
print(f"combined MAD samples: {len(df_combined_mad)} samples")

# Summing a boolean mask counts the rows carrying a given label id (0-6).
combined_labels = df_combined_mad["label"]
communications_samples_combined = int((combined_labels == 0).sum())
gunshot_samples_combined = int((combined_labels == 1).sum())
footsteps_samples_combined = int((combined_labels == 2).sum())
shelling_samples_combined = int((combined_labels == 3).sum())
vehicle_samples_combined = int((combined_labels == 4).sum())
helicopter_samples_combined = int((combined_labels == 5).sum())
fighter_samples_combined = int((combined_labels == 6).sum())

# One line per class, in label-id order.
print("\n".join(
    f"combined {class_name} samples: {class_count}"
    for class_name, class_count in (
        ("communications", communications_samples_combined),
        ("gunshot", gunshot_samples_combined),
        ("footsteps", footsteps_samples_combined),
        ("shelling", shelling_samples_combined),
        ("vehicle", vehicle_samples_combined),
        ("helicopter", helicopter_samples_combined),
        ("fighter", fighter_samples_combined),
    )
))


combined MAD samples: 7466 samples
combined communications samples: 981
combined gunshot samples: 1573
combined footsteps samples: 877
combined shelling samples: 987
combined vehicle samples: 1032
combined helicopter samples: 1025
combined fighter samples: 991


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the GAS manifest 80/20 into train and test CSVs, stratified by label id.
GAS_CSV = "dataset/DATASET/gas.csv"
TRAIN_OUT = "dataset/DATASET/gas_train.csv"
TEST_OUT = "dataset/DATASET/gas_test.csv"

# Reload the manifest from disk rather than reusing an in-memory frame.
df_gas = pd.read_csv(GAS_CSV)

print(f"Total GAS samples: {len(df_gas)}")
print(f"Classes: {df_gas['label_name'].nunique()}")

# Stratifying on the numeric label keeps every class's share the same in both
# parts; the fixed random_state makes the shuffle repeatable for a given input.
df_train, df_test = train_test_split(
    df_gas,
    stratify=df_gas["label"],
    test_size=0.2,
    random_state=42,
)

df_train.to_csv(TRAIN_OUT, index=False)
df_test.to_csv(TEST_OUT, index=False)

print("✅ GAS split complete")
print(f"Train samples: {len(df_train)}")
print(f"Test samples: {len(df_test)}")

# Per-class counts for each part, train first.
for split_name, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"\n{split_name} distribution:")
    print(split_frame["label_name"].value_counts())


Total GAS samples: 22195
Classes: 22
✅ GAS split complete
Train samples: 17756
Test samples: 4439

Train distribution:
label_name
bus           3377
train         2042
office        1101
rainfall       939
sparrow        929
wind           920
traffic        889
crow           876
cat            826
crowd          734
parrot         667
horse          592
airplane       538
bicycle        494
dog            477
elephant       431
bike           430
lion           418
peacock        398
helicopter     282
truck          212
car            184
Name: count, dtype: int64

Test distribution:
label_name
bus           844
train         510
office        275
rainfall      235
sparrow       233
wind          230
traffic       222
crow          219
cat           206
crowd         184
parrot        167
horse         148
airplane      135
bicycle       123
dog           119
elephant      108
bike          107
lion          105
peacock        99
helicopter     71
truck          53
car            46

In [9]:
# Leakage guard: no file path may appear in both the train and the test split.
assert not set(df_train["path"]).intersection(df_test["path"])
# Coverage guard: both splits must contain exactly the same set of label ids.
assert set(df_test["label"]) == set(df_train["label"])


In [11]:
# Tag every row of the MAD manifests with source="MAD" (the GAS manifest rows
# carry source="GAS"). assign() adds the column, or overwrites it if the file
# already has one.
mad_test = pd.read_csv("dataset/MAD_dataset/test.csv").assign(source="MAD")
mad_train = pd.read_csv("dataset/MAD_dataset/training.csv").assign(source="MAD")

# The tagged frames are written back over the original CSV files.
mad_test.to_csv("dataset/MAD_dataset/test.csv", index=False)
mad_train.to_csv("dataset/MAD_dataset/training.csv", index=False)
