In [None]:
# Attach Google Drive to the Colab filesystem so the dataset folders are readable.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
import torch
from torch import nn
import torch.optim as optim
from torchvision import models, transforms
import torchaudio, librosa
from torch.utils.data import Dataset,DataLoader
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import os
from skimage.transform import resize
import random
import torchaudio.transforms as T

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Seed every random number generator used by the notebook with the same value.
seed = 42
for seed_fn in (
    random.seed,                  # Python's built-in RNG
    np.random.seed,               # NumPy's legacy global RNG
    torch.manual_seed,            # PyTorch default generator
    torch.cuda.manual_seed_all,   # PyTorch generators on all CUDA devices
):
    seed_fn(seed)

In [None]:
# Dataset locations inside the mounted Google Drive.
DRIVE_ROOT = "/content/drive/MyDrive"
train_dir = f"{DRIVE_ROOT}/train"
test_dir = f"{DRIVE_ROOT}/test"


In [None]:
# Build the training table: one row per audio file, labelled with the name of
# the class sub-folder it lives in.
#
# os.listdir returns entries in arbitrary order, so both directory levels are
# sorted. Without this the row order (and therefore the seeded train/val split
# and the order of the submission rows) can differ between runs or machines.
train_paths = []
train_labels = []
for class_folder in sorted(os.listdir(train_dir)):
    class_path = os.path.join(train_dir, class_folder)

    # Ignore stray files sitting next to the class folders.
    if not os.path.isdir(class_path):
        continue

    for file in sorted(os.listdir(class_path)):
        # Same filter as the test set below: only .wav files are audio clips;
        # anything else in the folder would make the audio loader fail later.
        if not file.endswith(".wav"):
            continue
        train_paths.append(os.path.join(class_path, file))
        train_labels.append(class_folder)


train_df = pd.DataFrame({
    'FilePath': train_paths,
    'Class': train_labels
})

# The test folder is flat and unlabelled: just collect the .wav files.
test_files = [
    os.path.join(test_dir, f)
    for f in sorted(os.listdir(test_dir))
    if f.endswith(".wav")
]
test_df = pd.DataFrame({"FilePath": test_files})


In [None]:
# Encode the class names as integer ids for the loss function.
#
# The original string labels are kept in a separate "ClassName" column and the
# encoder is always fitted on that column. This makes the cell safe to re-run:
# fitting directly on "Class" a second time would fit the encoder on the
# integer ids, and inverse_transform would then return integers instead of
# class names.
label_encoder = LabelEncoder()
if "ClassName" not in train_df.columns:
    train_df["ClassName"] = train_df["Class"]
train_df["Class"] = label_encoder.fit_transform(train_df["ClassName"])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled clips for validation.
split_options = dict(
    test_size=0.2,
    stratify=train_df["Class"],  # keep the class distribution similar in both parts
    random_state=seed,           # same split on every run
)
train_data, val_data = train_test_split(train_df, **split_options)


In [None]:
class AudioDataset(Dataset):
    """Dataset that serves audio clips as 3-channel log-mel spectrogram tensors.

    Args:
        df: DataFrame with a "FilePath" column and, for labelled data, a
            "Class" column holding integer class ids.
        augment: When True, SpecAugment (frequency + time masking) is applied
            to every spectrogram. Intended for the training split only.

    Each item is a ``(spectrogram, label)`` pair where ``spectrogram`` is a
    float tensor of shape (3, 128, 256) and ``label`` is a long tensor; the
    label is -1 when the DataFrame has no "Class" column.
    """

    def __init__(self, df, augment=False):
        self.df = df
        self.augment = augment

        # SpecAugment transforms
        self.freq_mask = T.FrequencyMasking(freq_mask_param=15)
        self.time_mask = T.TimeMasking(time_mask_param=25)

    def __len__(self):
        """Return the number of clips in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(spectrogram, label)`` tensors."""
        row = self.df.iloc[idx]
        file_path = row["FilePath"]
        # Unlabelled (test) frames have no "Class" column; use -1 as a placeholder.
        y = row.get("Class", -1)

        # Mel-spectrogram as a (1, 128, 256) tensor: (channel, freq, time).
        spec = self._melspec(file_path)
        spec_tensor = torch.tensor(spec, dtype=torch.float).unsqueeze(0)

        # SpecAugment is only wanted while training.
        if self.augment:
            spec_tensor = self._spec_augment(spec_tensor)

        # ResNet18 expects 3 input channels, so replicate the single channel.
        spec_tensor = spec_tensor.repeat(3, 1, 1)

        return spec_tensor, torch.tensor(y, dtype=torch.long)

    def _melspec(self, path):
        """Return a (128, 256) log-mel spectrogram for the audio file at ``path``.

        At most the first 5 seconds are read, resampled to 22050 Hz. The dB
        spectrogram is resized to a fixed 128 x 256 grid so clips of different
        lengths produce equally sized inputs.
        """
        sr = 22050
        signal, _ = librosa.load(path, sr=sr, duration=5)
        mel = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
        # ref=np.max puts the loudest bin at 0 dB; every other value is negative.
        mel_db = librosa.power_to_db(mel, ref=np.max)
        mel_resized = resize(mel_db, (128, 256), anti_aliasing=True)
        return mel_resized

    def _spec_augment(self, spec_tensor):
        """Apply one frequency mask and one time mask to ``spec_tensor``.

        The masked regions are filled with the spectrogram's mean. The default
        fill of 0.0 is wrong for this data: the spectrogram is in dB relative
        to its peak, so 0.0 is the loudest possible value and the masks would
        paint maximum-energy stripes instead of hiding information.
        """
        # Computed once, before any masking, so both masks use the same fill.
        fill_value = float(spec_tensor.mean())
        spec_tensor = self.freq_mask(spec_tensor, mask_value=fill_value)
        spec_tensor = self.time_mask(spec_tensor, mask_value=fill_value)
        return spec_tensor


In [None]:
BATCH_SIZE = 16

# Settings shared by all three loaders.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=2)

# Only the training loader shuffles and augments; validation and test are
# read in a fixed order without augmentation.
train_loader = DataLoader(
    AudioDataset(train_data, augment=True), shuffle=True, **loader_options
)
val_loader = DataLoader(
    AudioDataset(val_data, augment=False), shuffle=False, **loader_options
)
test_loader = DataLoader(
    AudioDataset(test_df, augment=False), shuffle=False, **loader_options
)

In [None]:
# ImageNet-pretrained ResNet18 with its classifier replaced by a
# dropout + linear head sized for the audio classes.
num_classes = len(label_encoder.classes_)
resnet = models.resnet18(weights="IMAGENET1K_V1")
head_in_features = resnet.fc.in_features
resnet.fc = nn.Sequential(
    nn.Dropout(0.4),
    nn.Linear(head_in_features, num_classes),
)
model = resnet.to(device)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 207MB/s]


In [None]:
# Multi-class classification loss.
criterion = nn.CrossEntropyLoss()

# AdamW over all model parameters.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=1e-4,
    weight_decay=1e-3,
)

# Halve the learning rate when the monitored value (the validation loss passed
# to scheduler.step) has not decreased for 3 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)


In [None]:
def _run_epoch(loader, training):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With ``training=True`` the model is put in train mode and the optimizer is
    stepped on every batch; otherwise the model is put in eval mode and
    gradients are disabled.
    """
    model.train(training)
    loss_sum, hits = 0, 0
    with torch.set_grad_enabled(training):
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            if training:
                optimizer.zero_grad()
            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            if training:
                batch_loss.backward()
                optimizer.step()
            # Weight by batch size so the final value is a per-sample mean.
            loss_sum += batch_loss.item() * inputs.size(0)
            hits += (logits.argmax(1) == targets).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, hits / n_samples


best_val_loss = float('inf')
patience, wait = 5, 0   # early stopping: allowed epochs without improvement / current count
epochs = 25

for epoch in range(epochs):
    train_loss, train_acc = _run_epoch(train_loader, training=True)
    val_loss, val_acc = _run_epoch(val_loader, training=False)

    # The plateau scheduler is driven by the validation loss.
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{epochs} | Train {train_loss:.4f}/{train_acc:.4f} | Val {val_loss:.4f}/{val_acc:.4f}")

    if val_loss < best_val_loss:
        # New best: reset the counter and checkpoint the weights.
        best_val_loss, wait = val_loss, 0
        torch.save(model.state_dict(), "audio_model_best.pth")
        continue

    # No improvement this epoch; stop once the patience budget is used up.
    wait += 1
    if wait >= patience:
        break

Epoch 1/25 | Train 0.3012/0.8975 | Val 0.1603/0.9464
Epoch 2/25 | Train 0.2346/0.9221 | Val 0.0866/0.9696
Epoch 3/25 | Train 0.1813/0.9380 | Val 0.0973/0.9681
Epoch 4/25 | Train 0.1656/0.9424 | Val 0.0726/0.9783
Epoch 5/25 | Train 0.1272/0.9572 | Val 0.0794/0.9681
Epoch 6/25 | Train 0.1051/0.9667 | Val 0.0730/0.9783
Epoch 7/25 | Train 0.0896/0.9703 | Val 0.1463/0.9594
Epoch 8/25 | Train 0.0767/0.9736 | Val 0.0873/0.9696
Epoch 9/25 | Train 0.0609/0.9822 | Val 0.0457/0.9841
Epoch 10/25 | Train 0.0451/0.9880 | Val 0.0535/0.9826
Epoch 11/25 | Train 0.0261/0.9924 | Val 0.0536/0.9812
Epoch 12/25 | Train 0.0250/0.9949 | Val 0.0589/0.9812
Epoch 13/25 | Train 0.0352/0.9906 | Val 0.0575/0.9783
Epoch 14/25 | Train 0.0261/0.9924 | Val 0.0459/0.9870


In [None]:
# Restore the weights with the lowest validation loss before predicting.
# The training loop checkpoints them to "audio_model_best.pth", but the
# in-memory model holds the weights of the last epoch that ran, which after
# early stopping is several epochs past the best one.
model.load_state_dict(torch.load("audio_model_best.pth", map_location=device))
model.eval()

preds = []

with torch.no_grad():
    # The test set is unlabelled, so the label half of each batch is ignored.
    for x, _ in test_loader:
        outputs = model(x.to(device))
        preds.extend(outputs.argmax(1).cpu().tolist())

file_names = test_df["FilePath"].tolist()

# Convert predicted indices to class labels
pred_labels = label_encoder.inverse_transform(preds)


In [None]:
# One row per test clip: the file name as ID and the predicted class name.
submission_ids = test_df["FilePath"].map(os.path.basename).tolist()
submission_classes = label_encoder.inverse_transform(preds)
submission_df = pd.DataFrame({"ID": submission_ids, "Class": submission_classes})

In [None]:
# Preview the submission table; as the cell's last expression it is rendered by the notebook.
submission_df

Unnamed: 0,ID,Class
0,103249-5-0-10.wav,engine_idling
1,101848-9-0-8.wav,street_music
2,106014-5-0-0.wav,engine_idling
3,101848-9-0-3.wav,street_music
4,103249-5-0-13.wav,engine_idling
...,...,...
735,84699-4-6-0.wav,drilling
736,99192-4-0-1.wav,drilling
737,89442-9-0-34.wav,street_music
738,98681-9-0-5.wav,street_music


In [None]:
# Write the predictions to Google Drive without the DataFrame index column.
SUBMISSION_PATH = "/content/drive/MyDrive/submission2.csv"
submission_df.to_csv(SUBMISSION_PATH, index=False)
