## Set up paths and imports

In [None]:
import os

import torch
import torch.nn as nn
from torchvision import transforms

# If the current directory has no ./notebooks folder, step up one level
# (IPython `%cd` magic) before the `src.*` imports below are executed.
# Presumably the notebook is launched from inside ./notebooks — verify.
if not os.path.exists("./notebooks"):
    %cd ..

from src.training import train, validate
from src.dataset import prepare_dataset_loaders
from src.data_processing import load_mean_std
from src.config import DATASET_DIR, PATIENCE_THRESHOLD, VALID_ACCESS_LABELS

# Weights & Biases logging is off by default; do_train() reads this flag.
# The optional W&B cell further down sets it to True.
wandb_enabled = False

## 1. Load standardization data and define Config

In [None]:
class Config:
    """Hyperparameters for a single training run.

    Values are stored as plain instance attributes, so ``vars(config)``
    returns them as a dict with the keys ``learning_rate``, ``epochs`` and
    ``batch_size``.
    """

    def __init__(self, lr=0.001, epochs=40, batch_size=32):
        # The `lr` argument is exposed under the longer attribute name
        # `learning_rate`; the other two keep their argument names.
        self.learning_rate, self.epochs, self.batch_size = lr, epochs, batch_size

### Optionally initialize W&B project

In [None]:
# Optional cell: running it makes `wandb` importable in the notebook and
# switches on W&B logging. The run itself is created later by
# wandb.init() inside do_train(), not here.
import wandb

wandb_enabled = True

## 2. Define training and validation loop

In [None]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
# do_train() reads this module-level value.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def do_train(name, train_loader, val_loader, config, model, criterion, optimizer):
    """Train `model` with early stopping on validation F1 and save its weights.

    Every epoch runs `train(...)` followed by `validate(...)`. An epoch whose
    F1 is lower than the best F1 seen so far increases a patience counter;
    any other epoch resets the counter and updates the best F1. As soon as
    the counter reaches PATIENCE_THRESHOLD the loop stops. The weights the
    model holds at that point (or after the last epoch) are written to
    ``./models/{name}.pth`` and, when W&B is enabled, uploaded to the run.

    Args:
        name: Run name; used for the W&B run and the checkpoint file name.
        train_loader: Loader passed to `train`; its `len()` is also used.
        val_loader: Loader passed to `validate`.
        config: Object with an `epochs` attribute; `vars(config)` is sent to
            W&B as the run config.
        model: Module to train. It is moved to the module-level `device` and
            gets a `device` attribute set on it.
        criterion: Loss passed through to `train`.
        optimizer: Optimizer passed through to `train`.

    Returns:
        None.
    """
    if wandb_enabled:
        wandb.init(name=name, project="iml", config=vars(config))
        logger = wandb.log
    else:
        # Console fallback with the same (data, step) parameters.
        def logger(data, step):
            print(f"  Step {step}: {data}")

    model.device = device
    model.to(device)

    # Last positional argument of train(); presumably a logging interval
    # giving roughly five log calls per epoch — TODO confirm against
    # src.training.train.
    log_interval = len(train_loader) // 5 - 1

    patience = 0
    best_f1 = -1

    for epoch in range(config.epochs):
        print(f"Epoch {epoch+1}/{config.epochs}")

        train(model, train_loader, criterion, optimizer, epoch, logger, log_interval)
        metrics = validate(model, val_loader)
        print(metrics)

        if wandb_enabled:
            wandb.log({
                "validation/recall": metrics.recall,
                "validation/accuracy": metrics.accuracy,
                "validation/precision": metrics.precision,
                "validation/f1": metrics.f1,
                "epoch": epoch+1,
            })

        if metrics.f1 < best_f1:
            patience += 1
        else:
            patience = 0
            best_f1 = metrics.f1

        if patience >= PATIENCE_THRESHOLD:
            # Actually stop here. Previously the loop kept training and
            # re-saved the checkpoint on every later epoch in which the
            # counter was still at or above the threshold.
            print(f"Early stopping at epoch {epoch+1}: validation F1 below best for {patience} consecutive epochs")
            break

    # Single save point for both the early-stop and the ran-all-epochs case.
    # NOTE(review): this stores the weights at stop time, not those of the
    # best-F1 epoch — confirm that is what is wanted.
    model_path = f"./models/{name}.pth"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(model.state_dict(), model_path)

    if wandb_enabled:
        wandb.save(model_path)
        wandb.finish()
    


In [None]:
from torchvggish import vggish
from torchvggish import vggish_input
import librosa
import numpy as np

# Build VGGish with pretrained weights and put it in eval mode;
# extract_features() below calls this module-level `model` under
# torch.no_grad().
# NOTE(review): the final cell rebinds `model` to the EfficientNet model, so
# extract_features() would use that model if called after that cell.
model = vggish.VGGish(pretrained=True)
model.eval()

# VGGish specific
def preprocess_audio(file_path, target_sample_rate=16000):
    """
    Turn one audio file into a tensor of VGGish input examples.

    The file is loaded with librosa as mono audio at `target_sample_rate`,
    zero-padded at the end to at least one second, and converted with
    `vggish_input.waveform_to_examples`. The result is wrapped in a torch
    tensor with a size-1 axis inserted at position 1.
    """
    waveform, sample_rate = librosa.load(file_path, sr=target_sample_rate, mono=True)

    # Clips shorter than one second are too short for VGGish: right-pad
    # them with zeros up to exactly `target_sample_rate` samples.
    missing = target_sample_rate - len(waveform)
    if missing > 0:
        waveform = np.pad(waveform, (0, missing), mode='constant')

    examples = vggish_input.waveform_to_examples(waveform, sample_rate)
    return torch.tensor(examples).unsqueeze(1)

def extract_features(file_paths):
    """Run each file through `preprocess_audio` and the module-level `model`.

    Returns a list of ``(file_path, features)`` tuples in input order, where
    `features` is the model output converted to a NumPy array.
    """
    def _embed(path):
        print(f"Processing: {path}")
        examples = preprocess_audio(path)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            output = model(examples)
        return path, output.numpy()

    return [_embed(path) for path in file_paths]

In [None]:
# Imported in this cell because the only other import of `Dataset` sits in a
# later cell; without it, running the notebook top to bottom fails here with
# a NameError.
from torch.utils.data import Dataset


class SpeechDataset(Dataset):
    """Dataset view over an in-memory sequence of (spectrogram, label) pairs.

    Args:
        data: Indexable, sized collection whose items unpack into
            ``(spectrogram, label)``.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)``, with the label as a long tensor.

        The spectrogram is returned exactly as stored.
        """
        spectrogram, label = self.data[idx]
        return spectrogram, torch.tensor(label, dtype=torch.long)

In [None]:
# EfficientNetB0
from torchvision.models import efficientnet_b0
from torchvision.models import EfficientNet_B0_Weights

# Load EfficientNet-B0 with torchvision's default pretrained weights.
weights = EfficientNet_B0_Weights.DEFAULT
pretrained_model = efficientnet_b0(weights=weights)
# Preprocessing transforms bundled with these weights; applied to every
# image in SpectrogramVGG16Dataset.__getitem__.
pre_trans = weights.transforms()
# Prefix for the run name; the following cells append a suffix to build `name`.
name_base="EfficientNet_B0"

In [None]:
# Freeze base model (transfer learning)
# Presumably an alternative to the "Do not freeze model" cell that follows:
# run one or the other — verify intended usage.
pretrained_model.requires_grad_(False)
# Sanity check of the first parameter's requires_grad flag. Because this is
# not the last expression in the cell, the notebook does not display it.
next(iter(pretrained_model.parameters())).requires_grad
# NOTE(review): this cell freezes the backbone and is labelled "transfer
# learning" above, yet the run is named "_fine_tuning", while the
# "Do not freeze model" cell uses "_transfer_learning". The two suffixes look
# swapped — confirm before relying on run/checkpoint names.
name = name_base + "_fine_tuning"

In [None]:
# Do not freeze model
# This cell only sets the run name; it does not undo a previous
# requires_grad_(False) call if the freeze cell was run first.
# NOTE(review): an unfrozen backbone is normally called fine-tuning, but the
# suffix here is "_transfer_learning" (and the freeze cell uses
# "_fine_tuning"). The two suffixes look swapped — confirm.
name = name_base + "_transfer_learning"

In [None]:
# Number of output logits; SpectrogramVGG16Dataset produces labels 0 or 1.
N_CLASSES = 2

# Input width of the layer at index 1 of the existing classifier.
num_features = pretrained_model.classifier[1].in_features
# Replace the classifier with a new head:
# Linear(num_features, 256) -> ReLU -> Dropout(0.5) -> Linear(256, N_CLASSES).
# These layers are created after any requires_grad_(False) call above, so
# they are trainable even when the backbone was frozen.
pretrained_model.classifier = nn.Sequential(
    nn.Linear(num_features, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, N_CLASSES)
)
# Alias, not a copy: my_model and pretrained_model are the same object.
my_model = pretrained_model

In [None]:
from torch.utils.data import Dataset
from PIL import Image

class SpectrogramVGG16Dataset(Dataset):
    """Dataset of spectrogram PNG images with a binary access label.

    Every ``.png`` file directly inside `directory` is one sample. The label
    is derived from the file name: the text before the first underscore is
    taken as the speaker id, and the label is 1 when that id is in
    VALID_ACCESS_LABELS, otherwise 0.

    Args:
        directory: Folder that contains the PNG files.
        transform: Optional callable applied to the image after the
            module-level `pre_trans` preprocessing.
    """

    def __init__(self, directory, transform=None):
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable across runs and machines.
        self.files = [
            os.path.join(directory, f)
            for f in sorted(os.listdir(directory))
            if f.endswith(".png")
        ]
        self.transform = transform

    def __len__(self):
        """Return the number of PNG files found."""
        return len(self.files)

    def __getitem__(self, idx):
        """
        Retrieves an image and its label.

        Parameters:
            idx (int): Index of the image in the dataset.

        Returns:
            tuple: A tuple containing the transformed image and its label.
        """
        img_path = self.files[idx]
        # os.path.basename handles the platform's path separator; splitting
        # on "/" yields the wrong speaker id where os.path.join uses "\\".
        speaker_id = os.path.basename(img_path).split("_")[0]
        label = int(speaker_id in VALID_ACCESS_LABELS)

        # The context manager releases the file handle as soon as the pixel
        # data has been converted.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        # Model-specific preprocessing first, then the optional user transform.
        image = pre_trans(image)

        if self.transform:
            image = self.transform(image)

        return image, label

In [None]:
# From here on `model` refers to the EfficientNet model with the new head;
# this rebinds the name that earlier pointed at the VGGish model.
model = my_model
config = Config(batch_size=32, epochs=40, lr=0.001)
# Empty Compose: no extra transform is added on top of the `pre_trans`
# preprocessing that SpectrogramVGG16Dataset applies itself.
transform = transforms.Compose([])
train_loader, val_loader, test_loader = prepare_dataset_loaders(transform, config.batch_size, SpectrogramVGG16Dataset)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# `name` must have been set by the freeze cell or the "Do not freeze" cell.
do_train(name, train_loader, val_loader, config, model, criterion, optimizer)