In [1]:
import os
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np


In [2]:

# Root folder of the audio data set.
DATA_DIR = "ML_TACTIGON/customTSkin/data/audiodati"

# Target sampling rate (Hz) and clip length (seconds) for every waveform.
SAMPLE_RATE = 16_000
DURATION = 1.0

# Number of output classes and training hyper-parameters.
NUM_CLASSES = 4
BATCH_SIZE = 16
EPOCHS = 10

# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
import os

def count_wav_files(directory):
    """Count the ``.wav`` files directly inside each sub-folder of ``directory``.

    Args:
        directory: Path of the parent folder to inspect.

    Returns:
        dict: Maps the name of every immediate sub-folder to the number of
        entries in it whose name ends with ``.wav``. Entries of ``directory``
        that are not directories are ignored, and sub-folders are not
        searched recursively.
    """
    counts = {}

    for name in os.listdir(directory):
        path = os.path.join(directory, name)

        # Only sub-folders are reported; plain files at the top level are skipped.
        if not os.path.isdir(path):
            continue

        counts[name] = sum(1 for entry in os.listdir(path) if entry.endswith('.wav'))

    return counts

# Inspect the folder configured in DATA_DIR rather than repeating the path
# literal, so this report cannot drift from the data used for training.
directory = DATA_DIR
counts = count_wav_files(directory)

# Print the number of .wav files found in each sub-folder.
for subfolder, count in counts.items():
    print(f"{subfolder}: {count} file .wav")


down: 3917 file .wav
no: 3941 file .wav
up: 3723 file .wav
yes: 4043 file .wav


In [4]:

class AudioDataset(Dataset):
    """Fixed-length audio clips stored as ``<data_dir>/<class>/<file>.wav``.

    Every sub-directory of ``data_dir`` is one class; class indices follow the
    sorted order of the sub-directory names. Each item is a
    ``(waveform, label)`` pair: ``waveform`` is a single-channel tensor of
    shape ``(1, num_samples)`` at ``sample_rate`` Hz and ``label`` is the
    integer class index.

    Args:
        data_dir: Root folder containing one sub-directory per class.
        sample_rate: Target sampling rate in Hz.
        duration: Target clip length in seconds.
    """

    def __init__(self, data_dir, sample_rate, duration):
        self.data = []
        self.labels = []
        self.sample_rate = sample_rate
        self.duration = duration
        self.num_samples = int(sample_rate * duration)

        # Only directories are classes. A stray file next to the class folders
        # (e.g. .DS_Store) would otherwise shift the label indices and make
        # os.listdir() below fail with NotADirectoryError.
        self.classes = sorted(
            entry for entry in os.listdir(data_dir)
            if os.path.isdir(os.path.join(data_dir, entry))
        )
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

        # Resample transforms keyed by source rate, created on first use so
        # they are not rebuilt for every item.
        self._resamplers = {}

        for cls in self.classes:
            class_dir = os.path.join(data_dir, cls)
            # Sorted so the item order does not depend on the file system's
            # directory listing order.
            for file in sorted(os.listdir(class_dir)):
                if file.endswith(".wav"):
                    self.data.append(os.path.join(class_dir, file))
                    self.labels.append(self.class_to_idx[cls])

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load item ``idx`` as a ``(waveform, label)`` pair.

        The waveform is downmixed to one channel, resampled to
        ``self.sample_rate`` when the file uses another rate, then zero-padded
        at the end or cropped to exactly ``self.num_samples`` samples.
        """
        file_path = self.data[idx]
        label = self.labels[idx]
        waveform, sr = torchaudio.load(file_path)

        # Average the channels of multi-channel files so every item has the
        # same (1, T) layout and batches can be stacked.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resampling is only needed when the file's rate differs from the target.
        if sr != self.sample_rate:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sample_rate)
                self._resamplers[sr] = resampler
            waveform = resampler(waveform)

        missing = self.num_samples - waveform.shape[1]
        if missing > 0:
            waveform = torch.nn.functional.pad(waveform, (0, missing))
        else:
            waveform = waveform[:, :self.num_samples]

        return waveform, label


In [5]:
class AudioClassifier(nn.Module):
    """1-D convolutional classifier operating on raw single-channel waveforms.

    Three ``Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(2)`` stages are followed
    by a two-layer fully connected head with dropout. The network maps an
    input of shape ``(batch, 1, num_samples)`` to unnormalised class scores of
    shape ``(batch, num_classes)``.

    Args:
        num_classes: Number of output classes.
        num_samples: Length of the input waveforms in samples. When omitted it
            is derived from the module-level ``SAMPLE_RATE`` and ``DURATION``,
            so the head matches the clip length even if ``DURATION`` is not
            one second.

    Raises:
        ValueError: If ``num_samples`` is smaller than 8, which would leave no
            time steps after the three pooling stages.
    """

    def __init__(self, num_classes, num_samples=None):
        super().__init__()
        if num_samples is None:
            num_samples = int(SAMPLE_RATE * DURATION)
        if num_samples < 8:
            raise ValueError(f"num_samples must be at least 8, got {num_samples}")

        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(0.3)
        self.batch_norm1 = nn.BatchNorm1d(16)
        self.batch_norm2 = nn.BatchNorm1d(32)
        self.batch_norm3 = nn.BatchNorm1d(64)

        # The convolutions preserve the length (kernel 3, padding 1) and each
        # of the three poolings halves it with floor division, leaving
        # num_samples // 8 time steps over 64 channels.
        self.fc1 = nn.Linear(64 * (num_samples // 8), 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class scores for a batch ``x`` of shape ``(batch, 1, num_samples)``."""
        x = self.relu(self.batch_norm1(self.conv1(x)))
        x = self.pool(x)
        x = self.relu(self.batch_norm2(self.conv2(x)))
        x = self.pool(x)
        x = self.relu(self.batch_norm3(self.conv3(x)))
        x = self.pool(x)
        # Flatten everything except the batch dimension for the dense head.
        x = torch.flatten(x, 1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


In [6]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs, patience=3):
    """Train ``model`` with early stopping on the validation loss.

    After every epoch the model is evaluated on ``val_loader`` with
    ``evaluate_model``. Training stops early once the validation loss has
    failed to improve for ``patience`` consecutive epochs. Before returning,
    the parameters of the epoch with the lowest validation loss are restored.

    Args:
        model: Network to train; modified in place.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs tolerated
            before stopping.

    Returns:
        The same ``model`` object, holding the best parameters observed.
    """
    best_val_loss = float('inf')  # Lowest validation loss seen so far
    counter = 0  # Consecutive epochs without improvement
    best_model_state = None  # Snapshot of the best parameters

    for epoch in range(epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Validation phase
        val_loss, val_acc = evaluate_model(model, val_loader, criterion, device)

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {running_loss / len(train_loader):.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2%}")

        # Early stopping: check whether the validation loss improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            # state_dict() hands back references to the live parameter and
            # buffer tensors, which later optimizer steps overwrite in place.
            # Clone them so the snapshot really keeps this epoch's weights.
            best_model_state = {
                key: value.detach().clone() if torch.is_tensor(value) else value
                for key, value in model.state_dict().items()
            }
        else:
            counter += 1
            print(f"Early stopping counter: {counter}/{patience}")

        # Stop once the patience budget is exhausted
        if counter >= patience:
            print("Early stopping triggered")
            break

    # Restore the best parameters found during training
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model


def evaluate_model(model, dataloader, criterion, device):
    """Measure the loss and accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode and no gradients are tracked.

    Args:
        model: Network to evaluate.
        dataloader: Sized iterable of ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(loss, accuracy)`` where ``loss`` is the sum of the per-batch
        losses divided by ``len(dataloader)`` and ``accuracy`` is the fraction
        of samples whose highest-scoring class equals the label.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            scores = model(batch_inputs)
            loss_sum += criterion(scores, batch_labels).item()

            # Index of the largest score along the class dimension.
            predicted = scores.max(dim=1).indices
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

    accuracy = n_correct / n_seen
    mean_batch_loss = loss_sum / len(dataloader)
    return mean_batch_loss, accuracy


In [7]:

# Build the complete dataset from the configured data folder.
dataset = AudioDataset(data_dir=DATA_DIR, sample_rate=SAMPLE_RATE, duration=DURATION)

In [8]:

# Split the dataset into train / validation / test subsets (70% / 20% / 10%).
# The test subset takes the remainder so the three sizes always add up to
# len(dataset).
train_size = int(0.7 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Seed the split so that re-running the notebook draws the same permutation.
# Otherwise a reloaded checkpoint could be tested on samples it was trained on.
# (The resulting partition is only identical if the dataset lists its files
# in the same order on each run.)
SPLIT_SEED = 42
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


In [9]:
# Build one DataLoader per split; only the training data is reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
# Instantiate the network on the target device, with its loss and optimizer.
model = AudioClassifier(num_classes=NUM_CLASSES)
model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)


In [11]:
# Run training; as the cell's last expression, the returned model is displayed.
train_model(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    device=DEVICE,
    epochs=EPOCHS,
)

Epoch 1/10, Train Loss: 1.0168, Val Loss: 0.7220, Val Acc: 70.07%
Epoch 2/10, Train Loss: 0.7280, Val Loss: 0.6139, Val Acc: 74.84%
Epoch 3/10, Train Loss: 0.6012, Val Loss: 0.5406, Val Acc: 78.59%
Epoch 4/10, Train Loss: 0.5257, Val Loss: 0.5406, Val Acc: 78.46%
Early stopping counter: 1/3
Epoch 5/10, Train Loss: 0.4635, Val Loss: 0.5344, Val Acc: 78.27%
Epoch 6/10, Train Loss: 0.4143, Val Loss: 0.5035, Val Acc: 80.51%
Epoch 7/10, Train Loss: 0.3547, Val Loss: 0.4767, Val Acc: 82.46%
Epoch 8/10, Train Loss: 0.3186, Val Loss: 0.5027, Val Acc: 81.98%
Early stopping counter: 1/3
Epoch 9/10, Train Loss: 0.2986, Val Loss: 0.5181, Val Acc: 81.72%
Early stopping counter: 2/3
Epoch 10/10, Train Loss: 0.2754, Val Loss: 0.5401, Val Acc: 82.04%
Early stopping counter: 3/3
Early stopping triggered


AudioClassifier(
  (conv1): Conv1d(1, 16, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (batch_norm1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=128000, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=4, bias=True)
  (relu): ReLU()
)

In [12]:
def test_model(model, test_loader, criterion, device):
    """Evaluate ``model`` on the test set.

    The computation is exactly the one ``evaluate_model`` performs on the
    validation set, so this delegates to it instead of keeping a second copy
    of the loop that could drift out of sync.

    Args:
        model: Network to evaluate; it is put into evaluation mode.
        test_loader: Sized iterable of ``(inputs, labels)`` test batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        tuple: ``(loss, accuracy)`` as returned by ``evaluate_model``.
    """
    return evaluate_model(model, test_loader, criterion, device)

In [13]:
# Final evaluation on the held-out test split.
test_loss, test_accuracy = test_model(
    model=model,
    test_loader=test_loader,
    criterion=criterion,
    device=DEVICE,
)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2%}")

Test Loss: 0.6071, Test Accuracy: 80.88%


In [14]:
# Persist the model's parameters (its state dict) to disk.
torch.save(obj=model.state_dict(), f="model_state_dict.pth")

In [None]:
# Re-create the network; its structure must match the model that was saved.
model = AudioClassifier(NUM_CLASSES).to(DEVICE)

# Load the saved parameters. map_location remaps the stored tensors onto
# DEVICE, so a checkpoint written from a GPU can also be loaded on a
# CPU-only machine.
model.load_state_dict(torch.load("model_state_dict.pth", map_location=DEVICE))

# Switch the model to evaluation mode (for testing or predictions).
model.eval()
