In [1]:
import os
import torch
import torchaudio
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pyaudio

In [2]:
# Experiment configuration: every value a reader might tune lives in this cell.
DATA_DIR = "ML_TACTIGON/customTSkin/data/audiodati"  # dataset root passed to AudioDataset
SAMPLE_RATE = 16000     # target sample rate in Hz
DURATION = 1.0          # clip length in seconds (SAMPLE_RATE * DURATION samples per clip)
NUM_CLASSES = 4         # width of the classifier's output layer
BATCH_SIZE = 16
EPOCHS = 10
SEED = 42               # fixed so shuffling and weight initialisation repeat between runs
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch's global RNG before any model or DataLoader is created; without
# this every Restart & Run All produces a different loss curve.
torch.manual_seed(SEED)


In [3]:
class AudioDataset(Dataset):
    """Folder-per-class dataset of fixed-length, single-channel audio clips.

    ``data_dir`` is scanned once at construction time: every visible
    sub-directory is a class (sorted by name, index = label) and every
    ``.wav`` file inside it is one sample.

    Args:
        data_dir: Root folder containing one sub-folder per class.
        sample_rate: Target sample rate in Hz; clips recorded at another
            rate are resampled to it.
        duration: Clip length in seconds. Each clip is zero-padded or
            truncated to ``int(sample_rate * duration)`` samples.
    """

    def __init__(self, data_dir, sample_rate, duration):
        self.data = []
        self.labels = []
        self.sample_rate = sample_rate
        self.duration = duration
        self.num_samples = int(sample_rate * duration)

        # Only real, non-hidden directories are classes. Stray files would
        # crash the inner os.listdir, and hidden folders such as
        # .ipynb_checkpoints would silently shift every label index.
        self.classes = sorted(
            entry
            for entry in os.listdir(data_dir)
            if not entry.startswith(".")
            and os.path.isdir(os.path.join(data_dir, entry))
        )
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

        # One Resample module per source rate, built on first use, so the
        # resampling kernel is not recomputed for every clip.
        self._resamplers = {}

        for cls in self.classes:
            class_dir = os.path.join(data_dir, cls)
            # Sorted so sample order does not depend on the filesystem.
            for file in sorted(os.listdir(class_dir)):
                # Case-insensitive so ".WAV" recordings are not dropped.
                if file.lower().endswith(".wav"):
                    self.data.append(os.path.join(class_dir, file))
                    self.labels.append(self.class_to_idx[cls])

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(waveform, label)``.

        ``waveform`` has shape ``(1, num_samples)``; ``label`` is the integer
        class index.
        """
        file_path = self.data[idx]
        label = self.labels[idx]
        waveform, sr = torchaudio.load(file_path)
        return self._prepare_waveform(waveform, sr), label

    def _prepare_waveform(self, waveform, sr):
        """Convert a ``(channels, samples)`` tensor to ``(1, num_samples)``."""
        # Down-mix multi-channel audio: the model's first conv takes exactly
        # one input channel, and batching needs identical shapes.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sr != self.sample_rate:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sr, new_freq=self.sample_rate
                )
                self._resamplers[sr] = resampler
            waveform = resampler(waveform)

        # Right-pad short clips with zeros, truncate long ones.
        missing = self.num_samples - waveform.shape[1]
        if missing > 0:
            waveform = torch.nn.functional.pad(waveform, (0, missing))
        else:
            waveform = waveform[:, :self.num_samples]

        return waveform


In [4]:
class AudioClassifier(nn.Module):
    """Small 1-D CNN that maps a single-channel waveform to class logits.

    Two ``Conv1d -> ReLU -> MaxPool1d(2)`` stages are followed by two fully
    connected layers. The convolutions use ``kernel_size=3, padding=1`` and
    so keep the time axis length; each pooling step halves it.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(2)

        # fc1's input width is 32 * (pooled clip length), which depends on the
        # clip length, so it is inferred lazily on the first forward pass.
        self.fc1 = nn.LazyLinear(128)
        # fc2 always receives fc1's 128 features, so it is a regular Linear
        # whose weights exist as soon as the model is constructed.
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            x: Tensor of shape ``(batch, 1, samples)``.
        """
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        # Collapse (channels, time) into one feature axis per sample.
        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [6]:
def train_model(model, dataloader, criterion, optimizer, device, epochs):
    """Train ``model`` in place and report the mean batch loss per epoch.

    Args:
        model: Module to optimise; switched to training mode.
        dataloader: Iterable yielding ``(inputs, labels)`` batches. It is
            iterated once per epoch and does not need to define ``__len__``.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: Optimiser bound to ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over ``dataloader``.

    Returns:
        List with one float per epoch: the mean of that epoch's batch losses.

    Raises:
        ValueError: If an epoch produces no batches.
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        # Count batches as they are consumed rather than calling
        # len(dataloader), which fails on loaders without __len__.
        num_batches = 0
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        epoch_loss = running_loss / num_batches
        epoch_losses.append(epoch_loss)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss}")

    return epoch_losses


In [7]:
dataset = AudioDataset(DATA_DIR, SAMPLE_RATE, DURATION)

# Fail here with a readable message instead of deep inside the DataLoader or
# the loss function.
if len(dataset) == 0:
    raise ValueError(f"No .wav files found under {DATA_DIR!r}")
if max(dataset.labels) >= NUM_CLASSES:
    raise ValueError(
        f"Found label index {max(dataset.labels)} but NUM_CLASSES is {NUM_CLASSES}; "
        f"classes discovered: {dataset.classes}"
    )

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [8]:
model = AudioClassifier(NUM_CLASSES).to(DEVICE)

# The classifier contains lazily-shaped layers. Run one dummy clip through it
# so every parameter has its real shape before the optimizer is built.
with torch.no_grad():
    model(torch.zeros(1, 1, int(SAMPLE_RATE * DURATION), device=DEVICE))

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [24]:
# Pull one batch and check that it has the layout the model expects:
# (batch, 1 channel, SAMPLE_RATE * DURATION samples) with one label per clip.
sample_inputs, sample_labels = next(iter(dataloader))
assert sample_inputs.shape[1:] == (1, int(SAMPLE_RATE * DURATION)), sample_inputs.shape
assert sample_labels.shape[0] == sample_inputs.shape[0], sample_labels.shape

In [9]:
# Launch training with the objects configured in the cells above.
# The trailing semicolon keeps the cell from echoing a return value.
train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    device=DEVICE,
    epochs=EPOCHS,
);

Epoch 1/10, Loss: 1.0206618001084
Epoch 2/10, Loss: 0.5384140662017718


KeyboardInterrupt: 