In [1]:
! pip install "numpy<2.0"



In [2]:
! pip install --upgrade librosa
! pip install --upgrade soxr
! python.exe -m pip install --upgrade pip



In [3]:
import os
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [4]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Синхронизация CUDA

In [5]:
def collect_file_paths_and_labels(dataset_path):
    file_paths = []
    labels = []
    emotion_folders = os.listdir(dataset_path)
    
    for emotion in emotion_folders:
        emotion_path = os.path.join(dataset_path, emotion)
        if os.path.isdir(emotion_path):
            for file_name in os.listdir(emotion_path):
                if file_name.endswith('.wav'):
                    file_paths.append(os.path.join(emotion_path, file_name))
                    labels.append(emotion)  # Название папки используется как метка

    return file_paths, labels

dataset_path = 'audios'
file_paths, labels = collect_file_paths_and_labels(dataset_path)

In [6]:
def find_max_len(file_paths):
    max_len = 0
    for file_path in file_paths:
        y, sr = librosa.load(file_path, sr=None)
        spect = librosa.feature.melspectrogram(y=y, sr=sr)
        if spect.shape[1] > max_len:
            max_len = spect.shape[1]
    return max_len

max_len = find_max_len(file_paths)
max_len

284

In [7]:
class AudioDataset(Dataset):
    def __init__(self, file_paths, labels, max_len, transform=None):
        self.file_paths = file_paths
        self.max_len = max_len
        self.transform = transform
        
        self.labels = [str(label) for label in labels]
        unique_labels = sorted(set(self.labels))
        self.label_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.indexed_labels = [self.label_idx[label] for label in self.labels]
        
        print("Original Labels:", self.labels)
        print("Label Index Map:", self.label_idx)
        print("Indexed Labels:", self.indexed_labels)
        
        print(len(self.file_paths), len(self.labels))
        assert len(self.file_paths) == len(self.labels)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Загрузка аудио файла и вычисление спектрограммы
        y, sr = librosa.load(file_path, sr=None)
        spect = librosa.feature.melspectrogram(y=y, sr=sr)
        spect = librosa.power_to_db(spect, ref=np.max)
        spect = np.expand_dims(spect, axis=0)  # Добавление канала для CNN

        if self.max_len is not None:
            if spect.shape[-1] < self.max_len:
                pad_width = self.max_len - spect.shape[-1]
                spect = np.pad(spect, ((0, 0), (0, 0), (0, pad_width)), mode='constant')
            elif spect.shape[-1] > self.max_len:
                spect = spect[:, :, :self.max_len]

        if self.transform:
            spect = self.transform(spect)

        label_idx = self.label_idx[label]
        return torch.tensor(spect, dtype=torch.float32), torch.tensor(label_idx, dtype=torch.long)


def create_dataloader(file_paths, labels, batch_size, max_len=None,  transform=None):
    dataset = AudioDataset(file_paths, labels, max_len, transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

batch_size = 32
dataloader = create_dataloader(file_paths, labels, batch_size, max_len)

Original Labels: ['angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', 'angry', '

In [8]:
for batch in dataloader:
    spectrograms, batch_labels = batch
    print(spectrograms.shape, batch_labels)
    break

torch.Size([32, 1, 128, 284]) tensor([1, 2, 1, 1, 2, 0, 3, 0, 1, 4, 5, 0, 6, 6, 6, 0, 4, 6, 4, 3, 3, 2, 0, 3,
        4, 1, 3, 0, 3, 6, 6, 0])


In [9]:
class EmotionRecognitionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionRecognitionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        
        # cnn_output_size = 64 * 32
        self.cnn_output_size = 64 * max_len // 4
        
        self.rnn = nn.LSTM(input_size=self.cnn_output_size, hidden_size=128, num_layers=2, batch_first=True, bidirectional=True)

        self.fc = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = x.permute(0, 2, 3, 1)
        x = x.reshape(x.size(0), x.size(1), -1)
        x, _ = self.rnn(x)
        x = x[:, -1, :]
        x = self.fc(x)
        return x

num_classes = 7
model = EmotionRecognitionModel(num_classes)

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device("cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.SGD(model.parameters(), lr=0.001)

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for spects, labels in dataloader:
        spects = spects.to(device)
        labels = labels.to(device)
        
        # print(f"Batch spects: {spects.shape}, labels: {labels.shape}")
        # print(f"Labels min: {labels.min()}, labels max: {labels.max()}")
        
        optimizer.zero_grad()
        
        outputs = model(spects)
        # print(f"Outputs: {outputs.shape}")
        # print(outputs)
        # print(f"Labels: {labels}")
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * spects.size(0)
    
    epoch_loss = running_loss / len(dataloader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')


Epoch 1/100, Loss: 1.9273
Epoch 2/100, Loss: 1.9261
Epoch 3/100, Loss: 1.9236
Epoch 4/100, Loss: 1.9203
Epoch 5/100, Loss: 1.9189
Epoch 6/100, Loss: 1.9178
Epoch 7/100, Loss: 1.9153
Epoch 8/100, Loss: 1.9124
Epoch 9/100, Loss: 1.9111
Epoch 10/100, Loss: 1.9083
Epoch 11/100, Loss: 1.9044
Epoch 12/100, Loss: 1.9027
Epoch 13/100, Loss: 1.9010
Epoch 14/100, Loss: 1.8964
Epoch 15/100, Loss: 1.8931
Epoch 16/100, Loss: 1.8885
Epoch 17/100, Loss: 1.8858
Epoch 18/100, Loss: 1.8813
Epoch 19/100, Loss: 1.8779
Epoch 20/100, Loss: 1.8729
Epoch 21/100, Loss: 1.8681
Epoch 22/100, Loss: 1.8628
Epoch 23/100, Loss: 1.8583
Epoch 24/100, Loss: 1.8515
Epoch 25/100, Loss: 1.8457
Epoch 26/100, Loss: 1.8403
Epoch 27/100, Loss: 1.8335
Epoch 28/100, Loss: 1.8270
Epoch 29/100, Loss: 1.8176
Epoch 30/100, Loss: 1.8130
Epoch 31/100, Loss: 1.8049
Epoch 32/100, Loss: 1.7945
Epoch 33/100, Loss: 1.7879
Epoch 34/100, Loss: 1.7776
Epoch 35/100, Loss: 1.7698
Epoch 36/100, Loss: 1.7590
Epoch 37/100, Loss: 1.7494
Epoch 38/1