In [None]:
! pip install "numpy<2.0"

In [None]:
! pip install --upgrade librosa
! pip install --upgrade soxr
! python.exe -m pip install --upgrade pip

In [54]:
import os
import numpy as np
import librosa
# # from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [55]:
def collect_file_paths_and_labels(dataset_path):
    file_paths = []
    labels = []
    emotion_folders = os.listdir(dataset_path)
    
    for emotion in emotion_folders:
        emotion_path = os.path.join(dataset_path, emotion)
        if os.path.isdir(emotion_path):
            for file_name in os.listdir(emotion_path):
                if file_name.endswith('.wav'):
                    file_paths.append(os.path.join(emotion_path, file_name))
                    labels.append(emotion)  # Название папки используется как метка

    return file_paths, labels

dataset_path = 'audios'
file_paths, labels = collect_file_paths_and_labels(dataset_path)

In [56]:
def find_max_len(file_paths):
    max_len = 0
    for file_path in file_paths:
        y, sr = librosa.load(file_path, sr=None)
        spect = librosa.feature.melspectrogram(y=y, sr=sr)
        if spect.shape[1] > max_len:
            max_len = spect.shape[1]
    return max_len

max_len = find_max_len(file_paths)

In [57]:
class AudioDataset(Dataset):
    def __init__(self, file_paths, labels, max_len, transform=None):
        self.file_paths = file_paths
        self.labels = labels
        self.max_len = max_len
        self.transform = transform
        
        self.label_idx = {label: idx for idx, label in enumerate(labels)}
        
        assert len(self.file_paths) == len(self.labels)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Загрузка аудио файла и вычисление спектрограммы
        y, sr = librosa.load(file_path, sr=None)
        spect = librosa.feature.melspectrogram(y=y, sr=sr)
        spect = librosa.power_to_db(spect, ref=np.max)
        spect = np.expand_dims(spect, axis=0)  # Добавление канала для CNN

        if self.max_len is not None:
            if spect.shape[-1] < self.max_len:
                pad_width = self.max_len - spect.shape[-1]
                spect = np.pad(spect, ((0, 0), (0, 0), (0, pad_width)), mode='constant')
            elif spect.shape[-1] > self.max_len:
                spect = spect[:, :, :self.max_len]

        if self.transform:
            spect = self.transform(spect)

        label_idx = self.label_idx[label]
        return torch.tensor(spect, dtype=torch.float32), torch.tensor(label_idx, dtype=torch.long)


def create_dataloader(file_paths, labels, batch_size, max_len=None,  transform=None):
    dataset = AudioDataset(file_paths, labels, max_len, transform)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

batch_size = 32
dataloader = create_dataloader(file_paths, labels, batch_size, max_len)

In [59]:
for batch in dataloader:
    spectrograms, batch_labels = batch
    print(spectrograms.shape, batch_labels)
    break

torch.Size([32, 1, 128, 284]) tensor([2399, 2399, 2399, 2399, 1599, 1599, 2799, 1999, 2799, 2399, 2799, 1599,
        1999, 2399, 1599, 2399,  799, 2399,  399,  399, 2399, 1199, 1599, 1999,
        2799, 2399, 2799, 2799, 1999,  799, 1199, 2399])


In [60]:
import torch.nn as nn
import torch.nn.functional as F

class EmotionRecognitionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionRecognitionModel, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Размерность данных после CNN
        cnn_output_size = 64 * 32
        
        self.rnn = nn.LSTM(input_size=cnn_output_size, hidden_size=128, num_layers=2, batch_first=True, bidirectional=True)

        self.fc = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = x.permute(0, 2, 3, 1)  # Перестановка размерностей для LSTM
        x = x.reshape(x.size(0), x.size(1), -1)  # Объединение признаков
        x, _ = self.rnn(x)
        x = x[:, -1, :]  # Используем только последний выход LSTM
        x = self.fc(x)
        return x

num_classes = 7
model = EmotionRecognitionModel(num_classes)

In [61]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for spects, labels in dataloader:
        spects = spects.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(spects)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * spects.size(0)
    
    epoch_loss = running_loss / len(dataloader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')


RuntimeError: input.size(-1) must be equal to input_size. Expected 2048, got 4544