In [18]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tqdm.notebook as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torch_data
import torchaudio
import urllib
import soundfile as sf
from torch.utils.data import Dataset

from IPython.display import clear_output

%matplotlib inline

Создаем класс, в котором будут храниться сырые данные. При вызове __getitem__ происходит кодирование сырых данных в MFCC

In [19]:
class CustomRawDataset(Dataset):
    def __init__(self, folder_path, transform=None):
        self.folder_path = folder_path
        self.transform = transform
        self.files = []
        self.labels = []

        for file_name in os.listdir(folder_path):
            if file_name.endswith('.wav'):
                class_label = int(file_name[0])  
                self.files.append(os.path.join(folder_path, file_name))
                self.labels.append(class_label)


    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        file_path = self.files[idx]
        label = self.labels[idx]

        waveform, sample_rate = sf.read(file_path)
        waveform = torch.tensor(waveform).float()
        mfcc = torchaudio.transforms.MFCC(
            sample_rate=sample_rate,
            n_mfcc=13
        )(waveform)


        if self.transform:
            mfcc = self.transform(mfcc)
            spectrogram = self.transform(spectrogram)

        return mfcc, label



Данные можно скачать здесь: https://www.kaggle.com/datasets/joserzapata/free-spoken-digit-dataset-fsdd

In [20]:
folder_path = r'C:\Users\Александр\Downloads\free-spoken-digit-dataset-v1.0.8\Jakobovski-free-spoken-digit-dataset-e9e1155\recordings'
dataset = CustomRawDataset(folder_path, transform=None)
ds_x = []
ds_y = []

In [21]:
dataset[1210][0].shape

torch.Size([13, 17])

In [22]:
for i in range(len(dataset)):
    file, label = dataset[i]
    ds_x.append(file)
    ds_y.append(label)

Подготовка данных

In [23]:
import torch

def pad_2d(tensors):
    max_cols = max(tensor.shape[1] for tensor in tensors)
    padded_tensors = []
    for tensor in tensors:
        pad_width = max_cols - tensor.shape[1]
        padding = (0, pad_width) 
        padded_tensor = torch.nn.functional.pad(tensor, padding, mode='constant', value=0)
        padded_tensors.append(padded_tensor)
    result = torch.stack(padded_tensors)
    return result


In [24]:
ds_x = pad_2d(ds_x)
ds_y = torch.tensor(ds_y)

In [25]:
ds_x[3].shape

torch.Size([13, 92])

In [26]:
ds_x.shape, ds_y.shape

(torch.Size([1500, 13, 92]), torch.Size([1500]))

Создаем модель

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN1(nn.Module):
    def __init__(self):
        super(CNN1, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=2, out_channels=32, kernel_size=2)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 48, kernel_size=2)
        self.bn2 = nn.BatchNorm2d(48)
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.dropout1 = nn.Dropout(0.25)
        self.flatten = nn.Flatten()
        # Размер входных данных для первого Dense слоя нужно определить после сверток и пуллинга
        # Предположим, что входные размеры: (2, 13, 91)
        # После первого Conv2d: (2, 12, 90)
        # После второго Conv2d: (48, 11, 89)
        # После MaxPool2d: (48, 5, 45)
        # Размер для Flatten: 48 * 5 * 45 = 10800
        self.fc1 = nn.Linear(10800, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.25)
        self.fc2 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.25)
        self.fc3 = nn.Linear(64, 10)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = self.dropout1(x)
        x = self.flatten(x)
        x = F.relu(self.bn3(self.fc1(x)))
        x = self.dropout2(x)
        x = F.relu(self.bn4(self.fc2(x)))
        x = self.dropout3(x)
        x = self.fc3(x)
        return F.softmax(x, dim=1)

model = CNN1()


In [28]:
from sklearn.model_selection import train_test_split

ds_x_train, ds_x_test, ds_y_train, ds_y_test = train_test_split(ds_x, ds_y, test_size=0.2, random_state=42)

print("Размер обучающей выборки:", ds_x_train.shape, ds_y_train.shape)
print("Размер тестовой выборки:", ds_x_test.shape, ds_y_test.shape)


Размер обучающей выборки: torch.Size([1200, 13, 92]) torch.Size([1200])
Размер тестовой выборки: torch.Size([300, 13, 92]) torch.Size([300])


In [29]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long) 
        if self.x.ndim == 3:  
            self.x = self.x.unsqueeze(1)  
            self.x = torch.cat([self.x, self.x], dim=1) 
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

train_dataset = CustomDataset(ds_x_train, ds_y_train)
test_dataset = CustomDataset(ds_x_test, ds_y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



  self.x = torch.tensor(x, dtype=torch.float32)
  self.y = torch.tensor(y, dtype=torch.long)


Обучение

In [30]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

num_epochs = 15

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs, labels
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
    epoch_loss = running_loss / len(train_loader)
    epoch_acc = 100.0 * correct / total
    
    model.eval()
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            test_correct += (predicted == labels).sum().item()
            test_total += labels.size(0)
    test_acc = 100.0 * test_correct / test_total
    
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {epoch_loss:.4f}, Train Accuracy: {epoch_acc:.2f}%, Test Accuracy: {test_acc:.2f}%")


Epoch [1/15] - Loss: 2.1392, Train Accuracy: 53.00%, Test Accuracy: 72.00%
Epoch [2/15] - Loss: 1.8581, Train Accuracy: 87.08%, Test Accuracy: 95.33%
Epoch [3/15] - Loss: 1.6812, Train Accuracy: 93.83%, Test Accuracy: 96.00%
Epoch [4/15] - Loss: 1.5956, Train Accuracy: 96.25%, Test Accuracy: 96.00%
Epoch [5/15] - Loss: 1.5636, Train Accuracy: 96.67%, Test Accuracy: 98.00%
Epoch [6/15] - Loss: 1.5345, Train Accuracy: 97.33%, Test Accuracy: 98.00%
Epoch [7/15] - Loss: 1.5187, Train Accuracy: 97.75%, Test Accuracy: 97.33%
Epoch [8/15] - Loss: 1.5054, Train Accuracy: 98.50%, Test Accuracy: 98.33%
Epoch [9/15] - Loss: 1.4986, Train Accuracy: 98.67%, Test Accuracy: 96.33%
Epoch [10/15] - Loss: 1.4919, Train Accuracy: 98.83%, Test Accuracy: 97.67%
Epoch [11/15] - Loss: 1.4888, Train Accuracy: 98.92%, Test Accuracy: 97.00%
Epoch [12/15] - Loss: 1.4907, Train Accuracy: 99.33%, Test Accuracy: 97.33%
Epoch [13/15] - Loss: 1.4859, Train Accuracy: 99.00%, Test Accuracy: 98.33%
Epoch [14/15] - Loss: