# Подключение библиотеки

In [1]:
import os
from datasets import load_dataset, load_from_disk

In [2]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import Resample
from torch.nn import CTCLoss
from torch.optim import Adam
import torch.nn as nn

## Загрузка датасет

In [24]:
# Проверка, существует ли уже сохраненный датасет
if os.path.exists(saved_dataset_path):
    # Загружаем сохраненный датасет, если он уже существует
    cv_11_train = load_from_disk('/home/redalexdad/recognition_speech/common_voice_11/train/')
    cv_11_test = load_from_disk('/home/redalexdad/recognition_speech/common_voice_11/test/')
else:
    # Иначе, скачиваем и сохраняем датасет
    cv_11_train = load_dataset("mozilla-foundation/common_voice_11_0", "ru", split="train")
    cv_11_train.save_to_disk('/home/redalexdad/recognition_speech')

## Содержимое датасет

In [25]:
cv_11_train

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 22862
})

In [26]:
cv_11_test

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 9630
})

In [27]:
cv_11_train[0]

{'client_id': '918b5e9edfb0aed8d7a73f938e07749e53fdda9babf808efe059e1ff3843b15b6e2e979fd619296e611965601e8219dc9f17de9dd480a08d8141942748e6f0ab',
 'path': '/home/redalexdad/.cache/huggingface/datasets/downloads/extracted/d814cc3a56a5df3b5ccfa17b831afd6938306b9d17da77b602bb4b95387084b6/ru_train_0/common_voice_ru_26426765.mp3',
 'audio': {'path': 'common_voice_ru_26426765.mp3',
  'array': array([-1.06581410e-14,  8.34887715e-14,  8.08242362e-14, ...,
         -2.88323849e-06,  1.16737738e-07,  9.74517661e-07]),
  'sampling_rate': 48000},
 'sentence': 'Демократия неумолимо продвигается по Африке, и «арабская весна» была ее кульминацией.',
 'up_votes': 2,
 'down_votes': 1,
 'age': '',
 'gender': '',
 'accent': '',
 'locale': 'ru',
 'segment': ''}

## Создание класса датасета и обучение

In [10]:
# Создание класса датасета
class VoiceDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.resampler = Resample(orig_freq=48_000, new_freq=16_000)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        audio_path = self.dataset[idx]["path"]
        waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
        waveform = self.resampler(waveform)
        transcription = self.dataset[idx]["sentence"]
        return waveform, transcription

In [11]:
# Инициализация датасета и загрузчика данных
voice_dataset = VoiceDataset(cv_11_train)
dataloader = DataLoader(voice_dataset, batch_size=32, shuffle=True, num_workers=4)

In [12]:
# Пример определения простой модели для голосового распознавания
class SimpleSpeechRecognitionModel(nn.Module):
    def __init__(self, num_classes):
        super(SimpleSpeechRecognitionModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [13]:
# Указание количества классов
num_classes = 10  # Замените на фактическое количество классов в вашем датасете

# Инициализация модели
model = SimpleSpeechRecognitionModel(num_classes)

In [14]:
# Определение функции потерь и оптимизатора
criterion = CTCLoss(blank=0)
optimizer = Adam(model.parameters(), lr=0.001)

In [15]:
# Пример обучения модели на нескольких эпохах
num_epochs = 5
for epoch in range(num_epochs):
    for batch_idx, (waveform, transcription) in enumerate(dataloader):
        optimizer.zero_grad()
        output = model(waveform)
        # Преобразование текстов в тензоры для расчета функции потерь CTC
        target = torch.IntTensor([ord(char) for char in transcription])
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"Эпоха {epoch+1}/{num_epochs}, Шаг {batch_idx}, Потеря: {loss.item()}")

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/redalexdad/anaconda3/envs/dl_science/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/redalexdad/anaconda3/envs/dl_science/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/home/redalexdad/anaconda3/envs/dl_science/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/home/redalexdad/anaconda3/envs/dl_science/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 144, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/redalexdad/anaconda3/envs/dl_science/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 144, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/redalexdad/anaconda3/envs/dl_science/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 121, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/home/redalexdad/anaconda3/envs/dl_science/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 174, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [1, 43008] at entry 0 and [1, 77184] at entry 1


In [None]:
# Сохранение обученной модели
torch.save(model.state_dict(), "my_model_001.pth")