In [64]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# import pandas as pd
# import os

# # Set the path to the dataset directory
# data_dir = '/kaggle/input/speech-to-text-russian/data/data/'

# # Load the training labels into a pandas DataFrame
# train_labels_path = os.path.join(data_dir, 'train_labels.csv')
# train_labels_df = pd.read_csv(train_labels_path)

# # Print the first few rows of the DataFrame to verify that it loaded correctly
# print(train_labels_df.head())

# # Set the path to the training audio files directory
# train_audio_dir = os.path.join(data_dir, 'train_wavs')

# # Loop over the audio files in the directory and print their filenames
# for filename in os.listdir(train_audio_dir):
#     print(filename)


In [2]:
!pip install torchaudio --quiet
!pip install pandas --quiet
!pip install torchaudio soundfile --quiet
!pip install transformers --quiet


[0m

In [None]:
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

# Load the dataset
DATA_DIR = '/kaggle/input/speech-to-text-russian/data/data/'
df = pd.read_csv(DATA_DIR + 'train_labels.csv')
df = df[:5000]
audio_files = [DATA_DIR + 'train_wavs/' + str(i) + '.wav' for i in df['Id']]
transcripts = df['Expected']

# Preprocess the data
waveform_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000)


def tokenizer(x):
    """Split a transcript into whitespace-separated word tokens."""
    return x.split()


# Index 0 is reserved for padding: collate_fn pads label sequences with 0, so
# no real word may share that index.
PAD_TOKEN = '<pad>'
_words = {word for transcript in transcripts for word in tokenizer(transcript)}
_words.discard(PAD_TOKEN)

# `vocab` stays a set (as before) and now also contains the padding token, so
# len(vocab) is still the number of distinct label indices.
vocab = _words | {PAD_TOKEN}

# Sort before enumerating: set iteration order for strings differs between
# interpreter runs, which would make the word indices (and any saved model
# built on them) irreproducible across kernel restarts.
word_to_idx = {PAD_TOKEN: 0}
word_to_idx.update({word: i for i, word in enumerate(sorted(_words), start=1)})
idx_to_word = {i: word for word, i in word_to_idx.items()}



class SpeechToTextDataset(Dataset):
    """Dataset pairing audio files with tokenized, index-encoded transcripts.

    Args:
        audio_files: Sequence of paths to the audio files.
        transcripts: Sequence of transcript strings aligned with ``audio_files``
            by position. Any iterable is accepted (list, pandas Series, ...).
        waveform_transforms: Callable applied to the loaded waveform tensor.
        tokenizer: Callable turning a transcript string into a list of tokens.
        word_to_idx: Mapping from token to integer index.

    Each item is a ``(transformed_waveform, token_indices)`` tuple where
    ``token_indices`` is a plain Python list of ints.
    """

    def __init__(self, audio_files, transcripts, waveform_transforms, tokenizer, word_to_idx):
        # Materialize both sequences as lists so __getitem__ always indexes by
        # POSITION. A sliced pandas Series keeps its original labels (e.g.
        # 800..999), so `series[0]` would raise KeyError.
        self.audio_files = list(audio_files)
        self.transcripts = list(transcripts)
        self.waveform_transforms = waveform_transforms
        self.tokenizer = tokenizer
        self.word_to_idx = word_to_idx

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load, transform and encode the sample at position ``idx``.

        Raises:
            KeyError: If the transcript contains a token missing from
                ``word_to_idx``.
        """
        # The file's sample rate is returned by the loader but not used here.
        waveform, _sample_rate = torchaudio.load(self.audio_files[idx])
        waveform = self.waveform_transforms(waveform)
        tokens = self.tokenizer(self.transcripts[idx])
        transcript = [self.word_to_idx[word] for word in tokens]
        return waveform, transcript

# Positional split of the loaded samples: the first TRAIN_END go to training,
# the next TEST_END - TRAIN_END to testing.
TRAIN_END = 800
TEST_END = 1000

# Transcript slices are converted to plain lists so the dataset indexes them by
# position; a sliced pandas Series keeps its original labels (800..999), which
# made `transcripts[0]` raise KeyError for the test split.
train_dataset = SpeechToTextDataset(
    audio_files[:TRAIN_END],
    list(transcripts[:TRAIN_END]),
    waveform_transforms,
    tokenizer,
    word_to_idx,
)
test_dataset = SpeechToTextDataset(
    audio_files[TRAIN_END:TEST_END],
    list(transcripts[TRAIN_END:TEST_END]),
    waveform_transforms,
    tokenizer,
    word_to_idx,
)


def collate_fn(batch):
    """Collate ``(spectrogram, token_indices)`` samples into padded batch tensors.

    Args:
        batch: List of ``(waveform, transcript)`` pairs. ``waveform`` is a 3-D
            tensor whose last axis (index 2) is the time axis; ``transcript``
            is a list of integer token indices.

    Returns:
        ``(waveforms, transcripts)`` where ``waveforms`` stacks the inputs after
        zero-padding their time axis to the longest sample, and ``transcripts``
        is a long tensor of shape ``(batch, max_length)`` with each label
        sequence zero-padded to that same ``max_length``. Samples are ordered
        from longest to shortest.
    """
    # Sort the batch by sequence length (longest first)
    batch = sorted(batch, key=lambda x: x[0].shape[2], reverse=True)
    max_length = batch[0][0].shape[2]

    padded_waveforms = []
    padded_transcripts = []
    for waveform, transcript in batch:
        # F.pad consumes the pad tuple starting from the LAST dimension, so a
        # single (left, right) pair pads the time axis. The previous
        # (0, 0, 0, n) tuple padded the second-to-last axis instead, which left
        # the time axis ragged and made torch.stack fail for unequal lengths.
        padded_waveforms.append(
            nn.functional.pad(waveform, (0, max_length - waveform.shape[2]), mode='constant', value=0)
        )
        # Explicit dtype: an empty transcript would otherwise yield a float tensor.
        tokens = torch.tensor(transcript, dtype=torch.long)
        # Labels are padded to the spectrogram frame count (not to the longest
        # transcript) to keep the (batch, max_length) label shape callers expect.
        padded_transcripts.append(
            nn.functional.pad(tokens, (0, max_length - len(transcript)), mode='constant', value=0)
        )

    # Stack the tensors into a batch
    return torch.stack(padded_waveforms), torch.stack(padded_transcripts)



# Both loaders share the batch size and collate function; only the training
# loader shuffles its samples.
_loader_options = dict(batch_size=1, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)



In [4]:
# Sanity check: fetch a single batch from the training loader and print it.
batch = next(iter(train_loader), None)
if batch is not None:
    print(batch)

(tensor([[[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [5.1327e-06, 1.4528e-07, 5.0579e-07,  ..., 7.1468e-09,
           5.0771e-09, 1.8826e-06],
          [2.7636e-05, 7.8223e-07, 2.7233e-06,  ..., 3.8481e-08,
           2.7336e-08, 1.0136e-05],
          ...,
          [6.7102e-04, 1.7031e-07, 9.9636e-08,  ..., 1.7734e-07,
           1.4525e-07, 7.8576e-07],
          [6.5696e-04, 2.3466e-07, 1.3278e-07,  ..., 1.4126e-07,
           1.0601e-07, 4.2714e-07],
          [6.3786e-04, 2.1794e-07, 1.7269e-07,  ..., 1.6624e-07,
           1.0927e-07, 1.9881e-07]]]]), tensor([[11023, 13056,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0, 

In [None]:
# Define the model
class SpeechToTextModel(nn.Module):
    """Convolutional front-end followed by an LSTM and a linear classifier.

    Args:
        num_classes: Number of output classes scored at every time step.
        input_size: Number of frequency bins (size of axis 2) of the input
            spectrogram. Together with ``num_channels`` it determines the
            feature width fed to the LSTM.
        hidden_size: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        num_channels: Number of channels produced by each convolution.

    Shapes:
        input:  ``(batch, 1, input_size, time)``
        output: ``(batch, reduced_time, num_classes)`` where ``reduced_time``
        is ``time`` after the two stride-2 convolutions along the time axis.
    """

    def __init__(self, num_classes, input_size=128, hidden_size=256, num_layers=3, num_channels=32):
        super(SpeechToTextModel, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, num_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.BatchNorm2d(num_channels),
            nn.ReLU(),
            nn.Conv2d(num_channels, num_channels, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.BatchNorm2d(num_channels),
            nn.ReLU(),
            # The last two convolutions use stride 2 on the final (time) axis only.
            nn.Conv2d(num_channels, num_channels, kernel_size=(3, 3), stride=(1, 2), padding=(1, 1)),
            nn.BatchNorm2d(num_channels),
            nn.ReLU(),
            nn.Conv2d(num_channels, num_channels, kernel_size=(3, 3), stride=(1, 2), padding=(1, 1)),
            nn.BatchNorm2d(num_channels),
            nn.ReLU(),
        )
        # forward() flattens channels and frequency bins into one feature
        # vector per time step, so the LSTM must accept num_channels * input_size
        # features. It was previously built with input_size=num_channels, which
        # made every forward pass fail with a feature-size mismatch.
        self.rnn = nn.LSTM(
            input_size=num_channels * input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return per-time-step class scores for a batch of spectrograms."""
        x = self.conv(x)
        batch_size, num_channels, num_bins, num_steps = x.size()
        # (batch, channels, bins, steps) -> (batch, steps, channels, bins)
        x = x.permute(0, 3, 1, 2)
        # Merge channels and bins into one feature axis per time step.
        x = x.reshape(batch_size, num_steps, num_channels * num_bins)
        x, _ = self.rnn(x)
        return self.fc(x)


# Build the model. Arguments are passed by keyword because the constructor's
# positional order is (num_classes, input_size, hidden_size, ...): the previous
# call SpeechToTextModel(128, 256, len(vocab)) set num_classes=128 and
# hidden_size=len(vocab). input_size keeps its default of 128.
# NOTE(review): that default is assumed to match the number of mel bins
# produced by MelSpectrogram — confirm against the transform's n_mels.
model = SpeechToTextModel(num_classes=len(vocab))

# Train the model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def _match_time_steps(labels, num_steps):
    """Zero-pad or truncate ``labels`` (batch, length) to ``num_steps`` columns.

    The model's stride-2 convolutions shorten the time axis, while collate_fn
    pads labels to the input frame count; the per-step loss needs both to agree.
    Truncation would drop real tokens only if a transcript had more tokens than
    the model has output steps.
    """
    current = labels.size(1)
    if current < num_steps:
        return nn.functional.pad(labels, (0, num_steps - current), mode='constant', value=0)
    return labels[:, :num_steps]


# NOTE(review): this is a per-time-step cross-entropy against zero-padded word
# indices, as in the original design. An alignment-free objective such as
# nn.CTCLoss is the usual choice for speech-to-text — consider switching.
for epoch in range(10):
    model.train()  # BatchNorm layers must use batch statistics while training
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        labels = _match_time_steps(labels, outputs.size(1))
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

# Evaluate the model
model.eval()  # switch BatchNorm to its running statistics
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        outputs = model(inputs)
        labels = _match_time_steps(labels, outputs.size(1))
        predicted = outputs.argmax(dim=2)
        # Every position is counted, including the zero padding.
        total += labels.numel()
        correct += (predicted == labels).sum().item()

if total:
    print('Accuracy of the network on the test data: %d %%' % (100 * correct / total))
else:
    print('No test samples were evaluated; accuracy is undefined.')
