In [98]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [99]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device:", device)


Device: cuda


In [None]:
class RussianCharTokenizer:
    """Character-level tokenizer over the lowercase Russian alphabet.

    Vocabulary layout (35 symbols):
      * index 0  -> "_" (used as the CTC blank elsewhere in this notebook)
      * index 1  -> " " (space)
      * index 2+ -> the 33 letters "а".."я", with "ё" placed after "е"
    """

    def __init__(self):
        alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
        self.chars = ["_", " ", *alphabet]
        self.char2idx = {symbol: position for position, symbol in enumerate(self.chars)}
        self.idx2char = dict(enumerate(self.chars))

    def encode(self, text):
        """Lowercase ``text`` and return the list of vocabulary indices.

        Characters that are not in the vocabulary (digits, punctuation,
        Latin letters, ...) are silently skipped.
        """
        ids = []
        for symbol in text.lower():
            position = self.char2idx.get(symbol)
            if position is not None:
                ids.append(position)
        return ids

    def decode(self, tokens):
        """Map an iterable of vocabulary indices back to a string.

        Raises:
            KeyError: if any index is not part of the vocabulary.
        """
        return "".join(map(self.idx2char.__getitem__, tokens))

In [None]:
# --- Audio / feature-extraction configuration --------------------------------
SAMPLE_RATE = 16000                 # target sampling rate passed to the mel transform
MAX_AUDIO_LEN = 10 * SAMPLE_RATE    # maximum clip length in samples (10 s at SAMPLE_RATE)
EPOCHS = 5

# Spectrogram geometry. These used to be inline literals; N_MELS in particular
# is referenced by the model definition, so it must exist as a named constant.
N_FFT = 400         # FFT window: 400 samples = 25 ms at 16 kHz
HOP_LENGTH = 160    # hop between frames: 160 samples = 10 ms at 16 kHz
N_MELS = 80         # number of mel filterbank channels

mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS
)

# Converts the mel spectrogram values to a decibel scale.
amplitude_to_db = torchaudio.transforms.AmplitudeToDB()


In [None]:
def load_audio_to_mel(path):
    """Read an audio file and turn it into a mel spectrogram on a dB scale.

    Processing steps:
      1. load the file with ``torchaudio.load``;
      2. average the channels into one when there is more than one;
      3. resample to ``SAMPLE_RATE`` when the file uses a different rate;
      4. keep at most the first ``MAX_AUDIO_LEN`` samples;
      5. apply ``mel_transform`` followed by ``amplitude_to_db``.

    Args:
        path: location of the audio file, forwarded to ``torchaudio.load``.

    Returns:
        The dB-scaled mel spectrogram tensor
        (presumably shaped ``(n_mels, frames)`` -- TODO confirm).
    """
    signal, source_rate = torchaudio.load(path)

    # Down-mix multi-channel recordings to a single channel.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    if source_rate != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(source_rate, SAMPLE_RATE)
        signal = resampler(signal)

    # Drop the channel axis, then truncate; slicing is a no-op for short clips.
    signal = signal.squeeze(0)[:MAX_AUDIO_LEN]

    return amplitude_to_db(mel_transform(signal))


In [2]:
class SpeechDataset(Dataset):
    """Dataset of (mel features, token ids) pairs described by a CSV manifest.

    The CSV must provide a ``path`` column (audio file location) and a
    ``text`` column (transcript).

    Args:
        csv_path: path of the CSV manifest, read with ``pandas.read_csv``.
        tokenizer: object exposing ``encode(text) -> list[int]``.
    """

    def __init__(self, csv_path, tokenizer):
        self.tokenizer = tokenizer
        self.df = pd.read_csv(csv_path)

    def __len__(self):
        """Number of rows in the manifest."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for manifest row ``idx``.

        ``features`` is whatever ``load_audio_to_mel`` returns for the row's
        audio file; ``labels`` is a 1-D ``torch.long`` tensor of token ids.
        """
        row = self.df.iloc[idx]
        path, text = row["path"], row["text"]

        features = load_audio_to_mel(path)
        token_ids = self.tokenizer.encode(text)

        return features, torch.tensor(token_ids, dtype=torch.long)


NameError: name 'Dataset' is not defined

In [None]:
def collate_fn(batch):
    """Merge a list of ``(features, labels)`` samples into padded batch tensors.

    Each feature tensor is 2-D with its variable-length axis last. Features
    are zero-padded along that axis; label sequences are padded with ``-1``.

    Args:
        batch: sequence of ``(features, labels)`` pairs.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(batch, dim0, max_len)`` and ``labels`` has shape
        ``(batch, max_label_len)``.
    """
    mels, targets = zip(*batch)

    # pad_sequence pads along the first axis, so put the variable axis first.
    length_first = [mel.transpose(0, 1) for mel in mels]
    padded = nn.utils.rnn.pad_sequence(length_first, batch_first=True)

    # Restore the original axis order: (batch, dim0, max_len).
    features = padded.transpose(1, 2)

    # -1 is not a valid token id, so padding can be masked out later.
    labels = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=-1)

    return features, labels


In [None]:
class SpeechRecognitionModel(nn.Module):
    """CNN + bidirectional LSTM model producing per-frame character logits.

    Pipeline: three Conv2d/BatchNorm/ReLU stages -> 3-layer bidirectional
    LSTM -> two-layer classifier head.

    Args:
        vocab_size: number of output classes (size of the last dimension of
            the returned logits).
        n_mels: number of mel bins in the input features (size of dim 1 of
            the input). Defaults to 80, the value used by the notebook's
            mel transform.
    """

    def __init__(self, vocab_size, n_mels=80):
        super().__init__()

        # Input to the CNN is (batch, 1, n_mels, time). Every conv uses
        # stride (1, 2): stride 1 along the mel axis and stride 2 along time.
        # With kernel 3 / padding 1 the mel axis therefore keeps its size,
        # while each stage maps time T -> floor((T - 1) / 2) + 1.
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=(1,2), padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Conv2d(32, 64, kernel_size=3, stride=(1,2), padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Conv2d(64, 128, kernel_size=3, stride=(1,2), padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
        )

        # forward() flattens (channels, mel) into one feature axis. The mel
        # axis is NOT downsampled by the CNN, so each time step carries
        # 128 * n_mels features (not 128 * (n_mels // 8)).
        self.lstm = nn.LSTM(
            input_size=128 * n_mels,
            hidden_size=256,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
            dropout=0.2
        )

        # 512 = 2 directions * hidden_size 256.
        self.fc = nn.Sequential(
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, vocab_size)
        )

    def forward(self, x):
        """Compute logits for a batch of spectrograms.

        Args:
            x: tensor of shape ``(batch, n_mels, time)``.

        Returns:
            Logits of shape ``(batch, time', vocab_size)`` where ``time'`` is
            the time length after the three stride-2 convolutions.
        """
        x = x.unsqueeze(1)             # (batch, 1, n_mels, time)
        x = self.cnn(x)                # (batch, 128, n_mels, time')
        x = x.permute(0, 3, 1, 2)      # (batch, time', 128, n_mels)
        x = x.flatten(2)               # (batch, time', 128 * n_mels)

        x, _ = self.lstm(x)            # (batch, time', 512)
        x = self.fc(x)                 # (batch, time', vocab_size)
        return x


In [None]:
# Directory that holds the CSV manifests; defined once instead of being
# repeated in every path literal.
DATASET_DIR = r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset"
BATCH_SIZE = 8

tokenizer = RussianCharTokenizer()

dataset = SpeechDataset(DATASET_DIR + r"\final_dataset.csv", tokenizer)
train_dataset = SpeechDataset(DATASET_DIR + r"\train.csv", tokenizer)
test_dataset = SpeechDataset(DATASET_DIR + r"\test.csv", tokenizer)
val_dataset = SpeechDataset(DATASET_DIR + r"\val.csv", tokenizer)

# Training data is reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)

# Evaluation loaders keep a fixed order so results are repeatable between runs.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)

# Loader over the complete manifest (final_dataset.csv).
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)

# One output class per tokenizer symbol; index 0 ("_") doubles as the CTC blank.
model = SpeechRecognitionModel(
    vocab_size=len(tokenizer.chars)
).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
# zero_infinity=True zeroes out infinite losses (and their gradients), which
# occur when a target cannot be aligned to its input length.
criterion = nn.CTCLoss(blank=0, zero_infinity=True)


In [None]:
# Use the constant from the configuration cell instead of a second literal.
epochs = EPOCHS

# NOTE(review): this loop iterates `dataloader`, which is built from
# final_dataset.csv, while `train_loader` / `val_loader` are defined but never
# used. Confirm that final_dataset.csv does not overlap the test split.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

    for features, labels in progress:
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # The model returns (batch, time, vocab); CTCLoss expects
        # log-probabilities laid out as (time, batch, vocab).
        outputs = model(features).log_softmax(2).permute(1, 0, 2)

        # NOTE(review): every sample is assigned the full padded time length,
        # because collate_fn does not return the unpadded lengths.
        input_lengths = torch.full(
            size=(outputs.size(1),),
            fill_value=outputs.size(0),
            dtype=torch.long,
            device=device
        )

        # Labels are padded with -1 by collate_fn. A single boolean mask gives
        # both the per-sample lengths and the row-by-row concatenated targets,
        # replacing two per-sample Python loops over device tensors.
        valid = labels != -1
        target_lengths = valid.sum(dim=1)
        targets = labels[valid]

        loss = criterion(
            outputs,
            targets,
            input_lengths,
            target_lengths
        )

        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call copies the value to the host.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress.set_postfix(loss=f"{batch_loss:.4f}")

    print(f"Epoch {epoch+1} done | Avg loss: {total_loss / len(dataloader):.4f}")


Epoch 1/5: 100%|██████████| 12909/12909 [23:32<00:00,  9.14it/s, loss=2.9332]


Epoch 1 done | Avg loss: 3.1991


Epoch 2/5: 100%|██████████| 12909/12909 [18:46<00:00, 11.46it/s, loss=2.9921]


Epoch 2 done | Avg loss: 3.0993


Epoch 3/5: 100%|██████████| 12909/12909 [23:42<00:00,  9.08it/s, loss=2.9951]


Epoch 3 done | Avg loss: 3.0951


Epoch 4/5: 100%|██████████| 12909/12909 [25:20<00:00,  8.49it/s, loss=3.0294]


Epoch 4 done | Avg loss: 3.0916


Epoch 5/5: 100%|██████████| 12909/12909 [18:48<00:00, 11.44it/s, loss=3.4419]

Epoch 5 done | Avg loss: 3.0871





In [111]:
# Put the model into evaluation mode before decoding (affects the Dropout and
# BatchNorm layers). As the last expression of the cell, the module repr is
# also displayed.
# NOTE(review): the repr captured under this cell (two conv layers,
# LSTM(5120, 128, num_layers=2)) does not match the SpeechRecognitionModel
# definition in this notebook -- it looks like output from an earlier version;
# re-run the notebook from a fresh kernel to refresh it.
model.eval()

SpeechRecognitionModel(
  (cnn): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 2), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 2), padding=(1, 1))
    (3): ReLU()
  )
  (lstm): LSTM(5120, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=35, bias=True)
)

In [112]:
def greedy_decode(output, tokenizer):
    """Greedy (best-path) CTC decoding of one utterance.

    Takes the highest-scoring class per frame, merges runs of identical
    consecutive ids, removes id 0 (the blank), and converts the remaining ids
    to text.

    Args:
        output: per-frame scores with the class axis last; intended for a
            single utterance, i.e. shape ``(frames, vocab)``.
        tokenizer: object exposing ``decode(list[int]) -> str``.

    Returns:
        The decoded string.
    """
    best_path = output.argmax(dim=-1)
    collapsed = torch.unique_consecutive(best_path)
    non_blank = collapsed[collapsed.ne(0)]
    return tokenizer.decode(non_blank.tolist())


In [1]:
# Decode the first utterance of one test batch and compare it with its
# reference transcript.
features, labels = next(iter(test_loader))
features = features.to(device)

with torch.no_grad():
    outputs = model(features).log_softmax(2)

# Remove the -1 padding from the first label row before turning it into text.
reference_ids = labels[0]
reference_ids = reference_ids[reference_ids != -1]

pred = greedy_decode(outputs[0].cpu(), tokenizer)
true = tokenizer.decode(reference_ids.tolist())

print("GT :", true)
print("PRED:", pred)

NameError: name 'test_loader' is not defined

In [115]:
# Recognise the first training sample: add a batch axis, run the model without
# gradient tracking, then greedy-decode the only item in the batch.
features, _ = train_dataset[0]
batch = features.unsqueeze(0).to(device)

with torch.no_grad():
    output = model(batch)

text = greedy_decode(output[0], tokenizer)

print("Распознано:", text)


Распознано: о рн


In [None]:
# Save only the learned parameters (state_dict). Reloading requires building a
# SpeechRecognitionModel first and then calling load_state_dict on it.
torch.save(model.state_dict(), r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Model\stt_model.pth")
# Also save the whole module object.
# NOTE(review): whole-module saving relies on pickle, so loading needs the
# SpeechRecognitionModel class to be importable under the same name; since the
# class is defined inside this notebook, that checkpoint may not load
# elsewhere. Prefer the state_dict file above -- confirm whether this second
# file is actually needed.
torch.save(model, r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Model\stt_full_model.pth")
