In [27]:
import pandas as pd
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm


In [28]:
SAMPLE_RATE = 16000
N_MELS = 80
BATCH_SIZE = 8
EPOCHS = 12

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)


DEVICE: cuda


In [29]:
train_df = pd.read_csv(r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\train.csv")
val_df   = pd.read_csv(r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\val.csv")
test_df  = pd.read_csv(r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\test.csv")


In [30]:
def clean_text(df):
    df = df.dropna(subset=["path", "text"]).reset_index(drop=True)
    df["text"] = df["text"].astype(str).str.lower()
    df["text"] = df["text"].str.replace(r"[^а-яё\s]", "", regex=True)
    df["text"] = df["text"].str.replace(r"\s+", " ", regex=True)
    df["text"] = df["text"].str.strip()
    df = df[df["text"].str.len() > 0].reset_index(drop=True)
    return df

train_df = clean_text(train_df)
val_df   = clean_text(val_df)
test_df  = clean_text(test_df)


In [31]:
class Tokenizer:
    def __init__(self):
        self.chars = list(" абвгдеёжзийклмнопрстуфхцчшщъыьэюя")
        self.char2idx = {c: i for i, c in enumerate(self.chars)}
        self.idx2char = {i: c for i, c in enumerate(self.chars)}

    def encode(self, text):
        return [self.char2idx[c] for c in text]

    def decode(self, ids):
        return "".join([self.idx2char[i] for i in ids])

tokenizer = Tokenizer()
VOCAB_SIZE = len(tokenizer.chars)
print("VOCAB SIZE:", VOCAB_SIZE)


VOCAB SIZE: 34


In [None]:
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=N_MELS
)

db_transform = torchaudio.transforms.AmplitudeToDB()


def load_audio_to_mel(path):
    try:
        waveform, sr = torchaudio.load(path)
    except:
        waveform = torch.zeros(1, SAMPLE_RATE)
        sr = SAMPLE_RATE

    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)


    if sr != SAMPLE_RATE:
        waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)

    mel = mel_transform(waveform)     
    mel = db_transform(mel)         
    mel = mel.squeeze(0)             

    return mel



In [33]:
class SpeechDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.df.iloc[idx]["path"]
        text = self.df.iloc[idx]["text"]

        features = load_audio_to_mel(path)
        labels = torch.tensor(self.tokenizer.encode(text), dtype=torch.long)

        return features, labels


In [None]:
def collate_fn(batch):
    features, labels = zip(*batch)

    features = [f.transpose(0, 1) for f in features]

    features = nn.utils.rnn.pad_sequence(
        features,
        batch_first=True
    )  

    features = features.transpose(1, 2) 

    labels = nn.utils.rnn.pad_sequence(
        labels,
        batch_first=True,
        padding_value=-1
    )

    return features, labels


In [35]:
train_loader = DataLoader(
    SpeechDataset(train_df, tokenizer),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn
)

val_loader = DataLoader(
    SpeechDataset(val_df, tokenizer),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn
)


In [36]:
class SpeechRecognitionModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=(1,2), padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=(1,2), padding=1),
            nn.ReLU()
        )

        self.lstm = nn.LSTM(
            input_size=64 * 80,
            hidden_size=256,
            num_layers=3,
            bidirectional=True,
            batch_first=True
        )

        self.fc = nn.Linear(512, vocab_size)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.cnn(x)
        x = x.permute(0, 3, 1, 2)
        x = x.flatten(2)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x


In [37]:
model = SpeechRecognitionModel(VOCAB_SIZE).to(DEVICE)

ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,
    weight_decay=1e-4
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=2,
    factor=0.5,
    verbose=True
)




In [38]:
def train_one_epoch(model, loader):
    model.train()
    total_loss = 0.0

    progress = tqdm(loader)

    for features, labels in progress:
        features = features.to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()

        outputs = model(features)
        outputs = outputs.log_softmax(2)

        target_lengths = (labels != -1).sum(dim=1)
        input_lengths = torch.full(
            (outputs.size(0),),
            outputs.size(1),   # ⬅️ ВАЖНО
            dtype=torch.long,
            device=DEVICE
        )

        loss = ctc_loss(
            outputs.transpose(0,1),
            labels,
            input_lengths,
            target_lengths
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress.set_postfix(loss=loss.item())

    return total_loss / len(loader)


In [39]:
for epoch in range(EPOCHS):
    avg_loss = train_one_epoch(model, train_loader)

    print(f"Epoch {epoch+1}/{EPOCHS} | Avg loss: {avg_loss:.4f}")

    scheduler.step(avg_loss)

    torch.save(model.state_dict(), f"Model\full_stt_epoch{epoch+1}.pth")


100%|██████████| 10327/10327 [44:15<00:00,  3.89it/s, loss=2.46]  


Epoch 1/12 | Avg loss: 2.4756


RuntimeError: File Modelull_stt_epoch1.pth cannot be opened.

In [None]:
def greedy_decode(output):
    tokens = torch.argmax(output, dim=-1)
    tokens = tokens.unique_consecutive()
    tokens = tokens[tokens != 0]
    return tokenizer.decode(tokens.tolist())


In [None]:
model.eval()
with torch.no_grad():
    for features, labels in val_loader:
        features = features.to(DEVICE)

        outputs = model(features)

        # ====== распознанный текст ======
        pred_text = greedy_decode(outputs[0])
        print("Распознано:", pred_text)

        # ====== оригинальный текст ======
        true_tokens = labels[0]
        true_tokens = true_tokens[true_tokens != -1]  # убираем padding
        true_text = tokenizer.decode(true_tokens.tolist())
        print("Оригинал:  ", true_text)

        break  # показываем один пример


In [None]:
def wer(pred, ref):
    pred_words = pred.split()
    ref_words = ref.split()

    import numpy as np
    dp = np.zeros((len(ref_words)+1, len(pred_words)+1), dtype=int)

    for i in range(len(ref_words)+1):
        dp[i][0] = i
    for j in range(len(pred_words)+1):
        dp[0][j] = j

    for i in range(1, len(ref_words)+1):
        for j in range(1, len(pred_words)+1):
            if ref_words[i-1] == pred_words[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                dp[i][j] = 1 + min(
                    dp[i-1][j],     # delete
                    dp[i][j-1],     # insert
                    dp[i-1][j-1]    # replace
                )

    return dp[-1, -1] / max(1, len(ref_words))

print("WER:", wer(pred_text, true_text))


In [None]:
# Предположим, модель называется model
torch.save(model.state_dict(), r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Model\full_stt_model.pth")
torch.save(model, r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Model\full_stt_full_model.pth")
