In [78]:
from __future__ import unicode_literals, print_function, division

from typing import List

import pandas as pd

import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from io import open
import unicodedata
import re
import random

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, Dataset
from torch.nn.utils.rnn import pad_sequence

from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [79]:
dataset = pd.read_csv('data/data_tokenize.csv')
pairs = list(dataset[["title", "text"]].itertuples(index=False, name=None))
train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=42)

In [80]:
sos_token = 0
eos_token = 1
MAX_VOCAB_SIZE = 30_000

MAX_INPUT_LEN = 300
MAX_TARGET_LEN = 30


# Работа с данными

## Словарь частот

In [81]:
class Vocab:
    """Создаёт словари с частотами слов на основе входных данных"""

    def __init__(self, name):
        self.name = name
        self.word2index = {"<pad>": 0, "<unk>": 1, "sos": 2, "eos": 3}
        self.word2count = {"<pad>": 0, "<unk>": 0, "sos": 0, "eos": 0}
        self.index2word = {0: "<pad>", 1: "<unk>", 2: "sos", 3: "eos"}
        self.n_words = 4

        self._temp_word_counts = {}

    def addText(self, text: str):
        """Для каждого слова в тексте добавляет его во временный счётчик"""
        for word in text.split():
            self._temp_word_counts[word] = self._temp_word_counts.get(word, 0) + 1

    def build_vocab(self, is_text: bool = False):
        """Строит финальный словарь после подсчёта всех слов"""
        sorted_words = sorted(self._temp_word_counts.items(),
                            key=lambda x: x[1],
                            reverse=True)

        for word, count in sorted_words[:MAX_VOCAB_SIZE - 4]:
            if word not in self.word2index:
                if is_text:
                    if count > 10:
                        self.word2index[word] = self.n_words
                        self.word2count[word] = count
                        self.index2word[self.n_words] = word
                        self.n_words += 1
                    else:
                        self.word2count["<unk>"] += count

                else:
                    if count > 5:
                        self.word2index[word] = self.n_words
                        self.word2count[word] = count
                        self.index2word[self.n_words] = word
                        self.n_words += 1
                    else:
                        self.word2count["<unk>"] += count

        for word, count in sorted_words[MAX_VOCAB_SIZE - 4:]:
            self.word2count["<unk>"] += count

    def word_to_index(self, word: str) -> int:
        """Возвращает индекс слова или <unk>"""
        return self.word2index.get(word, self.word2index["<unk>"])

    def index_to_word(self, index: int) -> str:
        """Возвращает слово по индексу"""
        return self.index2word.get(index, self.word2index["<unk>"])

    def __str__(self):
        """Строковое представление словаря"""
        return (
            f"Vocab(name='{self.name}', "
            f"n_words={self.n_words}, "
        )

    def __len__(self):
        return len(self.word2index)

In [82]:
title_vocab = Vocab("title")
text_vocab = Vocab("text")

In [83]:
for text in dataset['text']:
    text_vocab.addText(text)

for title in dataset['title']:
    title_vocab.addText(title)

text_vocab.build_vocab()
title_vocab.build_vocab()

In [84]:
text_vocab.__str__(), title_vocab.__str__()

("Vocab(name='text', n_words=30000, ", "Vocab(name='title', n_words=19534, ")

In [85]:
input_vocab = Vocab("input")
target_vocab = Vocab("target")

for title, text in train_pairs:
    input_vocab.addText(text)
    target_vocab.addText(title)

input_vocab.build_vocab(is_text=True)
target_vocab.build_vocab(is_text=False)


## Преобразование текста в датасет

In [86]:
def text_to_tensor(text: str, vocab: Vocab, add_sos_eos=True, max_len: int | None = None, truncate_from_start=False) -> torch.Tensor:
    """Преобразует текст в тензоры, с опциональной обрезкой"""
    tokens = text.strip().split()

    if max_len is not None:
        if truncate_from_start:
            tokens = tokens[-max_len:]
        else:
            tokens = tokens[:max_len]

    indices = [vocab.word_to_index(w) for w in tokens]

    if add_sos_eos:
        indices = [vocab.word2index["<sos>"]] + indices + [vocab.word2index["<eos>"]]

    return torch.tensor(indices, dtype=torch.long)


In [87]:
class TitleDataset(Dataset):
    def __init__(self, pairs: list[tuple[str, str]], input_vocab: Vocab, output_vocab: Vocab):
        """
            pairs — список пар (название, текст),
            input_vocab - словарь с частотами слов из текстов,
            output_vocab - словарь с частотами слов из названий
        """
        self.pairs = pairs
        self.input_vocab = input_vocab
        self.output_vocab = output_vocab

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        title, text = self.pairs[idx]
        input_tensor = text_to_tensor(text, self.input_vocab, add_sos_eos=False, max_len=300)
        target_tensor = text_to_tensor(title, self.output_vocab, add_sos_eos=False, max_len=30)
        return input_tensor, target_tensor


In [88]:
def collate_fn(batch: List[tuple[str, str]]):
    """
    batch: list of (input_tensor, target_tensor)
    Returns:
        input_padded: [batch, src_len]
        target_padded: [batch, trg_len]
    """
    src_batch, trg_batch = zip(*batch)

    src_padded = pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_padded = pad_sequence(trg_batch, padding_value=0, batch_first=True)

    return src_padded, trg_padded


# Модель seq2seq

## Энкодер для seq2seq

In [89]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.3):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.lstm(embedded)
        return output, (hidden, cell)

## Декодер для seq2seq

In [90]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

## Модель

In [91]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hidden_size == decoder.hid_dim, "Hidden dimensions must match!"
        assert decoder.n_layers == 1, "Encoder must produce compatible layers for decoder"

    def forward(self, src, trg, teacher_forcing_ratio=0.1):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # [batch_size, trg_len, vocab_size]
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        encoder_outputs, (hidden, cell) = self.encoder(src)

        input = trg[:, 0]  # <sos>

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs



вайбкодинг

In [92]:
PAD_IDX = 0
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [93]:
def train(model, dataloader, optimizer, criterion, clip=1.0, device='cpu'):
    model.train()
    epoch_loss = 0
    total_grad_norm = 0
    batch_count = 0

    for src, trg in dataloader:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        current_grad_norm = 0
        non_zero_grads = 0

        for p in model.parameters():
            if p.grad is not None:
                grad_mean = p.grad.abs().mean()
                if grad_mean < 0.01:
                    p.grad *= 2.0

                current_grad_norm += p.grad.norm().item()
                non_zero_grads += 1

        avg_grad_norm = current_grad_norm / max(1, non_zero_grads)
        dynamic_clip = min(clip, avg_grad_norm * 1.5)

        torch.nn.utils.clip_grad_norm_(model.parameters(), dynamic_clip)

        optimizer.step()

        epoch_loss += loss.item()
        total_grad_norm += current_grad_norm
        batch_count += 1

    return epoch_loss / len(dataloader)

In [94]:
def evaluate(model, dataloader, criterion, device='cpu'):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg, teacher_forcing_ratio=0.0)

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


In [95]:
INPUT_DIM = input_vocab.n_words
OUTPUT_DIM = target_vocab.n_words
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 512
EMB_DIM = 128
ENC_DROPOUT = 0.4
DEC_DROPOUT = 0.4
N_LAYERS = 1

In [96]:
encoder = EncoderLSTM(INPUT_DIM, HID_DIM, dropout_p=ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(encoder, decoder, device).to(device)




In [97]:
train_dataset = TitleDataset(train_pairs, input_vocab, target_vocab)
val_dataset = TitleDataset(val_pairs, input_vocab, target_vocab)

In [98]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [99]:
AD_IDX = target_vocab.word2index["<pad>"]
num_epochs = 10

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=0.0003, weight_decay=1e-4)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.3,
    patience=1,
    threshold=0.01
)

early_stopping_patience = 3
best_val_loss = float('inf')
epochs_without_improvement = 0
best_model_state = None
previous_val_loss = None

for epoch in range(1, num_epochs + 1):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, criterion, clip=1.0, device=device)
    val_loss = evaluate(model, val_loader, criterion, device)
    epoch_time = time.time() - start_time

    scheduler.step(val_loss)

    if previous_val_loss is not None and (abs(val_loss - previous_val_loss) <= 0.01 or val_loss > previous_val_loss):
        epochs_without_improvement += 1
    else:
        epochs_without_improvement = 0

    previous_val_loss = val_loss

    if epochs_without_improvement >= early_stopping_patience:
        print(f"Ранний останов после {epoch:02} эпох (val_loss изменяется менее чем на ±0.01 в течение {early_stopping_patience} эпох)!")
        torch.save(model.state_dict(), 'models/best_model.pt')
        break

    print(
        f"{epochs_without_improvement}\n"
        f"Epoch {epoch:02} | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f} "
        f"| LR: {optimizer.param_groups[0]['lr']:.6f} | Time: {epoch_time:3f}s"
    )

0
Epoch 01 | Train Loss: 7.620 | Val Loss: 7.451 | LR: 0.000300 | Time: 193.798814s
0
Epoch 02 | Train Loss: 7.130 | Val Loss: 7.425 | LR: 0.000300 | Time: 195.494848s
0
Epoch 03 | Train Loss: 6.940 | Val Loss: 7.391 | LR: 0.000090 | Time: 192.276566s
1
Epoch 04 | Train Loss: 6.711 | Val Loss: 7.398 | LR: 0.000090 | Time: 200.260401s
2
Epoch 05 | Train Loss: 6.638 | Val Loss: 7.403 | LR: 0.000027 | Time: 196.232395s
Ранний останов после 06 эпох (val_loss изменяется менее чем на ±0.01 в течение 3 эпох)!


In [100]:
if best_model_state is not None:
    model.load_state_dict(best_model_state)


In [106]:
def generate_title(model, input_text, input_vocab, target_vocab, max_len=50, device="cpu", temperature=0.7):
    model.eval()

    tokens = re.findall(r"\w+|[.,!?;]", input_text.lower())
    src_indexes = [input_vocab.word2index.get(token, input_vocab.word2index["<unk>"]) for token in tokens]

    if not src_indexes:
        return "Невозможно проанализировать текст"

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    trg_indexes = [target_vocab.word2index["<sos>"]]

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model(src_tensor, trg_tensor)

        output_dist = output[0,-1].div(temperature).exp()
        pred_token = torch.multinomial(output_dist, 1).item()

        if pred_token == target_vocab.word2index["<eos>"] or (i > 10 and len(set(trg_indexes[-5:])) < 2):
            break

        trg_indexes.append(pred_token)

    filtered = []
    for idx in trg_indexes[1:]:
        word = target_vocab.index_to_word(idx)
        if word not in ["<pad>", "<unk>", "<sos>", "<eos>"] and not word.isdigit():
            filtered.append(word)

    result = ' '.join(filtered).capitalize()
    result = re.sub(r'\s([?.!,](?:\s|$))', r'\1', result)

    return result

In [107]:
INPUT_DIM = input_vocab.n_words
OUTPUT_DIM = target_vocab.n_words
HID_DIM = 512
N_LAYERS = 1

encoder = EncoderLSTM(INPUT_DIM, HID_DIM, dropout_p=0.4)
decoder = Decoder(OUTPUT_DIM, 128, HID_DIM, N_LAYERS, 0.4)

model = Seq2Seq(encoder, decoder, device).to(device)
model.load_state_dict(torch.load('models/best_model.pt', map_location=device))
model.eval()

print("Генератор названий")
print("Введите текст (или 'выход' для завершения):")

while True:
    input_text = input("\n> ")

    if input_text.lower() in ['выход', 'exit', 'quit']:
        break

    if len(input_text.strip()) == 0:
        print("Пожалуйста, введите текст.")
        continue

    title = generate_title(model, input_text, input_vocab, target_vocab, device=device)
    print("\nСгенерированное название:")
    print(title)
    print("\nВведите следующий текст или 'выход' для завершения:")




Генератор заголовков новостей
Введите текст новости (или 'выход' для завершения):

Сгенерированный заголовок:
Сердцем волк

Введите следующий текст или 'выход' для завершения:

Сгенерированный заголовок:
Qu корнеслов ценностях

Введите следующий текст или 'выход' для завершения:

Сгенерированный заголовок:
Шнурки безвременье. .

Введите следующий текст или 'выход' для завершения:


KeyboardInterrupt: Interrupted by user

In [103]:
print("Размер словаря:", len(target_vocab))
print("Примеры слов:", [target_vocab.index_to_word(i) for i in range(10)])

Размер словаря: 18040
Примеры слов: ['<pad>', '<unk>', 'sos', 'eos', '<sos>', '<eos>', '.', ',', '...', 'и']
