In [None]:
import pandas as pd

In [None]:
# Load the lyrics table; later cells read and rewrite its 'lyrics' column.
df = pd.read_csv('slava_kpss_lyrics.csv')

In [None]:
from nltk.tokenize import sent_tokenize
import nltk
# Fetch the 'punkt_tab' NLTK data package (presumably the data sent_tokenize
# needs for sentence splitting; confirm against the installed NLTK version).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import os
import re
import time
import random

import numpy as np
import tensorflow as tf

from tokenizers import ByteLevelBPETokenizer, Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Sequence, NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.processors import BertProcessing

def remove_bracketed_sections(text):
    """Remove square-bracket annotations and a few noisy characters.

    Args:
        text: Raw lyrics as a ``str``.

    Returns:
        The text with every ``[...]`` span (e.g. section headers) removed,
        ``;`` and ``"`` deleted, and non-breaking spaces turned into plain
        spaces.
    """
    # Non-greedy so that each bracket pair is removed on its own; '.' does not
    # cross newlines, so an unbalanced '[' cannot swallow following lines.
    text = re.sub(r'\[.*?\]', '', text)
    # The non-breaking space is written as an escape so that the otherwise
    # invisible character is explicit in the source.
    table = str.maketrans({';': '', '"': '', '\u00a0': ' '})
    return text.translate(table)

def preprocess_lyrics(text):
    """Normalise line breaks and drop parenthesised asides from lyrics.

    Args:
        text: Lyrics as a ``str``.

    Returns:
        The text with every ``(...)`` span removed, each run of whitespace
        that contains a newline collapsed to a single ``\\n``, and leading and
        trailing whitespace stripped.
    """
    # Non-greedy and single-line: removes each "(...)" aside separately.
    text = re.sub(r'\(.*?\)', '', text)
    # Collapses blank lines and trims padding around every line break; this
    # also covers lines left empty by the removal above.
    text = re.sub(r'\s*\n\s*', '\n', text)
    return text.strip()

# Clean only the rows that actually have lyrics: both helpers call re.sub,
# which raises TypeError on NaN, so missing values are dropped first.
lyrics_clean = (
    df['lyrics']
    .dropna()
    .apply(remove_bracketed_sections)
    .apply(preprocess_lyrics)
)
# Assignment aligns on the index, so rows without lyrics stay NaN.
df['lyrics'] = lyrics_clean

# Split every song into sentences; each sentence becomes one training sequence.
all_sentences = []
for s in lyrics_clean:
    all_sentences.extend(sent_tokenize(s, language='russian'))

# One sentence per line: this file is the tokenizer's training corpus.
all_songs = "\n".join(all_sentences)
with open("all_songs.txt", "w", encoding="utf-8") as f:
    f.write(all_songs)


# Train a byte-level BPE vocabulary on the corpus and write vocab.json and
# merges.txt into the "tokenizer" directory.
os.makedirs("tokenizer", exist_ok=True)
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="all_songs.txt",
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<eos>"]
)
tokenizer.save_model("tokenizer")

# Rebuild a plain Tokenizer from the saved files. BPE.from_file replaces the
# deprecated path-taking BPE(...) constructor, which emits a DeprecationWarning.
bpe = Tokenizer(BPE.from_file("tokenizer/vocab.json", "tokenizer/merges.txt"))
# NOTE(review): the trainer above was created with default settings while this
# pipeline lowercases and strips accents; confirm the mismatch is intended.
bpe.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
bpe.pre_tokenizer = ByteLevel()
bpe.decoder = ByteLevelDecoder()
# Wrap every encoded sequence in the "<s>" and "</s>" markers.
bpe.post_processor = BertProcessing(
    ("</s>", bpe.token_to_id("</s>")),
    ("<s>", bpe.token_to_id("<s>")),
)

  bpe = Tokenizer(BPE("tokenizer/vocab.json", "tokenizer/merges.txt"))


In [None]:
# Number of tokens in each model input; training windows hold sequence_len + 1
# tokens so that inputs and targets can be offset by one position.
sequence_len = 100
# Number of examples per batch passed to Dataset.batch.
batch_size   = 64
# Size of the buffer passed to Dataset.shuffle.
buffer_size  = 10_000

def encode_bpe(text):
    """Encode one piece of text into BPE token ids followed by ``<eos>``.

    Args:
        text: An eager string tensor (as delivered by ``tf.py_function``),
            ``bytes`` or ``str``.

    Returns:
        A 1-D ``np.int32`` array of token ids with the ``<eos>`` id appended.
        Empty input yields a single zero so the result is never empty.
    """
    # Unwrap eager tensors, then decode raw bytes; plain str passes through.
    if hasattr(text, "numpy"):
        text = text.numpy()
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    if not text:
        return np.zeros((1,), dtype=np.int32)
    ids = bpe.encode(text).ids
    ids.append(bpe.token_to_id("<eos>"))
    return np.array(ids, dtype=np.int32)

def tf_encode(txt):
    """Run ``encode_bpe`` on a dataset element from inside a tf.data pipeline.

    The Python tokenizer is wrapped in ``tf.py_function``; the result is
    declared as a 1-D int32 tensor of unknown length because the wrapper does
    not carry static shape information.
    """
    token_ids = tf.py_function(encode_bpe, inp=[txt], Tout=tf.int32)
    token_ids.set_shape([None])
    return token_ids

print(f"Количество предложений: {len(all_sentences)}")
# One dataset element per sentence, tokenised lazily through tf_encode.
ds = tf.data.Dataset.from_tensor_slices(all_sentences)
ds = ds.map(tf_encode)
# Keep sequences that can fill at least one (sequence_len + 1)-token window.
# `>=` rather than `>`: a sequence of exactly that length yields one window.
ds = ds.filter(lambda x: tf.size(x) >= sequence_len + 1)

def chunk_and_split(sequence):
    """Cut a token sequence into fixed windows and derive (input, target) pairs.

    The sequence is truncated to a whole number of windows of
    ``sequence_len + 1`` tokens. Each window gives an input (all tokens but
    the last) and a target (all tokens but the first), so the target is the
    input shifted one position to the left.

    Returns:
        A pair of 2-D tensors, each with ``sequence_len`` columns.
    """
    window = sequence_len + 1
    n_windows = (tf.shape(sequence)[0]) // window
    usable = sequence[:n_windows * window]
    windows = tf.reshape(usable, [-1, window])
    return windows[:, :-1], windows[:, 1:]

ds = ds.map(chunk_and_split)
ds = ds.unbatch()  # flatten the per-sequence groups of windows into single examples
# Shuffle once and keep that order on every iteration. With the default
# per-iteration reshuffle, a take()/skip() split of this dataset would pick
# different batches each epoch and leak validation data into training.
ds = ds.shuffle(buffer_size, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)
ds = ds.prefetch(tf.data.AUTOTUNE)

# Size of the tokenizer vocabulary; build_model uses it for the embedding
# table and for the width of the output layer.
vocab_size = bpe.get_vocab_size()
# Dimensionality of the token embeddings.
embed_dim  = 256
# Units in the first LSTM layer; the second layer uses half as many.
rnn_units  = 1024
# Number of training epochs passed to model.fit.
EPOCHS = 4

def build_model():
    """Build the token-level language model.

    Architecture: embedding, two stacked LSTMs (``rnn_units`` then
    ``rnn_units // 2``, both returning full sequences), a 128-unit ReLU
    layer, batch normalisation, and a final projection to ``vocab_size``
    raw scores per position.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``sequence_len`` int32 token
        ids per example.
    """
    layers = tf.keras.layers
    token_ids = layers.Input(shape=(sequence_len,), dtype=tf.int32)
    hidden = layers.Embedding(vocab_size, embed_dim)(token_ids)
    for units in (rnn_units, rnn_units // 2):
        hidden = layers.LSTM(
            units, return_sequences=True, dropout=0.2, recurrent_dropout=0.2
        )(hidden)
    hidden = layers.Dense(128, activation="relu")(hidden)
    hidden = layers.BatchNormalization()(hidden)
    logits = layers.Dense(vocab_size)(hidden)
    return tf.keras.Model(inputs=token_ids, outputs=logits)

model = build_model()
model.summary()

# Targets are integer token ids and the final Dense layer has no activation,
# hence the sparse loss with from_logits=True.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["sparse_categorical_accuracy"],
)

# Stop when validation loss has not improved for 3 epochs and restore the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

# Count batches by streaming through the dataset once instead of building a
# list of every batch in memory. The pipeline contains filter/unbatch, so the
# size is not known without iterating.
num_batches = sum(1 for _ in ds)
train_size = int(0.9 * num_batches)
if train_size < 1:
    raise ValueError(
        f"Not enough data to train: the dataset yields only {num_batches} batch(es)."
    )

# NOTE(review): take/skip only gives a fixed train/validation partition if
# `ds` produces the same order on every iteration; confirm that the upstream
# shuffle does not reshuffle per iteration.
train_ds = ds.take(train_size)
val_ds = ds.skip(train_size)

# No surrounding try: a failure here should surface immediately rather than
# leave `history` undefined for later cells.
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=[early_stopping]
)

Количество предложений: 5992


Epoch 1/4
     41/Unknown [1m1067s[0m 26s/step - loss: 10.0981 - sparse_categorical_accuracy: 0.0646



[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1111s[0m 27s/step - loss: 10.0841 - sparse_categorical_accuracy: 0.0652 - val_loss: 8.0743 - val_sparse_categorical_accuracy: 0.1043
Epoch 2/4
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1149s[0m 26s/step - loss: 8.2426 - sparse_categorical_accuracy: 0.1268 - val_loss: 6.8057 - val_sparse_categorical_accuracy: 0.1047
Epoch 3/4
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1101s[0m 26s/step - loss: 6.9738 - sparse_categorical_accuracy: 0.1510 - val_loss: 6.2294 - val_sparse_categorical_accuracy: 0.1414
Epoch 4/4
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1103s[0m 27s/step - loss: 6.4147 - sparse_categorical_accuracy: 0.1799 - val_loss: 5.6994 - val_sparse_categorical_accuracy: 0.1576


In [None]:
def generate_text(model, start_string, temperature=1.0, max_length=1000, greedy=False):
    """Generate a continuation of ``start_string`` one token at a time.

    Args:
        model: Callable that maps an int32 tensor of shape
            ``(1, sequence_len)`` to scores of shape
            ``(1, sequence_len, vocab)``; only the last position is used.
        start_string: Prompt text.
        temperature: Divisor applied to the scores before sampling. Values
            at or below zero fall back to greedy decoding.
        max_length: Maximum number of tokens to generate.
        greedy: If True, pick the highest-scoring token at every step; if
            False, sample a token with probabilities from the softmax.

    Returns:
        ``start_string`` followed by the decoded generated tokens. Generation
        stops early when ``<eos>`` is produced; that token is not decoded.
    """
    eos_id = bpe.token_to_id("<eos>")
    end_ids = {eos_id, bpe.token_to_id("</s>")}

    # encode_bpe terminates the text with end markers; drop them so that the
    # model continues the prompt instead of being primed with "end of text".
    prompt_ids = [int(t) for t in encode_bpe(tf.constant(start_string))]
    while prompt_ids and prompt_ids[-1] in end_ids:
        prompt_ids.pop()

    # Fixed-size context window: left-padded with zeros, or truncated to the
    # most recent sequence_len tokens.
    window = np.zeros(sequence_len, dtype=np.int32)
    tail = prompt_ids[-sequence_len:]
    if tail:
        window[-len(tail):] = tail

    use_greedy = greedy or temperature <= 0
    text_generated = []
    for _ in range(max_length):
        input_tensor = tf.convert_to_tensor(window[np.newaxis, :], dtype=tf.int32)
        predictions = model(input_tensor)
        last_scores = predictions[:, -1, :].numpy()[0].astype(np.float64)

        if use_greedy:
            predicted_id = int(np.argmax(last_scores))
        else:
            # Softmax in float64 with explicit renormalisation: float32
            # probabilities over a large vocabulary can fail the sum-to-one
            # check in np.random.choice.
            scaled = last_scores / temperature
            scaled -= scaled.max()
            probs = np.exp(scaled)
            probs /= probs.sum()
            predicted_id = int(np.random.choice(probs.size, p=probs))

        if predicted_id == eos_id:
            break
        text_generated.append(predicted_id)

        # Slide the window one step left and append the new token (kept int32).
        window = np.roll(window, -1)
        window[-1] = predicted_id

    result_text = bpe.decode(text_generated)
    return start_string + result_text

def compare_generation_methods(model, start_string, temperatures=(0.5, 1.0, 1.5)):
    """Generate and print continuations with greedy and sampled decoding.

    Args:
        model: Model passed through to ``generate_text``.
        start_string: Prompt text.
        temperatures: Sampling temperatures to try, one generation each.

    Returns:
        A dict mapping ``"greedy"`` and each temperature to its generated
        text.
    """
    print(f"Начальный текст: '{start_string}'")
    print("-" * 50)
    print("ЖАДНАЯ ГЕНЕРАЦИЯ (выбор наиболее вероятного токена):")
    greedy_text = generate_text(model, start_string, temperature=1.0, greedy=True)
    print(greedy_text)

    results = {"greedy": greedy_text}
    for temp in temperatures:
        print("-" * 50)
        print(f"СЛУЧАЙНАЯ ГЕНЕРАЦИЯ (температура = {temp}):")
        random_text = generate_text(model, start_string, temperature=temp, greedy=False)
        print(random_text)
        results[temp] = random_text
    return results
# model = tf.keras.models.load_model('saved_model_path')
# Prompts used to compare the decoding strategies.
start_texts = ["Я встретил тебя", "В небе сияла", "Когда наступит время"]

for start_text in start_texts:
    compare_generation_methods(model, start_text, temperatures=[0.5, 1.0, 1.5])
    # Visual separator between prompts.
    print(f"\n{'=' * 70}\n")

Начальный текст: 'Я встретил тебя'
--------------------------------------------------
ЖАДНАЯ ГЕНЕРАЦИЯ (выбор наиболее вероятного токена):
Я встретил тебя
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава кпсс]
[куплет 1: слава 

KeyboardInterrupt: 

In [None]:
def save_model_and_tokenizer(model, model_path='lstm_model.keras', tokenizer_path='tokenizer'):
    """Save the model and report where the model and tokenizer files are.

    Only the model is written here; ``tokenizer_path`` is reported, not
    written to.

    Args:
        model: Object with a ``save(path)`` method.
        model_path: Destination passed to ``model.save``.
        tokenizer_path: Directory reported as the tokenizer location.
    """
    model.save(model_path)
    print(f"Модель сохранена в {model_path}")
    print(f"Токенизатор находится в {tokenizer_path}")

def load_model_and_tokenizer(model_path='lstm_model', tokenizer_path='tokenizer'):
    """Load a saved model and rebuild the BPE tokenizer from its files.

    Args:
        model_path: Path of the saved model. If it does not exist but
            ``model_path + '.keras'`` does, that file is loaded instead.
        tokenizer_path: Directory containing ``vocab.json`` and
            ``merges.txt``.

    Returns:
        A ``(model, bpe)`` tuple.
    """
    # save_model_and_tokenizer writes 'lstm_model.keras' by default while the
    # default here has no extension; fall back so the two defaults round-trip.
    if not os.path.exists(model_path) and os.path.exists(f"{model_path}.keras"):
        model_path = f"{model_path}.keras"
    model = tf.keras.models.load_model(model_path)

    # BPE.from_file replaces the deprecated path-taking BPE(...) constructor.
    bpe = Tokenizer(BPE.from_file(f"{tokenizer_path}/vocab.json", f"{tokenizer_path}/merges.txt"))
    bpe.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    bpe.pre_tokenizer = ByteLevel()
    bpe.decoder = ByteLevelDecoder()
    bpe.post_processor = BertProcessing(
        ("</s>", bpe.token_to_id("</s>")), ("<s>", bpe.token_to_id("<s>"))
    )
    return model, bpe


def interactive_generation(model, bpe):
    """Prompt for generation settings in a loop and print each result.

    Entering ``q`` as the start text ends the loop. Empty answers select the
    defaults (temperature 1.0, length 100); a non-numeric answer restarts the
    current round instead of raising.

    Args:
        model: Model passed through to ``generate_text``.
        bpe: Kept for interface compatibility; not used in this function
            (``generate_text`` reads the module-level tokenizer).
    """
    while True:
        start_text = input("\nНачальный текст: ")
        if start_text.strip().lower() == 'q':
            break

        try:
            temp = float(input("Температура (0.1-2.0, где 1.0 - нейтральная): ") or "1.0")
            max_len = int(input("Максимальная длина (в токенах): ") or "100")
        except ValueError:
            print("Некорректное число, попробуйте ещё раз.")
            continue

        greedy = input("Использовать жадный выбор? (y/n): ").strip().lower() == 'y'

        print("\nГенерация...")
        generated_text = generate_text(
            model,
            start_text,
            temperature=temp,
            max_length=max_len,
            greedy=greedy
        )
        print(generated_text)

# Persist the trained model using the function's default paths.
save_model_and_tokenizer(model)

Модель сохранена в lstm_model.keras
Токенизатор находится в tokenizer
