1. **Классификация текстовых отрывков**
   - Подбор архитектур RNN (LSTM, GRU) и их гиперпараметров для анализа естественных текстов (Чехов, Достоевский, Маяковский и пр.).
   - Оценка качества (Accuracy, Loss) обучения на различных наборах текстов и конфигурациях сетей.

In [None]:
!pip install sentencepiece



In [None]:
# Mount Google Drive at /content/drive (this import is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential, layers, optimizers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score


In [None]:

# 1. Settings
# Corpus names; the training cell opens "<name>.txt" for each entry.
authors = ["Чехов", "Маяковский", "Достоевский", "Корь", "Чесотка", "Шизофрения"]
model_types = ["LSTM", "GRU"]  # recurrent layer families to compare
seq_length = 40        # number of token ids in each input window
vocab_size = 4000      # adjust as needed
embedding_dim = 64     # size of each token embedding vector
rnn_units = 64         # units of the final (non-bidirectional) recurrent layer
batch_size = 32
epochs = 10            # for demo; increase as needed
dropout = 0.2          # rate used by the Dropout layers of the model
# NOTE: the name is misspelled ("recurennt") but is kept unchanged because
# build_model_with_self_attention reads it under this exact name.
recurennt_dropout = 0.2
func_activation = "softmax" # default: softmax (activation of the output Dense layer)
# Units per direction of the bidirectional layer; also used as the attention key_dim.
bidirectional_rnn_units = int(embedding_dim)

# One dict of settings and metrics is appended per (author, model type) run.
results = []


In [None]:
# 2. Helper: build model
import tensorflow as tf
from tensorflow.keras import Model, Input, layers, optimizers

def build_model_with_self_attention(model_type):
    """Build and compile a Bi-RNN model with a self-attention block.

    Layer stack: token embedding -> bidirectional LSTM/GRU returning the
    whole sequence -> dropout -> multi-head self-attention added back to
    its input (residual) -> second LSTM/GRU returning only the last state
    -> dropout -> dense layer with ``vocab_size`` outputs.

    All sizes and rates are read from the module-level settings
    (``seq_length``, ``vocab_size``, ``embedding_dim``,
    ``bidirectional_rnn_units``, ``rnn_units``, ``dropout``,
    ``recurennt_dropout``, ``func_activation``).

    Args:
        model_type: ``"LSTM"`` selects LSTM layers; any other value
            selects GRU layers.

    Returns:
        A Keras ``Model`` named ``"<model_type>_with_self_attention"``,
        compiled with sparse categorical cross-entropy, Adam (learning
        rate 3e-4, gradient norm clipped to 1.0) and the accuracy metric.
    """
    # Choose the recurrent layer family once instead of repeating the
    # same if/else around both recurrent layers.
    if model_type == "LSTM":
        rnn_cls, suffix = layers.LSTM, "lstm"
    else:
        rnn_cls, suffix = layers.GRU, "gru"

    # 1) Input: a window of `seq_length` token ids
    inp = Input(shape=(seq_length,), name="input_tokens")

    # 2) Embeddings
    #    `input_length` is intentionally not passed: it is deprecated in
    #    Keras 3 and the sequence length is already fixed by the Input shape.
    x = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name="embedding",
    )(inp)

    # 3) Bi-RNN that returns the full sequence (needed by the attention block)
    rnn = rnn_cls(
        bidirectional_rnn_units,
        return_sequences=True,
        recurrent_dropout=recurennt_dropout,
        name=f"bi_{suffix}",
    )
    x = layers.Bidirectional(rnn, merge_mode="concat", name="bidirectional")(x)
    x = layers.Dropout(dropout, name="dropout_after_bi_rnn")(x)

    # 4) Self-attention: query = key = value = all time steps of the Bi-RNN
    attn = layers.MultiHeadAttention(
        num_heads=4,
        key_dim=bidirectional_rnn_units,
        name="self_attention",
    )
    # In the Functional API the call is attn(query, value)
    attn_out = attn(x, x)
    # Residual connection around the attention block
    x = layers.Add(name="residual_after_attention")([x, attn_out])

    # 5) Second RNN without return_sequences: keeps only the last state
    x = rnn_cls(
        rnn_units,
        recurrent_dropout=recurennt_dropout,
        name=f"final_{suffix}",
    )(x)
    x = layers.Dropout(dropout, name="dropout_before_dense")(x)

    # 6) Output dense layer over the vocabulary
    out = layers.Dense(
        vocab_size,
        activation=func_activation,
        name="softmax_output",
    )(x)

    model = Model(inputs=inp, outputs=out, name=f"{model_type}_with_self_attention")

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=optimizers.Adam(3e-4, clipnorm=1.0),
        metrics=["accuracy"],
    )
    return model

# Example: build the GRU variant and print its layer summary
model = build_model_with_self_attention("GRU")
model.summary()




In [None]:
# 3. Loop over authors & model types
# For every corpus: clean and tokenize the text, cut it into sliding windows
# of `seq_length` tokens whose target is the token that follows the window,
# then train and evaluate one model per entry of `model_types`.
for author in authors:
    # 3.1 Load & preprocess text
    with open(f"/content/drive/MyDrive/Text_MLmedicine/data/{author}.txt", encoding="utf-8") as f:
        text = f.read().lower()
    # Simple clean: keep Cyrillic letters, digits, whitespace, basic punctuation
    text = tf.strings.regex_replace(text, r"[^а-яё0-9\s\.,;:!\?—\-]", " ")
    # Collapse whitespace runs and convert the tensor back to a Python str
    text = tf.strings.regex_replace(text, r"\s+", " ").numpy().decode().strip()

    # 3.2 Tokenize with Keras Tokenizer
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<unk>")
    tok.fit_on_texts([text])
    seq = tok.texts_to_sequences([text])[0]

    # 3.3 Create input-output pairs: window of seq_length ids -> next id
    n_windows = len(seq) - seq_length
    X = np.array([seq[i:i + seq_length] for i in range(n_windows)])
    y = np.array([seq[i + seq_length] for i in range(n_windows)])

    # 3.4 Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=5641)

    for mtype in model_types:
        print(f"Training {mtype} for {author}...")
        # Drop the Keras global state left by previously built models so
        # that memory does not accumulate across the many runs of this loop.
        tf.keras.backend.clear_session()
        model = build_model_with_self_attention(mtype)
        # Stop when val_loss has not improved for 2 epochs and roll the
        # model back to the weights of the best epoch.
        cb = callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
        history = model.fit(
            X_train, y_train,
            validation_split=0.2,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[cb],
            verbose=1
        )

        # 3.5 Evaluate
        loss, acc = model.evaluate(X_test, y_test, verbose=0)
        # Predicted token id = argmax over the per-class outputs
        y_pred_probs = model.predict(X_test, verbose=0)
        y_pred = np.argmax(y_pred_probs, axis=1)
        # NOTE(review): the metrics below compare token ids as if they were
        # numeric quantities; ids carry no ordinal meaning, so loss/accuracy
        # are the scores to rely on. Kept for parity with earlier results.
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        ev = explained_variance_score(y_test, y_pred)

        # 3.6 Record (every metric rounded to 4 decimals)
        results.append({
            "Author": author,
            "Model": mtype,
            "Embedding dim": embedding_dim,
            "RNN units": rnn_units,
            "Dropout": dropout,
            # Number of epochs actually run (early stopping may cut it short)
            "Epochs": len(history.history["loss"]),
            "Batch size": batch_size,
            "Final Loss": round(loss, 4),
            "Final Accuracy": round(acc, 4),
            "MSE": round(mse, 4),
            "RMSE": round(rmse, 4),
            "MAE": round(mae, 4),
            "R2": round(r2, 4),
            "Explained Variance": round(ev, 4)
        })



Training LSTM for Чехов...
Epoch 1/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 257ms/step - accuracy: 0.0359 - loss: 7.9377 - val_accuracy: 0.0490 - val_loss: 7.2815
Epoch 2/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 250ms/step - accuracy: 0.0528 - loss: 6.9724 - val_accuracy: 0.0490 - val_loss: 7.5235
Epoch 3/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 235ms/step - accuracy: 0.0516 - loss: 6.8879 - val_accuracy: 0.0490 - val_loss: 7.6462
Training GRU for Чехов...
Epoch 1/10




[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 210ms/step - accuracy: 0.0439 - loss: 7.9430 - val_accuracy: 0.0490 - val_loss: 7.2699
Epoch 2/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 200ms/step - accuracy: 0.0540 - loss: 6.9288 - val_accuracy: 0.0490 - val_loss: 7.4647
Epoch 3/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 198ms/step - accuracy: 0.0489 - loss: 6.8450 - val_accuracy: 0.0490 - val_loss: 7.6198
Training LSTM for Маяковский...
Epoch 1/10




[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 349ms/step - accuracy: 0.0241 - loss: 8.2929 - val_accuracy: 0.0114 - val_loss: 8.2867
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 200ms/step - accuracy: 0.0564 - loss: 8.2429 - val_accuracy: 0.0114 - val_loss: 8.1607
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 295ms/step - accuracy: 0.0436 - loss: 7.7030 - val_accuracy: 0.0114 - val_loss: 7.9909
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 202ms/step - accuracy: 0.0371 - loss: 7.0221 - val_accuracy: 0.0114 - val_loss: 8.0098
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 283ms/step - accuracy: 0.0400 - loss: 6.6234 - val_accuracy: 0.0114 - val_loss: 8.0901
Training GRU for Маяковский...
Epoch 1/10




[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 360ms/step - accuracy: 0.0140 - loss: 8.2930 - val_accuracy: 0.0343 - val_loss: 8.2885
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 218ms/step - accuracy: 0.0421 - loss: 8.2635 - val_accuracy: 0.0114 - val_loss: 8.1994
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 291ms/step - accuracy: 0.0317 - loss: 7.7598 - val_accuracy: 0.0114 - val_loss: 7.9953
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 217ms/step - accuracy: 0.0277 - loss: 7.0402 - val_accuracy: 0.0114 - val_loss: 8.0157
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 293ms/step - accuracy: 0.0298 - loss: 6.6247 - val_accuracy: 0.0114 - val_loss: 8.1068
Training LSTM for Достоевский...
Epoch 1/10




[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 215ms/step - accuracy: 0.0406 - loss: 7.7977 - val_accuracy: 0.0529 - val_loss: 7.2110
Epoch 2/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 205ms/step - accuracy: 0.0635 - loss: 6.7917 - val_accuracy: 0.0529 - val_loss: 7.4173
Epoch 3/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 210ms/step - accuracy: 0.0617 - loss: 6.7173 - val_accuracy: 0.0529 - val_loss: 7.5480
Training GRU for Достоевский...
Epoch 1/10




[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 219ms/step - accuracy: 0.0523 - loss: 7.7802 - val_accuracy: 0.0529 - val_loss: 7.2195
Epoch 2/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 214ms/step - accuracy: 0.0580 - loss: 6.7400 - val_accuracy: 0.0529 - val_loss: 7.3802
Epoch 3/10
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 214ms/step - accuracy: 0.0618 - loss: 6.6790 - val_accuracy: 0.0529 - val_loss: 7.4779
Training LSTM for Корь...




Epoch 1/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 264ms/step - accuracy: 0.0159 - loss: 8.2882 - val_accuracy: 0.0359 - val_loss: 8.1127
Epoch 2/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 222ms/step - accuracy: 0.0202 - loss: 7.7204 - val_accuracy: 0.0221 - val_loss: 7.3581
Epoch 3/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 247ms/step - accuracy: 0.0286 - loss: 6.7241 - val_accuracy: 0.0221 - val_loss: 7.2089
Epoch 4/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 240ms/step - accuracy: 0.0245 - loss: 6.4414 - val_accuracy: 0.0221 - val_loss: 7.2230
Epoch 5/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 248ms/step - accuracy: 0.0220 - loss: 6.2918 - val_accuracy: 0.0221 - val_loss: 7.2713
Training GRU for Корь...
Epoch 1/10




[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 253ms/step - accuracy: 0.0087 - loss: 8.2879 - val_accuracy: 0.0221 - val_loss: 8.1351
Epoch 2/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 196ms/step - accuracy: 0.0266 - loss: 7.6465 - val_accuracy: 0.0221 - val_loss: 7.3480
Epoch 3/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 180ms/step - accuracy: 0.0270 - loss: 6.7300 - val_accuracy: 0.0221 - val_loss: 7.2203
Epoch 4/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 217ms/step - accuracy: 0.0304 - loss: 6.3788 - val_accuracy: 0.0221 - val_loss: 7.2368
Epoch 5/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 220ms/step - accuracy: 0.0303 - loss: 6.3112 - val_accuracy: 0.0221 - val_loss: 7.2849
Training LSTM for Чесотка...
Epoch 1/10




[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 299ms/step - accuracy: 0.0068 - loss: 8.2928 - val_accuracy: 0.0282 - val_loss: 8.2847
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 189ms/step - accuracy: 0.0263 - loss: 8.2239 - val_accuracy: 0.0282 - val_loss: 7.9916
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 228ms/step - accuracy: 0.0235 - loss: 7.4601 - val_accuracy: 0.0282 - val_loss: 7.6199
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 190ms/step - accuracy: 0.0272 - loss: 6.7687 - val_accuracy: 0.0282 - val_loss: 7.5514
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 191ms/step - accuracy: 0.0303 - loss: 6.4579 - val_accuracy: 0.0282 - val_loss: 7.5805
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 218ms/step - accuracy: 0.0345 - loss: 6.2613 - val_accuracy: 0.0282 - val_loss: 7.6371
Training GRU for Чесотка...




Epoch 1/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 276ms/step - accuracy: 0.0101 - loss: 8.2931 - val_accuracy: 0.0282 - val_loss: 8.2871
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 277ms/step - accuracy: 0.0485 - loss: 8.2472 - val_accuracy: 0.0282 - val_loss: 8.0261
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 214ms/step - accuracy: 0.0416 - loss: 7.4861 - val_accuracy: 0.0282 - val_loss: 7.6635
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 262ms/step - accuracy: 0.0387 - loss: 6.7937 - val_accuracy: 0.0282 - val_loss: 7.5888
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 287ms/step - accuracy: 0.0383 - loss: 6.4480 - val_accuracy: 0.0282 - val_loss: 7.6110
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 215ms/step - accuracy: 0.0437 - loss: 6.2520 - val_accuracy: 0.0282 - val_loss: 7.6779
Training LSTM for Шизофре



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 265ms/step - accuracy: 0.0046 - loss: 8.2934 - val_accuracy: 0.0333 - val_loss: 8.2899
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 285ms/step - accuracy: 0.0220 - loss: 8.2740 - val_accuracy: 0.0333 - val_loss: 8.2173
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 191ms/step - accuracy: 0.0418 - loss: 8.0098 - val_accuracy: 0.0333 - val_loss: 7.9303
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 233ms/step - accuracy: 0.0472 - loss: 7.2796 - val_accuracy: 0.0333 - val_loss: 7.7492
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 186ms/step - accuracy: 0.0296 - loss: 6.7986 - val_accuracy: 0.0333 - val_loss: 7.7009
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 262ms/step - accuracy: 0.0317 - loss: 6.5112 - val_accuracy: 0.0333 - val_loss: 7.7185
Epoch 7/10
[1m19/19[0m [32m━━━━━━━━



[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 365ms/step - accuracy: 0.0156 - loss: 8.2934 - val_accuracy: 0.0333 - val_loss: 8.2894
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 194ms/step - accuracy: 0.0449 - loss: 8.2762 - val_accuracy: 0.0333 - val_loss: 8.2418
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 285ms/step - accuracy: 0.0342 - loss: 8.0320 - val_accuracy: 0.0333 - val_loss: 7.8719
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 195ms/step - accuracy: 0.0295 - loss: 7.1654 - val_accuracy: 0.0333 - val_loss: 7.7259
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 266ms/step - accuracy: 0.0371 - loss: 6.7222 - val_accuracy: 0.0333 - val_loss: 7.6930
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 200ms/step - accuracy: 0.0446 - loss: 6.4265 - val_accuracy: 0.0333 - val_loss: 7.7208
Epoch 7/10
[1m19/19[0m [32m━━━━━━━━

In [None]:
# 4. Save results table
# Keep the file name in one place so the confirmation message always
# reports the path that was actually written.
results_path = "model_comparison_results_self_attention_more.csv"
df = pd.DataFrame(results)
df.to_csv(results_path, index=False)
print(f"Done. Results saved to {results_path}")

Done. Results saved to model_comparison_results.csv


In [None]:
df

Unnamed: 0,Author,Model,Embedding dim,RNN units,Dropout,Epochs,Batch size,Final Loss,Final Accuracy,MSE,RMSE,MAE,R2,Explained Variance
0,Чехов,LSTM,64,64,0.2,3,32,7.3243,0.0433,1885206.0,1373.028,805.6142,-0.525,0.0
1,Чехов,GRU,64,64,0.2,3,32,7.336,0.0433,1885206.0,1373.028,805.6142,-0.525,0.0
2,Маяковский,LSTM,64,64,0.2,5,32,7.7807,0.0594,127829.2,357.5321,254.105,-1.016,0.0
3,Маяковский,GRU,64,64,0.2,5,32,7.7863,0.0594,127829.2,357.5321,254.105,-1.016,0.0
4,Достоевский,LSTM,64,64,0.2,3,32,7.1098,0.0638,1619774.0,1272.7035,696.9266,-0.4282,0.0
5,Достоевский,GRU,64,64,0.2,3,32,7.1006,0.0638,1619774.0,1272.7035,696.9266,-0.4282,0.0
6,Корь,LSTM,64,64,0.2,5,32,7.2298,0.0243,107304.9,327.5743,231.2693,-0.9929,0.0
7,Корь,GRU,64,64,0.2,5,32,7.2302,0.0243,107304.9,327.5743,231.2693,-0.9929,0.0
8,Чесотка,LSTM,64,64,0.2,6,32,7.4548,0.0226,101175.7,318.0812,221.1541,-0.9348,0.0
9,Чесотка,GRU,64,64,0.2,6,32,7.466,0.0301,101618.8,318.7771,222.094,-0.9433,0.0


In [None]:
# 4) Text generation helper
UNK_ID = sp.PieceToId("<unk>")  # usually 0, but look it up to be safe

def generate_text(model, sp, seed_text, gen_length=200, temperature=1.0):
    """Continue ``seed_text`` by sampling tokens from ``model`` one at a time.

    Args:
        model: object whose ``predict`` takes an array of shape
            ``(1, seq_length)`` and returns one row of per-token scores.
        sp: tokenizer providing ``EncodeAsIds``, ``DecodeIds`` and
            ``PieceToId`` (presumably a SentencePiece processor — confirm).
        seed_text: prompt to continue.
        gen_length: number of tokens to generate.
        temperature: forwarded to ``sample_with_temperature``.

    Returns:
        The decoded seed (its last ``seq_length`` tokens if it is longer)
        followed by the generated tokens. Left-padding ids that were added
        only to fill the model window are not decoded.
    """
    # Resolve <unk> from the tokenizer that was passed in, so the check
    # stays correct even if it is not the module-level `sp`.
    unk_id = sp.PieceToId("<unk>")

    # 4.1) Tokenize the seed
    seed_ids = sp.EncodeAsIds(seed_text)
    # If the seed is shorter than seq_length, left-pad it with zeros;
    # otherwise keep only its last seq_length tokens.
    pad_len = max(0, seq_length - len(seed_ids))
    if pad_len:
        seed_ids = [0] * pad_len + seed_ids
    else:
        seed_ids = seed_ids[-seq_length:]

    generated = list(seed_ids)

    # A sampled <unk> is discarded and re-sampled, so count accepted tokens
    # rather than loop iterations. The attempt budget prevents an endless
    # loop when the model keeps producing <unk>.
    produced = 0
    attempts_left = 10 * gen_length
    while produced < gen_length and attempts_left > 0:
        attempts_left -= 1
        # Batch of shape (1, seq_length) holding the most recent tokens
        x_pred = np.array(generated[-seq_length:])[None, :]
        # Scores for the next token
        preds = model.predict(x_pred, verbose=0)[0]   # shape=(vocab_size,)
        # Sample the next token id
        next_id = sample_with_temperature(preds, temperature)
        # next_id = top_k_sample(preds, 5)

        # If it is <unk>, drop it and try again
        if next_id == unk_id:
            continue

        generated.append(next_id)
        produced += 1

    # Decode everything except the artificial left padding
    return sp.DecodeIds(generated[pad_len:])



In [None]:
import re

# Generate a 50-token continuation of the prompt with the current `model`.
prompt = "Поручик Дубов уже не молодой армейский служака"
generated = generate_text(model, sp, prompt, gen_length=50, temperature=0.8)

# Allowed set: Cyrillic, Latin, digits, punctuation, whitespace;
# every other character is removed from the generated text.
# NOTE(review): confirm that the first range in the class starts with the
# Cyrillic letter "а" (U+0430) and not the look-alike Latin "a" (U+0061);
# with a Latin start the range would span U+0061..U+044F and let through
# many more characters than intended.
pattern = re.compile(r"[^a-яА-ЯёЁa-zA-Z0-9\.\,\!\?\:\;\—\«\»\s]")
cleaned = pattern.sub("", generated)
print( cleaned )

                                                         оручик   убов уже не молодой армейский служака,, и где которую он не что и эту у — глаза, на радости я что и может то ложитесь. сонные вот озабоченно? лицо никитин, как тени и угрызения ее, себя,, на. человек никакой из, барыня в! что,
