# Скачаем датасет для задачи суммаризации статей на английском

In [None]:
import kagglehub

# Download the Daily Mail summarization dataset from Kaggle; the returned
# value is the directory the dataset files live in.
DATASET_HANDLE = "evilspirit05/daily-mail-summarization-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)
print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/daily-mail-summarization-dataset


# Импортируем нужные библиотеки

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.mixed_precision import set_global_policy
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import os

# Fetch the NLTK stop-word corpus that preprocess_text relies on.
nltk.download("stopwords")

# Set the global Keras dtype policy to mixed precision (mixed_float16).
set_global_policy("mixed_float16")

# Загружаем датасет в датафрейм

In [None]:
# Load the article/highlights table from the downloaded dataset directory.
csv_path = f"{path}/article_highlights.csv"
df = pd.read_csv(csv_path, low_memory=False)

# Функция для простой предобработки текста: приводим к нижнему регистру, убираем всё, кроме латинских букв, и удаляем стоп-слова. Большего нам и не нужно — текст уже достаточно чистый

In [None]:
_STOP_WORDS = None  # lazily built cache of NLTK's English stop words


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # stopwords.words() re-reads the corpus on every call, so cache the
        # result instead of rebuilding it for each row of the dataset.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a raw text for tokenization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, whitespace runs are collapsed, and English stop
    words are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Проведем предобработку и убедимся, что все значения в колонке строкового типа (у меня были с этим проблемы)

In [None]:
# Force both text columns to str (non-string values caused problems
# otherwise) and run the cleaning function over every row.
for text_column in ("article", "highlights"):
    df[text_column] = df[text_column].astype(str).apply(preprocess_text)

# Присвоим значения двум переменным, одна из них - инпут, другая - таргет текст

In [None]:
# Articles are the model input; their highlights are the target text.
input_texts, target_texts = (
    df[column].values for column in ("article", "highlights")
)

# Определим параметры будущей модели

In [None]:
# Hyperparameters
max_input_length, max_target_length = 100, 50  # max length of the input / output sequence
vocab_size = 10_000  # vocabulary size
embedding_dim, lstm_units = 256, 512  # embedding dimension / LSTM layer size

# Проведем токенизацию + настроим слои модели

In [None]:
# Tokenize the texts with a single vocabulary shared by articles and highlights.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(np.concatenate([input_texts, target_texts]))

# Convert the texts to sequences of integer word ids.
input_sequences = tokenizer.texts_to_sequences(input_texts)
target_sequences = tokenizer.texts_to_sequences(target_texts)

# Special decoder tokens. texts_to_sequences only emits ids below num_words
# (= vocab_size), so these two ids cannot collide with a real word id.
start_token = vocab_size  # <start>
end_token = vocab_size + 1  # <end>

# Reserve one slot for the special token BEFORE padding: pad_sequences
# truncates from the front by default, which would silently cut <start> off
# every highlight longer than max_target_length - 1 words.
max_target_words = max_target_length - 1
target_sequences_in = [[start_token] + seq[:max_target_words] for seq in target_sequences]  # decoder input
target_sequences_out = [seq[:max_target_words] + [end_token] for seq in target_sequences]  # decoder output

# Padding (id 0 is the padding value)
input_data = pad_sequences(input_sequences, maxlen=max_input_length, padding="post")
target_data_in = pad_sequences(target_sequences_in, maxlen=max_target_length, padding="post")
target_data_out = pad_sequences(target_sequences_out, maxlen=max_target_length, padding="post")

# Encoder
encoder_inputs = Input(shape=(None,))
# mask_zero=True makes the LSTM skip padded steps, so the final state handed
# to the decoder comes from the last real token rather than from padding.
encoder_embedding = Embedding(input_dim=vocab_size + 2, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True, dropout=0.2, recurrent_dropout=0.2, recurrent_regularizer=l2(0.01))
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
# Masking here propagates to the output, so padded target positions no longer
# dominate the loss and the reported accuracy.
decoder_embedding = Embedding(input_dim=vocab_size + 2, output_dim=embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.2, recurrent_regularizer=l2(0.01))
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Under the mixed_float16 policy the final softmax should be computed in
# float32 for numerical stability, hence the explicit dtype.
decoder_dense = Dense(vocab_size + 2, activation="softmax", dtype="float32")
output = decoder_dense(decoder_outputs)

# Компилируем модель

In [None]:
# Build the training model: (encoder tokens, decoder tokens) -> per-step
# probability distribution over the vocabulary.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)

# Compile with Adam and sparse (integer-label) cross-entropy.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=adam,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)



# Обучение модели

In [None]:
# Final preparations.
# NOTE: despite its name this array is not one-hot encoded; it holds the
# integer target ids with a trailing axis of size 1, which is what
# sparse_categorical_crossentropy consumes.
target_data_out_one_hot = np.expand_dims(target_data_out, -1)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# target_data_in is [<start>, w1, w2, ...] and target_data_out is
# [w1, w2, ..., <end>], so the two are ALREADY offset by one step. They must be
# passed unsliced: slicing them again ([:, :-1] vs [:, 1:]) would train the
# decoder to predict the token two positions ahead.
model.fit(
    [input_data, target_data_in],
    target_data_out_one_hot,
    batch_size=128,
    epochs=30,
    validation_split=0.1,
    callbacks=[early_stopping],
)

print("Model training complete.")

Epoch 1/30
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 941ms/step - accuracy: 0.6582 - loss: 10.2635 - val_accuracy: 0.6913 - val_loss: 3.1850
Epoch 2/30
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 888ms/step - accuracy: 0.7185 - loss: 1.6257 - val_accuracy: 0.6981 - val_loss: 2.8905
Epoch 3/30
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 889ms/step - accuracy: 0.7446 - loss: 1.1952 - val_accuracy: 0.7147 - val_loss: 2.9388
Epoch 4/30
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 970ms/step - accuracy: 0.8718 - loss: 0.7995 - val_accuracy: 0.7418 - val_loss: 2.9018
Epoch 5/30
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 884ms/step - accuracy: 0.9419 - loss: 0.4747 - val_accuracy: 0.7580 - val_loss: 2.8454
Epoch 6/30
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 885ms/step - accuracy: 0.9633 - loss: 0.2994 - val_accuracy: 0.7521 - val_loss: 2.8057
Epoch 7/30
[1m58/58

# Код для проверки работы модели. Спойлер: работает она не очень...

In [None]:
# Inference encoder: article tokens -> final LSTM states [h, c].
encoder_model = Model(inputs=model.input[0], outputs=encoder_states)

# Inference decoder: takes the previous step's states as explicit inputs and
# returns the token probabilities together with the updated states, reusing
# the trained decoder layers.
decoder_states_inputs = [Input(shape=(lstm_units,)), Input(shape=(lstm_units,))]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs
decoder_outputs, *decoder_states = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
state_h, state_c = decoder_states
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)


def decode_sequence(input_seq, max_length=None):
    """Greedily decode a summary for one encoded input sequence.

    Args:
        input_seq: Padded token-id array for a single article, as produced by
            pad_sequences (one row).
        max_length: Maximum number of decoding steps. Defaults to
            max_target_length, the longest target seen during training.

    Returns:
        The decoded words joined by single spaces (possibly an empty string).
    """
    if max_length is None:
        max_length = max_target_length

    states_value = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.array([[start_token]], dtype="float32")
    decoded_words = []

    # Bounded loop: without a step limit a model that never predicts <end> or
    # padding would keep decoding forever.
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Stop on <end> or on the padding id.
        if sampled_token_index == end_token or sampled_token_index == 0:
            break

        # tokenizer.index_word is not limited to num_words, so looking up the
        # special ids (>= vocab_size) would return an unrelated real word.
        if sampled_token_index < vocab_size:
            word = tokenizer.index_word.get(sampled_token_index, "")
            if word == "":
                break
            decoded_words.append(word)

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.array([[sampled_token_index]], dtype="float32")
        states_value = [h, c]

    return " ".join(decoded_words)


def summarize_text(text):
    """Generate a summary for a raw piece of text.

    The text is cleaned with preprocess_text first, because the tokenizer and
    the model were fitted on text cleaned that way; feeding raw text would
    present the model with tokens (stop words, fused punctuation) it was never
    trained on.

    Args:
        text: The raw article text.

    Returns:
        The summary string produced by decode_sequence.
    """
    cleaned_text = preprocess_text(str(text))
    input_seq = tokenizer.texts_to_sequences([cleaned_text])
    input_seq = pad_sequences(input_seq, maxlen=max_input_length, padding="post")
    return decode_sequence(input_seq)

# Try the model out on a real news snippet.
text_to_summarize = '''Marco Rubio, the secretary of state, spoke on Tuesday with his Hungarian counterpart, the foreign minister, Péter Szijjártó, and informed him of the move, state department spokesperson Tammy Bruce said in a statement.

“The Secretary informed foreign minister Szijjártó of senior Hungarian official Antal Rogán’s removal from the US Department of the Treasury’s Specially Designated Nationals and Blocked Persons List, noting that continued designation was inconsistent with US foreign policy interests,” Bruce said.

The two also discussed strengthening US-Hungary alignment on critical issues and opportunities for economic cooperation, Bruce said.

Rogán is a close aide of Orbán and has run his cabinet office since 2015.'''  # Replace with the actual text
summary = summarize_text(text_to_summarize)
# Bare expression: displayed as the cell's output when run in a notebook.
summary


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 893ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


'bush london london'

# Люблю RNN... (возможно, я не до конца в ней разобрался, но заставить её нормально работать так и не смог)