In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Sequence lengths (in tokens) that articles and summaries are padded/truncated to.
MAX_TEXT_LEN, MAX_SUMMARY_LEN = 100, 15

# Network sizes: width of the embedding vectors and of the LSTM state vectors.
EMBEDDING_DIM = 200
LATENT_DIM = 300

In [3]:


# Data preprocessing function
def preprocess_data(summary_path='/content/news_summary.csv',
                    more_path='/content/news_summary_more.csv'):
    """Load the news-summary CSVs and return padded train/validation arrays.

    Both files must provide ``text`` and ``headlines`` columns. The headline is
    used as the target summary and is wrapped in the ``sostok`` / ``eostok``
    markers that ``generate_summary`` looks up in the summary vocabulary.

    Args:
        summary_path: CSV whose first six columns include ``text`` and
            ``headlines``.
        more_path: CSV whose first two columns are ``headlines`` and ``text``.

    Returns:
        ``(x_tr, x_val, y_tr, y_val, x_tokenizer, y_tokenizer)`` where the
        ``x_*`` arrays have width ``MAX_TEXT_LEN``, the ``y_*`` arrays have
        width ``MAX_SUMMARY_LEN``, and the tokenizers are fitted on the full
        corpus (text and summaries respectively).
    """
    # Load data
    summary = pd.read_csv(summary_path, encoding='iso-8859-1')
    raw = pd.read_csv(more_path, encoding='iso-8859-1')

    # Stack the text/headline pairs of both files into one frame
    pre1 = raw.iloc[:, 0:2].copy()
    pre2 = summary.iloc[:, 0:6].copy()
    pre = pd.DataFrame()
    pre['text'] = pd.concat([pre1['text'], pre2['text']], ignore_index=True)
    pre['summary'] = pd.concat([pre1['headlines'], pre2['headlines']], ignore_index=True)

    # word_tokenize() cannot handle NaN, so drop incomplete pairs up front.
    pre = pre.dropna(subset=['text', 'summary']).reset_index(drop=True)

    # Clean text using NLTK
    pre['cleaned_text'] = pre['text'].apply(lambda x: ' '.join(word_tokenize(x)))
    # The decoder is trained with teacher forcing (input y[:, :-1], target
    # y[:, 1:]) and inference starts from 'sostok', so every summary needs
    # explicit start/end markers.
    pre['cleaned_summary'] = pre['summary'].apply(
        lambda x: 'sostok ' + ' '.join(word_tokenize(x)) + ' eostok')

    # Tokenize text and summary
    x_tokenizer = Tokenizer()
    x_tokenizer.fit_on_texts(list(pre['cleaned_text']))
    y_tokenizer = Tokenizer()
    y_tokenizer.fit_on_texts(list(pre['cleaned_summary']))

    # Convert to integer sequences and pad BEFORE splitting: the previous
    # version read x_tr/x_val/y_tr/y_val here, before they were assigned.
    x_seq = x_tokenizer.texts_to_sequences(list(pre['cleaned_text']))
    y_seq = y_tokenizer.texts_to_sequences(list(pre['cleaned_summary']))

    x_all = pad_sequences(x_seq, maxlen=MAX_TEXT_LEN, padding='post')
    # truncating='post' keeps the leading 'sostok' on over-long summaries
    # (the default 'pre' would cut it off); such summaries lose 'eostok'.
    y_all = pad_sequences(y_seq, maxlen=MAX_SUMMARY_LEN, padding='post', truncating='post')

    # Split data
    x_tr, x_val, y_tr, y_val = train_test_split(
        x_all, y_all, test_size=0.1, random_state=42, shuffle=True)

    return x_tr, x_val, y_tr, y_val, x_tokenizer, y_tokenizer


In [4]:
import nltk

# word_tokenize() (used by preprocess_data) needs the Punkt tokenizer data.
# Newer NLTK releases (3.9+) look it up as 'punkt_tab', older ones as 'punkt'.
# nltk.download() reports a failure by returning False rather than raising,
# so requesting both resources works on either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
def build_model(x_voc, y_voc):
    """Build and compile the LSTM encoder-decoder used for summarisation.

    The encoder and decoder bodies were previously elided (``# ...``), which
    left ``decoder_outputs`` undefined. They are filled in here with a plain
    stacked-LSTM encoder and a single-LSTM decoder (no attention).
    NOTE(review): confirm this matches the intended architecture.

    Layer names are pinned to the ones ``generate_summary`` fetches with
    ``model.get_layer`` ('encoder_lstm_3', 'embedding_2', 'lstm_2',
    'time_distributed_1') so inference does not depend on Keras'
    auto-generated, session-dependent names.

    Args:
        x_voc: Size of the article vocabulary (including padding index 0).
        y_voc: Size of the summary vocabulary (including padding index 0).

    Returns:
        A compiled ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and
        producing a softmax over ``y_voc`` for every decoder time step.
    """
    # Encoder
    encoder_inputs = Input(shape=(MAX_TEXT_LEN,))
    # Named explicitly so its auto-generated name can never collide with the
    # decoder embedding's pinned name.
    enc_emb = Embedding(x_voc, EMBEDDING_DIM, trainable=True,
                        name='encoder_embedding')(encoder_inputs)

    # Three stacked LSTMs; the states of the last one seed the decoder.
    encoder_outputs = enc_emb
    for depth in (1, 2, 3):
        encoder_outputs, state_h, state_c = LSTM(
            LATENT_DIM, return_sequences=True, return_state=True,
            name=f'encoder_lstm_{depth}')(encoder_outputs)

    # Decoder
    decoder_inputs = Input(shape=(None,))
    dec_emb_layer = Embedding(y_voc, EMBEDDING_DIM, trainable=True, name='embedding_2')
    dec_emb = dec_emb_layer(decoder_inputs)

    decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='lstm_2')
    decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

    # Per-time-step distribution over the summary vocabulary.
    decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'),
                                    name='time_distributed_1')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Compile the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

    return model


In [6]:
def train_model(model, x_tr, y_tr, x_val, y_val):
    """Fit ``model`` with teacher forcing and early stopping on ``val_loss``.

    The decoder is fed each summary without its last token and is trained to
    predict the same summary without its first token (i.e. shifted by one).

    Returns:
        The object returned by ``model.fit``.
    """
    def _shifted_pair(y):
        # Decoder input drops the final step; the target drops the first step
        # and gains a trailing axis of size 1.
        decoder_in = y[:, :-1]
        decoder_target = y.reshape(y.shape[0], y.shape[1], 1)[:, 1:]
        return decoder_in, decoder_target

    early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

    tr_dec_in, tr_target = _shifted_pair(y_tr)
    val_dec_in, val_target = _shifted_pair(y_val)

    return model.fit(
        [x_tr, tr_dec_in],
        tr_target,
        epochs=50,
        callbacks=[early_stop],
        batch_size=128,
        validation_data=([x_val, val_dec_in], val_target),
    )



In [7]:
# Inference function
def generate_summary(model, input_seq, x_tokenizer, y_tokenizer, max_summary_len=None):
    """Greedily decode a summary for one encoded article.

    Args:
        model: Trained seq2seq model. It must contain layers named
            'encoder_lstm_3', 'embedding_2', 'lstm_2' and 'time_distributed_1'.
        input_seq: Padded article token ids; the caller passes shape
            ``(1, MAX_TEXT_LEN)``.
        x_tokenizer: Article tokenizer (unused; kept for interface stability).
        y_tokenizer: Summary tokenizer whose vocabulary contains 'sostok'.
        max_summary_len: Decoding stops once ``max_summary_len - 1`` words have
            been produced. Defaults to ``MAX_SUMMARY_LEN``.

    Returns:
        The decoded words, each preceded by a single space.

    Raises:
        KeyError: If 'sostok' is not in ``y_tokenizer.word_index``.
    """
    if max_summary_len is None:
        max_summary_len = MAX_SUMMARY_LEN

    # Encoder inference model: article -> final LSTM states.
    # assumes 'encoder_lstm_3' was built with return_state=True, i.e. its
    # output is (sequence, state_h, state_c) - TODO confirm against build_model
    _, enc_state_h, enc_state_c = model.get_layer('encoder_lstm_3').output
    encoder_model = Model(inputs=model.input[0], outputs=[enc_state_h, enc_state_c])

    # Decoder inference model: (previous token, states) -> (distribution, states).
    # The former `decoder_hidden_state_input` was never connected to any layer,
    # so it is no longer declared as a model input.
    decoder_state_input_h = Input(shape=(LATENT_DIM,))
    decoder_state_input_c = Input(shape=(LATENT_DIM,))

    dec_emb2 = model.get_layer('embedding_2')(model.input[1])
    decoder_outputs2, state_h2, state_c2 = model.get_layer('lstm_2')(
        dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
    decoder_outputs2 = model.get_layer('time_distributed_1')(decoder_outputs2)

    decoder_model = Model(
        [model.input[1], decoder_state_input_h, decoder_state_input_c],
        [decoder_outputs2, state_h2, state_c2])

    # Encode once; the article does not change between decoding steps.
    h, c = encoder_model.predict(input_seq, verbose=0)

    # Inference starts from the start-of-summary marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = y_tokenizer.word_index['sostok']

    index_to_word = y_tokenizer.index_word
    decoded_words = []
    stop_condition = False
    while not stop_condition:
        # Feed the states returned by the previous step back in; resetting
        # them to zeros each step would discard both the article and the
        # words decoded so far.
        output_tokens, h, c = decoder_model.predict([target_seq, h, c], verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is the padding id and has no entry in index_word.
        sampled_token = index_to_word.get(sampled_token_index)

        if sampled_token is None or sampled_token == 'eostok':
            stop_condition = True
        else:
            decoded_words.append(sampled_token)
            if len(decoded_words) >= (max_summary_len - 1):
                stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ''.join(' ' + word for word in decoded_words)


In [None]:
# Main code
if __name__ == "__main__":
    x_tr, x_val, y_tr, y_val, x_tokenizer, y_tokenizer = preprocess_data()

    # Build and compile the model.
    # The tokenizers are created as Tokenizer() with no num_words, so that
    # attribute is None; the vocabulary size comes from word_index instead.
    # +1 accounts for index 0, which pad_sequences uses for padding.
    x_voc = len(x_tokenizer.word_index) + 1
    y_voc = len(y_tokenizer.word_index) + 1
    model = build_model(x_voc, y_voc)

    # Train the model
    history = train_model(model, x_tr, y_tr, x_val, y_val)

    # Sample inference
    for i in range(5):
        # sequences_to_texts skips ids with no vocabulary entry (the 0 padding).
        review = x_tokenizer.sequences_to_texts([x_tr[i]])[0]
        reference_words = y_tokenizer.sequences_to_texts([y_tr[i]])[0].split()
        # Hide the start/end markers, if present, from the printed reference.
        reference = ' '.join(w for w in reference_words if w not in ('sostok', 'eostok'))

        print("Review:", review)
        print("Original summary:", reference)
        print("Predicted summary:", generate_summary(model, x_tr[i].reshape(1, MAX_TEXT_LEN),
                                                     x_tokenizer, y_tokenizer, MAX_SUMMARY_LEN))
        print("\n")