# TEXT SUMMARIZATION

In [9]:
!pip install attention
!pip install keras-self-attention



In [10]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
# Change import statement to use tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
# Import AttentionLayer from keras_self_attention
# from keras_self_attention import SeqSelfAttention as AttentionLayer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

## Train-Test Split and Prepare the Tokenizer

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# from sklearn.model_selection import train_test_split

# x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']), np.array(df['summary']),
#                                        test_size=0.1, random_state=0, shuffle=True)

max_text_len=30
max_summary_len=8

df_train = pd.read_csv('/content/drive/MyDrive/nlp/train.csv')
df_val = pd.read_csv('/content/drive/MyDrive/nlp/validation.csv')
df_test = pd.read_csv('/content/drive/MyDrive/nlp/test.csv')

x_tr_text = df_train['article']     # source text (encoder input)
y_tr_text = df_train['highlights']  # target summary (decoder input/output)

x_val_text = df_val['article']
y_val_text = df_val['highlights']

def text_cleaner(text):
    new_string = text.lower()
    new_string = re.sub(r'\([^)]*\)', '', new_string)
    new_string = re.sub('"','', new_string)
    new_string = re.sub(r"'s\b","",new_string)
    new_string = re.sub("[^a-zA-Z]", " ", new_string)
    new_string = re.sub('[m]{2,}', 'mm', new_string)
    new_string = re.sub('\s+', ' ', new_string)
    return new_string

x_tr_text = x_tr_text.apply(text_cleaner)
x_val_text = x_val_text.apply(text_cleaner)

y_tr_text = y_tr_text.apply(lambda x: 'sostok ' + x + ' eostok')
y_val_text = y_val_text.apply(lambda x: 'sostok ' + x + ' eostok')

x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_tr_text))

## Rarewords and its Coverage on Reviews column

The threshold is taken as 4 which means word whose count is below 4 is considered as a **rare word**

In [13]:
thresh=4

cnt=0
tot_cnt=0
freq=0
tot_freq=0

for key,value in x_tokenizer.word_counts.items():
    tot_cnt=tot_cnt+1
    tot_freq=tot_freq+value
    if(value<thresh):
        cnt=cnt+1
        freq=freq+value

print("% of rare words in vocabulary:", (cnt/tot_cnt)*100)
print("Total Coverage of rare words:", (freq/tot_freq)*100)

% of rare words in vocabulary: 56.46290098581609
Total Coverage of rare words: 0.19136069968023398


In [14]:
print(cnt),print(tot_cnt)

248002
439230


(None, None)

In [15]:
print(freq),print(tot_freq)

371202
193980269


(None, None)

NOTE:

* **tot_cnt** gives the size of vocabulary (which means every unique words in the text)

*   **cnt** gives me the no. of rare words whose count falls below threshold

*  **tot_cnt - cnt** gives me the top most common words

Let us define the tokenizer with **top most common words** for reviews.

## Reviews Tokenizer

In [16]:
# prepare a tokenizer for reviews on training data

x_tokenizer = Tokenizer(num_words=tot_cnt-cnt) # num_words: the maximum number of words to keep, based on word frequency.
x_tokenizer.fit_on_texts(list(x_tr_text))

#convert text sequences into integer sequences
x_tr_seq    =   x_tokenizer.texts_to_sequences(x_tr_text)
x_val_seq   =   x_tokenizer.texts_to_sequences(x_val_text)

#padding zero upto maximum length
x_tr    =   pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
x_val   =   pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

#size of vocabulary ( +1 for padding token)
x_voc   =  x_tokenizer.num_words + 1

In [17]:
x_voc

191229

## Summary Tokenizer

In [18]:
#prepare a tokenizer for reviews on training data

y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_tr_text))

## Rarewords and its Coverage on the summary column

The threshold is taken as 6 which means word whose count is below 6 is considered as a **rare word**

In [19]:
thresh=6

cnt=0
tot_cnt=0
freq=0
tot_freq=0

for key,value in y_tokenizer.word_counts.items():
    tot_cnt=tot_cnt+1
    tot_freq=tot_freq+value
    if(value<thresh):
        cnt=cnt+1
        freq=freq+value

print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
print("Total Coverage of rare words:",(freq/tot_freq)*100)

% of rare words in vocabulary: 75.44169841566351
Total Coverage of rare words: 2.0002627121592447


In [20]:
print(cnt),print(tot_cnt)

173660
230191


(None, None)

In [21]:
print(freq),print(tot_freq)

292069
14601532


(None, None)

Let us define the tokenizer with **top most common words for summary**.

In [22]:
#prepare a tokenizer for reviews on training data
y_tokenizer = Tokenizer(num_words=tot_cnt-cnt)
y_tokenizer.fit_on_texts(list(y_tr_text))

#convert text sequences into integer sequences
y_tr_seq    =   y_tokenizer.texts_to_sequences(y_tr_text)
y_val_seq   =   y_tokenizer.texts_to_sequences(y_val_text)

#padding zero upto maximum length
y_tr    =   pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
y_val   =   pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

#size of vocabulary
y_voc  =   y_tokenizer.num_words +1

In [23]:
#deleting the rows that contain only START and END tokens

In [24]:
ind=[]
for i in range(len(y_tr)):
    cnt=0
    for j in y_tr[i]:
        if j!=0:
            cnt=cnt+1
    if(cnt==2):
        ind.append(i)

y_tr=np.delete(y_tr,ind, axis=0)
x_tr=np.delete(x_tr,ind, axis=0)

In [25]:
ind=[]
for i in range(len(y_val)):
    cnt=0
    for j in y_val[i]:
        if j!=0:
            cnt=cnt+1
    if(cnt==2):
        ind.append(i)

y_val=np.delete(y_val,ind, axis=0)
x_val=np.delete(x_val,ind, axis=0)

# Abstractive Text Summarization - Model building

We are finally at the model building part. But before we do that, we need to familiarize ourselves with a few terms which are required prior to building the model.

**Return Sequences = True**: When the return sequences parameter is set to True, LSTM produces the hidden state and cell state for every timestep

**Return State = True**: When return state = True, LSTM produces the hidden state and cell state of the last timestep only

**Initial State**: This is used to initialize the internal states of the LSTM for the first timestep

**Stacked LSTM**: Stacked LSTM has multiple layers of LSTM stacked on top of each other.
This leads to a better representation of the sequence.

Here, we are building a 3 stacked LSTM for the encoder:

Sparse categorical cross-entropy as the loss function since it converts the integer sequence to a one-hot vector on the fly. This overcomes any memory issues.

In [26]:
# NO ATTENTION LAYER

latent_dim = 300
embedding_dim = 100

# Encoder
encoder_inputs = Input(shape=(max_text_len,))
enc_emb = Embedding(x_voc, embedding_dim, trainable=True)(encoder_inputs)

# Encoder LSTM stack
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output1, _, _ = encoder_lstm1(enc_emb)

encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, _, _ = encoder_lstm2(encoder_output1)

encoder_lstm3 = LSTM(latent_dim, return_sequences=False, return_state=True, dropout=0.4, recurrent_dropout=0.4)
_, state_h, state_c = encoder_lstm3(encoder_output2)

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Output layer
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()


In [27]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

EarlyStopping monitors the validation loss (val_loss). Our model will stop training once the validation loss increases.

In [28]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

Fit the model

In [29]:
history=model.fit([x_tr, y_tr[:,:-1]],
                  y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:,1:],
                  epochs=2,
                  callbacks=[es],
                  batch_size=128,
                  validation_data=([x_val,y_val[:,:-1]], y_val.reshape(y_val.shape[0],y_val.shape[1], 1)[:,1:])
                 )

Epoch 1/2
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m893s[0m 389ms/step - loss: 7.0191 - val_loss: 6.5180
Epoch 2/2
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m876s[0m 390ms/step - loss: 6.5126 - val_loss: 6.1883


Next, let’s build the dictionary to convert the index to word for target and source vocabulary:

In [30]:
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

## Inference

Set up the inference for the encoder and decoder:

In [31]:
# Encoder model (no attention, so only states are needed)
encoder_model = Model(inputs=encoder_inputs, outputs=[state_h, state_c])

# Decoder setup
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_inputs_single = Input(shape=(1,))  # Shape=(batch_size, 1) for one timestep at a time
dec_emb2 = dec_emb_layer(decoder_inputs_single)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs_single, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2, state_h2, state_c2]
)


We are defining a function below which is the implementation of the inference process

In [32]:
# def decode_sequence(input_seq):
#     # Encode the input as state vectors
#     e_h, e_c = encoder_model.predict(input_seq)

#     # Generate empty target sequence of length 1
#     target_seq = np.zeros((1, 1))

#     # Populate the first word of target sequence with the start token
#     start_token_idx = target_word_index.get('sostok')
#     if start_token_idx is None:
#         raise ValueError("'sostok' token not found in target_word_index.")
#     target_seq[0, 0] = start_token_idx

#     stop_condition = False
#     decoded_sentence = ''
#     while not stop_condition:
#         output_tokens, h, c = decoder_model.predict([target_seq, e_h, e_c])

#         # Sample a token
#         sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         sampled_token = reverse_target_word_index.get(sampled_token_index, '')

#         if sampled_token != 'eostok':
#             decoded_sentence += ' ' + sampled_token

#         # Exit condition
#         if (sampled_token == 'eostok' or len(decoded_sentence.split()) >= (max_summary_len - 1)):
#             stop_condition = True

#         # Update the target sequence and states
#         target_seq = np.zeros((1, 1))
#         target_seq[0, 0] = sampled_token_index
#         e_h, e_c = h, c

#     return decoded_sentence.strip()


In [33]:
def decode_sequence(input_seq):
    # Encode the input as state vectors
    e_h, e_c = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Start with a default word (e.g., most frequent word or index 1)
    target_seq[0, 0] = 1  # Use the <start> token index or index 1 if not using special tokens

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index.get(sampled_token_index, '')

        decoded_sentence += ' ' + sampled_token

        # Exit condition
        if len(decoded_sentence.split()) >= (max_summary_len - 1):
            stop_condition = True

        # Update the target sequence and states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return decoded_sentence.strip()


Let us define the functions to convert an integer sequence to a word sequence for summary as well as the reviews:

In [34]:
def seq2summary(input_seq):
    return ' '.join([reverse_target_word_index.get(i, '') for i in input_seq if i != 0]).strip()


In [35]:
# def seq2summary(input_seq):
#     newString = ''
#     sostok_idx = target_word_index.get('sostok')
#     eostok_idx = target_word_index.get('eostok')
#     for i in input_seq:
#         if i != 0 and i != sostok_idx and i != eostok_idx:
#             word = reverse_target_word_index.get(i, '')
#             if word:
#                 newString += word + ' '
#     return newString.strip()

In [36]:
def seq2text(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0:
            word = reverse_source_word_index.get(i, '')
            if word:
                newString += word + ' '
    return newString.strip()


Here are a few summaries generated by the model:

In [37]:
for i in range(0, 10):
    print("Review:",seq2text(x_tr[i]))
    print("Original summary:",seq2summary(y_tr[i]))
    print("Predicted summary:", decode_sequence(x_tr[i].reshape(1,max_text_len)))
    print("\n")

Review: in italy last month symptoms of hepatitis a include fever tiredness loss of appetite nausea and abdominal discomfort fargo catholic diocese in north dakota is where the bishop is located
Original summary: forks and jamestown could have been exposed eostok
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 998ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 365ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted summary: time of the time of saturday eostok


Review: specified unlawful activity he is scheduled to appear in federal court in florida on wednesda