<a href="https://colab.research.google.com/github/SteveWangzx/CS509GroupWEB/blob/main/news_seq2seq_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Data

In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the NYT articles CSV from the Google Drive mounted at /content/drive.
df=pd.read_csv('/content/drive/MyDrive/NYT_Dataset.csv')

Data Preprocess

In [None]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# clean text

import re
import string
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
nltk.download('omw-1.4')
nltk.download('wordnet')

# Lazily-filled cache so the stopword list and lemmatizer are built once,
# not on every call (clean_text is applied to every row of three columns).
_CLEAN_TEXT_RESOURCES = {}


def clean_text(text):
    """Normalise one piece of text and wrap it in sequence markers.

    Steps: coerce to str, drop [bracketed] spans, $tickers, HTML tags, a
    leading "RT", URLs, '#' signs and emoji; replace every non-letter with a
    space; lower-case; tokenise; drop English stopwords; lemmatise; drop
    tokens of length <= 2.

    Input:
        text: any value; it is converted with str() first.
    Output:
        a single string of the form '<start> tok1 tok2 ... <end>'.
    """
    if not _CLEAN_TEXT_RESOURCES:
        # frozenset gives O(1) membership tests instead of scanning a list.
        _CLEAN_TEXT_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    stopwords_english = _CLEAN_TEXT_RESOURCES['stopwords']
    lemmatizer = _CLEAN_TEXT_RESOURCES['lemmatizer']

    text = str(text)
    # remove [bracketed] spans (raw string: '\[' is an invalid escape otherwise)
    text = re.sub(r'\[[^]]*\]', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # removal of html tags
    # Fix: the result used to be stored in an unused variable, so tags were
    # never removed and tag names leaked into the output as words.
    text = re.sub(r'<.*?>', ' ', text)
    # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove hyperlinks
    # Fix: match only the URL itself; the previous '.*' also deleted all text
    # that followed the link on the same line.
    text = re.sub(r'https?://\S+', '', text)
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    text = re.sub("["u"\U0001F600-\U0001F64F"  # removal of emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", ' ', text)
    # keep ASCII letters only, then lower-case
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text_tokens = word_tokenize(text)

    text_clean = [
        lemmatizer.lemmatize(word)
        for word in text_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]
    # drop very short tokens (length <= 2) after lemmatisation
    text_mod = [token for token in text_clean if len(token) > 2]
    return '<start> ' + ' '.join(text_mod) + ' <end>'

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Keep only the columns the model uses. reset_index() also inserts the old
# index as a leading 'index' column, so rows are [index, title, abstract, keywords].
ddf = df[['title', 'abstract', 'keywords']].reset_index()

# Normalise both free-text columns; each value becomes '<start> ... <end>'.
for text_column in ('title', 'abstract'):
    ddf[text_column] = ddf[text_column].apply(clean_text)

In [None]:
ddf['keywords'][0]

"['Assassinations and Attempted Assassinations', 'Pakistan', 'Bhutto, Benazir', 'Federal Bureau of Investigation', 'United Nations']"

In [None]:
def join_word(text):
  """Flatten a stringified list such as "['a', 'b']" into space-joined text.

  The value is coerced with str(), leading/trailing square brackets are
  stripped, the remainder is split on commas, every single quote is removed
  from each piece, and the pieces are joined with a single space.
  """
  pieces = str(text).strip('][').split(',')
  return ' '.join(piece.replace('\'', '') for piece in pieces)

In [None]:
# Flatten each stringified keyword list, then apply the same cleaning as the text columns.
ddf['keywords'] = ddf['keywords'].apply(join_word).apply(clean_text)

In [None]:
ddf['keywords'][0]

'<start> assassination attempted assassination pakistan bhutto benazir federal bureau investigation united nation <end>'

In [None]:
!pip install tensorflow-addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 23.0 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.19.0


In [None]:
import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [None]:
def tokenize(lang):
  """Fit a Keras Tokenizer on a text column and return padded id sequences.

  Args:
    lang: pandas Series of texts; every value is converted with str().

  Returns:
    (tensor, lang_tokenizer): the post-padded integer sequences and the
    fitted tokenizer (no character filtering, '<OOV>' as the OOV token).
  """
  texts = lang.apply(str)
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
  lang_tokenizer.fit_on_texts(texts)
  sequences = lang_tokenizer.texts_to_sequences(texts)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
  return padded, lang_tokenizer

In [None]:
# Each column gets its own tokenizer (and therefore its own vocabulary / id space):
# abstracts are the model input, titles are the target, keywords feed the key encoder.
input_tensor, inp_lang_tokenizer = tokenize(ddf['abstract'])
target_tensor, tar_lang_tokenizer = tokenize(ddf['title'])
key_tensor, key_lang_tokenizer = tokenize(ddf['keywords'])

In [None]:
# Fix: seed the 80/20 split so the train/test partition (and every number
# computed from it) is reproducible across kernel restarts.
SPLIT_SEED = 42
input_tensor_train, input_tensor_test, target_tensor_train, target_tensor_test,key_tensor_train,key_tensor_test = \
train_test_split(input_tensor, target_tensor, key_tensor, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
print(input_tensor_train.shape, key_tensor_train.shape, target_tensor_train.shape)

(85204, 132) (85204, 175) (85204, 17)


In [None]:
# Shuffle-buffer size used for the training dataset.
BUFFER_SIZE = 32000
# Examples per batch; the encoders/decoder are constructed with this fixed batch size.
BATCH_SIZE = 64

Create Datasets for Training and Testing

In [None]:
# Training pipeline: (abstract ids, keyword ids, title ids) triples, shuffled
# and grouped into full batches only.
train_slices = (input_tensor_train, key_tensor_train, target_tensor_train)
train_dataset = (
    tf.data.Dataset.from_tensor_slices(train_slices)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Test pipeline: same triple layout, kept in order (no shuffling).
test_slices = (input_tensor_test, key_tensor_test, target_tensor_test)
test_dataset = (
    tf.data.Dataset.from_tensor_slices(test_slices)
    .batch(BATCH_SIZE, drop_remainder=True)
)

Input Example

In [None]:
# Pull a single batch from the training dataset; its shapes are reused below
# to derive the padded sequence lengths and to smoke-test the model parts.
example_input_batch, example_key_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_key_batch.shape, example_target_batch.shape

(TensorShape([64, 132]), TensorShape([64, 175]), TensorShape([64, 17]))

In [None]:
# Vocabulary sizes; +1 leaves room for id 0, which pad_sequences uses as padding.
inp_vocab_size = len(inp_lang_tokenizer.word_index)+1
tar_vocab_size = len(tar_lang_tokenizer.word_index)+1
# Padded sequence lengths, read off the example batch.
max_length_input = example_input_batch.shape[1]
max_length_key = example_key_batch.shape[1]
max_length_output = example_target_batch.shape[1]

embedding_dim = 256
units = 512
# Fix: the chained assignment `steps_per_epoch = num_examples=30000` set the
# number of steps to 30000, far more than the batches in one epoch, so the
# per-epoch average loss was divided by the wrong count. Derive both values
# from the actual training set.
num_examples = len(input_tensor_train)
steps_per_epoch = num_examples // BATCH_SIZE

In [None]:
max_length_input, max_length_output, max_length_key, inp_vocab_size, tar_vocab_size

(132, 17, 175, 43658, 25870)

Encoder

In [None]:
class Encoder(tf.keras.Model):
  """Embedding layer followed by one LSTM that returns the full sequence and its final state."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    """Create the embedding and LSTM layers.

    Args:
      vocab_size: number of rows in the embedding table.
      embedding_dim: embedding vector size.
      enc_units: LSTM hidden size.
      batch_sz: batch size used when building the zero initial state.
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # LSTM layer: emits per-step outputs plus the final (h, c) state.
    self.lstm_layer = tf.keras.layers.LSTM(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x, hidden):
    """Embed token ids `x` and run the LSTM from `hidden`; returns (output, h, c)."""
    embedded = self.embedding(x)
    output, h, c = self.lstm_layer(embedded, initial_state=hidden)
    return output, h, c

  def initialize_hidden_state(self):
    """Return an all-zero [h, c] pair, each of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return [tf.zeros(state_shape), tf.zeros(state_shape)]

Encoder Test Input

In [None]:
## Test Encoder Stack

encoder = Encoder(inp_vocab_size, embedding_dim, units, BATCH_SIZE)

# Run the example abstract batch through the encoder from an all-zero state
# and report the resulting shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print(f'Encoder output shape: (batch size, sequence length, units) {sample_output.shape}')
print(f'Encoder h vecotr shape: (batch size, units) {sample_h.shape}')
print(f'Encoder c vector shape: (batch size, units) {sample_c.shape}')


Encoder output shape: (batch size, sequence length, units) (64, 132, 512)
Encoder h vecotr shape: (batch size, units) (64, 512)
Encoder c vector shape: (batch size, units) (64, 512)


Encoder Test Key

In [None]:
## Test Encoder Stack (keyword encoder)

# Fix: keyword ids are produced by key_lang_tokenizer, so the embedding table
# must be sized from that tokenizer's vocabulary. Sizing it with
# inp_vocab_size only works by accident when the keyword vocabulary happens
# to be no larger than the abstract vocabulary.
# The abstract `encoder` built in the previous cell is reused as-is; it is no
# longer re-instantiated here.
key_vocab_size = len(key_lang_tokenizer.word_index)+1
key_encoder = Encoder(key_vocab_size, embedding_dim, units, BATCH_SIZE)

# sample input (zero state taken from the key encoder itself)
sample_key_hidden = key_encoder.initialize_hidden_state()
sample_key_output, sample_key_h, sample_key_c = key_encoder(example_key_batch, sample_key_hidden)
print ('Encoder output key shape: (batch size, sequence length, units) {}'.format(sample_key_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_key_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_key_c.shape))

Encoder output key shape: (batch size, sequence length, units) (64, 175, 512)
Encoder h vecotr shape: (batch size, units) (64, 512)
Encoder c vector shape: (batch size, units) (64, 512)


Cross Attention

In [None]:
class CrossAttention(tf.keras.layers.Layer):
  """Two-head attention in which the keyword encoding queries the abstract encoding."""

  def __init__(self, units, **kwargs):
    """Build the attention layer; extra kwargs are forwarded to MultiHeadAttention."""
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=units, **kwargs)
    # Created but not applied in call() (normalisation is currently disabled).
    self.layernorm = tf.keras.layers.LayerNormalization()

  def call(self, key_word, context):
    """Attend from `key_word` (query) over `context` (value) and return the attention output."""
    attended, head_scores = self.mha(
        query=key_word,
        value=context,
        return_attention_scores=True)

    # Cache the scores, averaged over axis 1, for plotting later.
    self.last_attention_weights = tf.reduce_mean(head_scores, axis=1)

    return attended

Decoder With Two Attention Layers: Self & Cross

Cross -> Self

In [None]:
class Decoder(tf.keras.Model):
  """Attention-based LSTM decoder built on TensorFlow Addons seq2seq.

  Holds a CrossAttention layer (keywords attending over the encoded abstract),
  an LSTMCell wrapped in a tfa AttentionWrapper, and a BasicDecoder driven by
  a TrainingSampler. Reads the notebook-level globals max_length_input and
  max_length_output.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
    """Create all sub-layers.

    Args:
      vocab_size: target vocabulary size (embedding rows and output logits).
      embedding_dim: target embedding size.
      dec_units: LSTM cell size, attention size and cross-attention key_dim.
      batch_sz: fixed batch size used for sequence-length lists.
      attention_type: 'bahdanau' selects BahdanauAttention; anything else
        selects LuongAttention.
    """
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.attention_type = attention_type
    
    # Embedding Layer
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    
    # Cross Attention Layer
    self.cross_attn = CrossAttention(dec_units)

    # Cross Attention Rnn (its only call site, in run_cross_attn, is commented out)
    self.cross_rnn = tf.keras.layers.LSTM(self.dec_units)

    # Final Dense layer producing vocabulary logits (softmax is applied on top of these)
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Define the fundamental cell for decoder recurrent structure
    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)

    # Sampler used by the training-time BasicDecoder
    self.sampler = tfa.seq2seq.sampler.TrainingSampler()

    # Create attention mechanism with memory = None; the memory is supplied
    # later through attention_mechanism.setup_memory(...).
    # NOTE(review): the sequence lengths here use max_length_input, while the
    # memory set elsewhere in this notebook is the cross-attention output whose
    # query is the keyword encoding — confirm which length (if any) takes effect.
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                None, self.batch_sz*[max_length_input], self.attention_type)

    # Wrap attention mechanism with the fundamental rnn cell of decoder
    self.rnn_cell = self.build_rnn_cell(batch_sz)

    # Define the decoder with respect to fundamental rnn cell
    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

    
  def build_rnn_cell(self, batch_sz):
    """Wrap the LSTM cell and attention mechanism in an AttentionWrapper.

    `batch_sz` is accepted but not used by this method.
    """
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
    return rnn_cell

  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
    """Return a Bahdanau or Luong attention mechanism.

    Args:
      dec_units: passed as `units` to the attention mechanism.
      memory: passed through as `memory` (None at construction time).
      memory_sequence_length: passed through as `memory_sequence_length`.
      attention_type: 'bahdanau' for BahdanauAttention, otherwise LuongAttention.
    """
    # ------------- #
    # attention_type: Which sort of attention (Bahdanau, Luong)
    # dec_units: final dimension of attention outputs 
    # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
    # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

    if(attention_type=='bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

  def build_initial_state(self, batch_sz, encoder_state, Dtype):
    """Build the AttentionWrapper initial state, with its cell state replaced by `encoder_state`."""
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return decoder_initial_state


  def run_cross_attn(self, context, key_word):
    """Run cross attention with `key_word` as the query and `context` as the value."""
    x = self.cross_attn(key_word, context)
    # Disabled: optional LSTM over the cross-attention output.
    # x = self.cross_rnn(x)
    return x


  def call(self, inputs, initial_state):
    """Embed `inputs` and run the BasicDecoder for max_length_output-1 steps per example.

    Returns the decoder's output structure (first element of the BasicDecoder result).
    """
    x = self.embedding(inputs)
    outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
    return outputs


Self -> Cross

In [None]:
# Test decoder stack

# The decoder works on the title (target) vocabulary.
decoder = Decoder(tar_vocab_size, embedding_dim, units, BATCH_SIZE, 'luong')
# Random stand-in for a batch of decoder inputs, used only to check output shapes.
struct_output = tf.random.uniform((BATCH_SIZE, max_length_output))

In [None]:
# Cross attention between the sample abstract encoding and the sample keyword encoding.
x = decoder.run_cross_attn(sample_output, sample_key_output)


# Use the cross-attention output as the attention memory, and start the
# decoder from the abstract encoder's final (h, c) state.
decoder.attention_mechanism.setup_memory(x)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(struct_output, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (64, 16, 25870)


Define Optimizer & Loss

In [None]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
  """Sparse cross-entropy averaged over non-padding target tokens.

  Args:
    real: integer target ids, shape (BATCH_SIZE, T); id 0 marks padding.
    pred: logits, shape (BATCH_SIZE, T, tar_vocab_size).

  Returns:
    Scalar tensor: the summed per-token loss of the non-padding positions
    divided by the number of non-padding positions (0 if there are none).
  """
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  loss = cross_entropy(y_true=real, y_pred=pred)
  # 1.0 for real tokens, 0.0 for padding (id 0).
  mask = tf.cast(tf.math.not_equal(real, 0), dtype=loss.dtype)
  # Fix: the previous tf.reduce_mean divided by every position, padding
  # included, so the reported loss shrank with the amount of padding in the
  # batch. Normalise by the count of real tokens instead; divide_no_nan
  # guards the degenerate all-padding batch.
  return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

In [None]:
# Checkpoints are written under ./training_checkpoints with the file prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer and all three model parts so they are saved together.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                encoder=encoder, key_encoder=key_encoder, decoder=decoder)

Train Operation in One Iteration

In [None]:
@tf.function
def train_step(inp, key, targ, enc_hidden, enc_key_hidden):
  """Run one optimisation step on a batch and return its loss.

  Args:
    inp: abstract token ids for the batch.
    key: keyword token ids for the batch.
    targ: title token ids for the batch (with <start>/<end> markers).
    enc_hidden: initial [h, c] state for the abstract encoder.
    enc_key_hidden: initial [h, c] state for the keyword encoder.

  Returns:
    Scalar loss tensor for this batch.
  """
  with tf.GradientTape() as tape:
    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)
    enc_key, _, _ = key_encoder(key, enc_key_hidden)

    # Teacher forcing: the decoder sees tokens [0, T-1) and predicts tokens [1, T).
    dec_input = targ[ : , :-1 ] # Ignore <end> token
    real = targ[ : , 1: ]         # ignore <start> token

    # Cross Attention Layer Call
    cross_output = decoder.run_cross_attn(enc_output, enc_key)

    # Set the AttentionMechanism memory to the cross-attention output
    decoder.attention_mechanism.setup_memory(cross_output)

    # Create AttentionWrapperState as initial_state for decoder
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
    pred = decoder(dec_input, decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real, logits)

  # Fix: key_encoder's weights were missing from this list, so the keyword
  # encoder stayed at its random initialisation for the whole training run.
  variables = (encoder.trainable_variables
               + key_encoder.trainable_variables
               + decoder.trainable_variables)
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

Train

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # Both encoders start every batch from an all-zero state.
  enc_hidden = encoder.initialize_hidden_state()
  enc_key_hidden = key_encoder.initialize_hidden_state()
  total_loss = 0
  # Count the batches that actually run: take(steps_per_epoch) yields fewer
  # than steps_per_epoch items when the dataset is shorter.
  num_batches = 0

  for (batch, (inp, key, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, key, targ, enc_hidden, enc_key_hidden)
    total_loss += batch_loss
    num_batches += 1

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  # Fix: average over the batches actually processed. Dividing by
  # steps_per_epoch under-reported the epoch loss whenever that constant
  # exceeded the real number of batches.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / max(num_batches, 1)))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.3871
Epoch 1 Batch 100 Loss 0.3409
Epoch 1 Batch 200 Loss 0.4149
Epoch 1 Batch 300 Loss 0.3485
Epoch 1 Batch 400 Loss 0.3850
Epoch 1 Batch 500 Loss 0.4308
Epoch 1 Batch 600 Loss 0.4604
Epoch 1 Batch 700 Loss 0.4702
Epoch 1 Batch 800 Loss 0.5298
Epoch 1 Batch 900 Loss 0.4277
Epoch 1 Batch 1000 Loss 0.4808
Epoch 1 Batch 1100 Loss 0.3997
Epoch 1 Batch 1200 Loss 0.5857
Epoch 1 Batch 1300 Loss 0.5287
Epoch 1 Loss 0.0198
Time taken for 1 epoch 153.79948472976685 sec

Epoch 2 Batch 0 Loss 0.4255
Epoch 2 Batch 100 Loss 0.3585
Epoch 2 Batch 200 Loss 0.4135
Epoch 2 Batch 300 Loss 0.3188
Epoch 2 Batch 400 Loss 0.3802
Epoch 2 Batch 500 Loss 0.3382
Epoch 2 Batch 600 Loss 0.3875
Epoch 2 Batch 700 Loss 0.3150
Epoch 2 Batch 800 Loss 0.4583
Epoch 2 Batch 900 Loss 0.4248
Epoch 2 Batch 1000 Loss 0.4337
Epoch 2 Batch 1100 Loss 0.4140
Epoch 2 Batch 1200 Loss 0.4594
Epoch 2 Batch 1300 Loss 0.5458
Epoch 2 Loss 0.0185
Time taken for 1 epoch 153.88943195343018 sec

Epoch 3 Batch 0 Loss 0

In [None]:
# Evaluation sample: the first 1000 cleaned rows of ddf.
# NOTE(review): these rows come from the full frame that was tokenised and
# then randomly split 80/20 above, so they are not restricted to the held-out
# test split — scores computed on them may include training examples. Confirm
# whether evaluation should use the test split instead.
valid = ddf[:1000]

# Abstracts and titles as arrays (not referenced by the cells visible below).
x = valid['abstract'].to_numpy()
y = valid['title'].to_numpy()
# Row layout of gene_valid: [index, title, abstract, keywords].
gene_valid = valid.to_numpy()
gene_valid[0]

array([0,
       '<start> reversal pakistan welcome outside help inquiry bhutto <end>',
       '<start> pakistan ambassador said government would endorse separate inquiry modeled one carried assassination rafik hariri lebanon <end>',
       '<start> assassination attempted assassination pakistan bhutto benazir federal bureau investigation united nation <end>'],
      dtype=object)

In [None]:
# Evaluation

def evaluate_sentence(sentence, key_word):
  """Greedy-decode a title from one cleaned abstract and its cleaned keywords.

  Args:
    sentence: space-separated abstract text, encoded with inp_lang_tokenizer.
    key_word: space-separated keyword text, encoded with key_lang_tokenizer.

  Returns:
    NumPy array of sampled target-token ids (outputs.sample_id) for the
    single-example batch.
  """
  # Fix: texts_to_sequences maps unseen words to the tokenizer's <OOV> id and
  # skips empty tokens; indexing word_index directly raised KeyError on both.
  inputs = inp_lang_tokenizer.texts_to_sequences([sentence])
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_length_input,
                                                          padding='post')
  inputs = tf.convert_to_tensor(inputs)
  inference_batch_size = inputs.shape[0]

  key_inputs = key_lang_tokenizer.texts_to_sequences([key_word])
  key_inputs = tf.keras.preprocessing.sequence.pad_sequences(key_inputs,
                                                          maxlen=max_length_key,
                                                          padding='post')
  key_inputs = tf.convert_to_tensor(key_inputs)
  key_batch_size = key_inputs.shape[0]

  # Encode abstract and keywords, each from an all-zero initial state.
  enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
  enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

  key_start_state = [tf.zeros((key_batch_size, units)), tf.zeros((key_batch_size,units))]
  enc_key_out, _, _ = key_encoder(key_inputs, key_start_state)

  start_tokens = tf.fill([inference_batch_size], tar_lang_tokenizer.word_index['<start>'])
  end_token = tar_lang_tokenizer.word_index['<end>']

  greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

  # Instantiate BasicDecoder object around the trained cell and output layer.
  # Fix: cap the number of decoding steps so generation terminates even when
  # the model never emits <end>.
  decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler,
                                              output_layer=decoder.fc,
                                              maximum_iterations=max_length_output)

  # Cross Attention
  cross_output = decoder.run_cross_attn(enc_out, enc_key_out)

  # Setup Memory in decoder stack
  decoder.attention_mechanism.setup_memory(cross_output)

  # set decoder_initial_state
  decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)

  # BasicDecoder wraps only the decoder's rnn cell, so its per-step inputs must
  # be embeddings. GreedyEmbeddingSampler performs that lookup itself; it only
  # needs the embedding matrix, passed as the first argument of the call.
  decoder_embedding_matrix = decoder.embedding.variables[0]

  outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
  return outputs.sample_id.numpy()


In [None]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge
import numpy as np
rouge = Rouge()

def rouge_score(title, sentence, key_word):
  """Score a generated title against the reference title.

  Decodes a title for (sentence, key_word), strips the '<start> ' / ' <end>'
  markers from both the prediction and `title`, computes ROUGE, and returns
  the largest of the precision, F and recall values of ROUGE-1 and ROUGE-2.
  """
  decoded = tar_lang_tokenizer.sequences_to_texts(evaluate_sentence(sentence, key_word))
  reference = title.replace('<start> ', '').replace(' <end>', '')
  hypothesis = decoded[0].replace('<start> ', '').replace(' <end>', '')
  rouge_result = rouge.get_scores(hypothesis, reference)[0]
  return max(rouge_result[metric][stat]
             for metric in ('rouge-1', 'rouge-2')
             for stat in ('p', 'f', 'r'))
  

# Score every validation row whose keyword string is non-empty once its
# '<start> ' / ' <end>' markers are removed.
scores = []
for rec in gene_valid:
  # rec layout: [index, title, abstract, keywords]
  bare_keywords = rec[3].replace('<start> ', '').replace(' <end>', '')
  if bare_keywords != '':
    scores.append(rouge_score(rec[1], rec[2], rec[3]))

print(np.mean(scores))

0.6350847668910496


In [None]:
scores[5]

[{'rouge-1': {'r': 0.16666666666666666, 'p': 0.2, 'f': 0.18181817685950424},
  'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0},
  'rouge-l': {'r': 0.16666666666666666, 'p': 0.2, 'f': 0.18181817685950424}}]