In [2]:
!pip install datasets
!pip install py7zr
!pip install rouge
!pip install accelerate -U
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [3]:
from datasets import load_dataset
import re
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import string
import os
import os
import logging
import tensorflow as tf
import time

In [4]:
dataset = load_dataset("samsum")

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
train_df = dataset["train"].to_pandas()

In [None]:
test_df = dataset["test"].to_pandas()

## Data Preprocess

Replacing the names as they dont introduce any bias into the model.

In [None]:
def replace_names_per_row(train_df):
    processed_dialogue = []
    processed_summary = []

    for index, row in train_df.iterrows():
        name_map = {}

        matches = re.findall('([A-Z]\w+):', row['dialogue'])
        for match in matches:
            if match not in name_map:
                name_map[match] = f"Person{len(name_map) + 1}"

        processed_text = row['dialogue']
        for name, person in name_map.items():
            processed_text = processed_text.replace(name, person)
        processed_dialogue.append(processed_text)

        processed_text = row['summary']
        for name, person in name_map.items():
            processed_text = processed_text.replace(name, person)
        processed_summary.append(processed_text)

    train_df['processed_dialogue'] = processed_dialogue
    train_df['processed_summary'] = processed_summary

    return train_df

In [None]:
train_df = replace_names_per_row(train_df)

Changing new lines to "TURN" as they indicate better context switch

In [None]:
train_df['processed_dialogue'] = [x.replace('\r\n', "<TURN>") for x in train_df['processed_dialogue']]

In [None]:
def remove_punctuations_and_special_characters(text):
  """Removes punctuations and special characters from a text, but keeps colons and <> characters."""
  # Define the set of characters to remove
  remove_characters = set(string.punctuation + r'&\'()~`') - set('<>:')

  # Remove the characters from the text
  filtered_text = ''
  for char in text:
    if char not in remove_characters:
      filtered_text += char

  return filtered_text

train_df['processed_dialogue'] = train_df['processed_dialogue'].apply(remove_punctuations_and_special_characters)

In [None]:
# Remove extra white spaces
train_df['processed_dialogue'] = [re.sub('\s\s+', " ", x) for x in train_df['processed_dialogue'] ]

In [None]:
# Replace text to lower characters
train_df['processed_dialogue'] = [x.lower() for x in train_df['processed_dialogue']]
train_df['processed_summary'] = [x.lower() for x in train_df['processed_summary']]

In [None]:
train_df['processed_dialogue'][0]

'person1: i baked cookies do you want some<turn>person2: sure<turn>person1: ill bring you tomorrow :'

## Tokenization

Adding `<start>` and `<end>` tokens to the processed summary

In [None]:
train_df['processed_summary'] = ['<start>' + x + '<end>'  for x in train_df['processed_summary']]

In [None]:
# since < and > from default tokens cannot be removed
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '<unk>'

document_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)
summary_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=filters, oov_token=oov_token)

document_tokenizer.fit_on_texts(train_df['processed_dialogue'])
summary_tokenizer.fit_on_texts(train_df['processed_summary'])

inputs = document_tokenizer.texts_to_sequences(train_df['processed_dialogue'])
targets = summary_tokenizer.texts_to_sequences(train_df['processed_summary'])

In [None]:
# Save the document tokenizer
document_tokenizer_json = document_tokenizer.to_json()
with open('document_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(document_tokenizer_json)

# Save the summary tokenizer
summary_tokenizer_json = summary_tokenizer.to_json()
with open('summary_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(summary_tokenizer_json)


In [None]:
len(inputs)

14732

In [None]:
print('<start>' in summary_tokenizer.word_index)
print('<end>' in summary_tokenizer.word_index)


True
True


In [None]:
summary_tokenizer.texts_to_sequences(["This is a test"])

[[61, 10, 9, 272]]

In [None]:
summary_tokenizer.sequences_to_texts([[61, 10, 9, 272]])

['this is a test']

In [None]:
encoder_vocab_size = len(document_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1
encoder_vocab_size, decoder_vocab_size

(30698, 14884)

In [None]:
# Taking the 90th percentile for padding for dialogue
sequence_lengths = [len(seq) for seq in inputs]
dialogue_max_len_90th_percentile = int(np.percentile(sequence_lengths, 90))
dialogue_max_len_90th_percentile

205

In [None]:
# Taking the 90th percentile for padding for summary
sequence_lengths = [len(seq) for seq in targets]
summary_max_len_90th_percentile = int(np.percentile(sequence_lengths, 90))
summary_max_len_90th_percentile

38

In [None]:
# Padding the data
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=dialogue_max_len_90th_percentile, padding='post', truncating='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=summary_max_len_90th_percentile, padding='post', truncating='post')

In [None]:
inputs = tf.cast(inputs, dtype=tf.int32)
targets = tf.cast(targets, dtype=tf.int32)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(10000).batch(32)

## RNN Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, RNN, SimpleRNNCell
from tensorflow.keras.models import Model

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNNCell, RNN, Dense
from tensorflow.keras.callbacks import EarlyStopping
import os

# model parameters
encoder_vocab_size = 30698
decoder_vocab_size = 14884
dialogue_max_len_90th_percentile = 205

# Encoder
encoder_inputs = Input(shape=(dialogue_max_len_90th_percentile,))
encoder_embedding = Embedding(input_dim=encoder_vocab_size, output_dim=256)(encoder_inputs)
encoder_rnn = RNN(SimpleRNNCell(512), return_state=True)
encoder_outputs, state_h = encoder_rnn(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=decoder_vocab_size, output_dim=256)(decoder_inputs)
decoder_rnn = RNN(SimpleRNNCell(512), return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(decoder_embedding, initial_state=state_h)
decoder_dense = Dense(decoder_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model with EarlyStopping
rnn_hist = model.fit(
    [inputs, targets[:, :-1]],  # encoder input and decoder input (offset by one time step)
    targets[:, 1:],             # decoder target data is one step ahead
    batch_size=32,
    epochs=15,
    validation_split=0.2,
    callbacks=[early_stopping]
)

model_save_path = '/content/drive/MyDrive/Models/RNN_summarization_model.h5'
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# Save the model
model.save(model_save_path)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15


  saving_api.save_model(


### Inference

In [None]:
from tensorflow.keras.models import Model, load_model
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json


# Load the entire model that you saved after training
model_save_path = '/content/drive/MyDrive/Models/RNN_summarization_model.h5'
trained_model = load_model(model_save_path)

# Extract the encoder and decoder from the trained model
encoder_inputs = trained_model.input[0]  # encoder input
encoder_outputs, state_h_enc = trained_model.layers[4].output  # last output of the encoder RNN
encoder_model = Model(encoder_inputs, state_h_enc)

decoder_inputs = trained_model.input[1]  # decoder input
decoder_embedding_layer = trained_model.layers[3]
decoder_embedding = decoder_embedding_layer(decoder_inputs)

decoder_state_input_h = Input(shape=(512,), name='decoder_state_input_h')
decoder_rnn_layer = trained_model.layers[5]
decoder_outputs, state_h_dec = decoder_rnn_layer(decoder_embedding, initial_state=decoder_state_input_h)

decoder_dense_layer = trained_model.layers[6]
decoder_outputs = decoder_dense_layer(decoder_outputs)
decoder_model = Model([decoder_inputs, decoder_state_input_h], [decoder_outputs, state_h_dec])

# Tokenizer loading
with open('document_tokenizer.json', 'r', encoding='utf-8') as f:
    document_tokenizer_data = f.read()
    document_tokenizer = tokenizer_from_json(document_tokenizer_data)

with open('summary_tokenizer.json', 'r', encoding='utf-8') as f:
    summary_tokenizer_data = f.read()
    summary_tokenizer = tokenizer_from_json(summary_tokenizer_data)

# Function to generate the summary for a new sequence
def decode_sequence(input_seq):
    # Encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = summary_tokenizer.word_index['<start>']

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h = decoder_model.predict([target_seq] + [states_value])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = summary_tokenizer.index_word[sampled_token_index]

        # Exit condition: either hit end of sequence or max length
        if sampled_word == '<end>' or len(decoded_sentence.split()) >= dialogue_max_len_90th_percentile:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update the target sequence to the next word
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = h

    return decoded_sentence.strip()

In [None]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Load the tokenizer for the document
with open('document_tokenizer.json', 'r', encoding='utf-8') as f:
    data = f.read()
    document_tokenizer = tokenizer_from_json(data)

# Define a function to preprocess raw text
def preprocess_text(raw_text, tokenizer):
    raw_text = raw_text.replace('\r\n', "<turn>")
    raw_text = re.sub(r'[^\w\s<turn>]', '', raw_text)
    raw_text = re.sub('\s\s+', " ", raw_text)
    raw_text = raw_text.lower()
    sequences = tokenizer.texts_to_sequences([raw_text])
    max_len = dialogue_max_len_90th_percentile
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    return padded_sequences

summaries = []
for raw_text in test_df['dialogue']:
    preprocessed_sequence = preprocess_text(raw_text, document_tokenizer)
    summary = decode_sequence(preprocessed_sequence)
    summaries.append(summary)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
from rouge import Rouge

references = test_df['summary'].tolist()

rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(summaries, references, avg=True)

print(scores)


{'rouge-1': {'r': 0.11745465740352609, 'p': 0.1746083925571109, 'f': 0.13353173780945837}, 'rouge-2': {'r': 0.008651541267067821, 'p': 0.015520282186948859, 'f': 0.010533618949902743}, 'rouge-l': {'r': 0.09860740605166544, 'p': 0.14705759577554464, 'f': 0.11210389546281387}}


The ROUGE scores provided for the RNN model's performance on the SAMSum dataset indicate that the model has a low overlap between the generated summaries and the reference summaries. All these scores are sigificantly lower and this is expected as vanilla RNN model is not powerful enough for abstractive summarization. Lets approach with more powerful models like LSTM and Transformers.

| Metric  | Recall (r)      | Precision (p)  | F1-Score (f)   |
|---------|-----------------|----------------|----------------|
| ROUGE-1 | 0.1174546574035 | 0.174608392557 | 0.133531737809 |
| ROUGE-2 | 0.0086515412671 | 0.015520282187 | 0.01053361895  |
| ROUGE-L | 0.0986074060517 | 0.147057595776 | 0.112103895463 |


## LSTM Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

embedding_dim = 100
lstm_units = 128
batch_size = 4
num_epochs = 15

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(encoder_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(decoder_vocab_size, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(decoder_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

# Define early stopping to halt training when validation loss is not decreasing
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


history = model.fit(
    [inputs, targets[:, :-1]],  # input sequences and target sequences offset by one timestep
    tf.reshape(targets, (targets.shape[0], targets.shape[1], 1))[:, 1:],  # reshaped target sequences
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.2,
    callbacks=[early_stopping]
)

model.save('/content/drive/MyDrive/Models/my_seq2seq_model.h5')


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 100)            3069800   ['input_2[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 100)            1488400   ['input_3[0][0]']             
                                                                                              

  saving_api.save_model(


### Inference

In [None]:
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import numpy as np

# Load the model
model = load_model('/content/drive/MyDrive/Models/my_seq2seq_model.h5')

# Encoder inference model
encoder_model = Model(inputs=model.input[0], outputs=model.layers[4].output[1:])

# Decoder inference model
decoder_state_input_h = Input(shape=(lstm_units,), name='decoder_state_input_h')
decoder_state_input_c = Input(shape=(lstm_units,), name='decoder_state_input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Decoder Embedding layer
decoder_inputs = model.input[1]  # Input for decoder
dec_emb_layer = model.layers[3]
dec_emb2 = dec_emb_layer(decoder_inputs)

# Decoder LSTM layer
decoder_lstm = model.layers[5]
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)

# Decoder Dense layer
decoder_dense = model.layers[6]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + [state_h2, state_c2])

# Tokenizer loading
with open('document_tokenizer.json', 'r', encoding='utf-8') as f:
    document_tokenizer_data = f.read()
    document_tokenizer = tokenizer_from_json(document_tokenizer_data)

with open('summary_tokenizer.json', 'r', encoding='utf-8') as f:
    summary_tokenizer_data = f.read()
    summary_tokenizer = tokenizer_from_json(summary_tokenizer_data)

# Reverse-lookup token index for decoding
reverse_target_char_index = dict((i, char) for char, i in summary_tokenizer.word_index.items())

def decode_sequence(input_seq, encoder_model, decoder_model, summary_tokenizer):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 with only the start character.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = summary_tokenizer.word_index['<start>']

    # Create the translation
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token and add the corresponding character to the decoded sentence
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        if sampled_char == '<end>':
            break
        decoded_sentence += ' ' + sampled_char

        # Exit condition: either hit max length or find the stop token.
        if (sampled_char == '<end>' or len(decoded_sentence) > 38):
            stop_condition = True

        # Update the target sequence to be the token just predicted.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update the states with the current LSTM output state for the next loop iteration.
        states_value = [h, c]

    # Remove the 'start' and 'end' tokens if present
    decoded_sentence = decoded_sentence.replace('start', '').replace('end', '').strip()

    return decoded_sentence


In [None]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json

with open('document_tokenizer.json', 'r', encoding='utf-8') as f:
    data = f.read()
    document_tokenizer = tokenizer_from_json(data)

def preprocess_text(raw_text, tokenizer):

    raw_text = raw_text.replace('\r\n', "<turn>")
    raw_text = remove_punctuations_and_special_characters(raw_text)
    raw_text = re.sub('\s\s+', " ", raw_text)
    raw_text = raw_text.lower()

    sequences = tokenizer.texts_to_sequences([raw_text])

    max_len = 205
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

    return padded_sequences

summaries = []  # List to store the summaries
for raw_text in test_df['dialogue']:
    preprocessed_sequence = preprocess_text(raw_text, document_tokenizer)
    summary = decode_sequence(preprocessed_sequence, encoder_model, decoder_model, summary_tokenizer)
    summaries.append(summary)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
from rouge import Rouge

references = test_df['summary'].tolist()

rouge = Rouge()

# Calculate ROUGE scores
scores = rouge.get_scores(summaries, references, avg=True)

print(scores)


{'rouge-1': {'r': 0.08911093462029329, 'p': 0.20088619493381374, 'f': 0.1179760015298777}, 'rouge-2': {'r': 0.00964898166943222, 'p': 0.028708064422350108, 'f': 0.013905789861118524}, 'rouge-l': {'r': 0.08479332412145564, 'p': 0.19017578541388042, 'f': 0.11200303145294424}}


The model's low ROUGE scores indicate that a standard LSTM architecture may not capture the complexity of the summarization task as effectively as advanced models like Transformers or BART. Despite attempts to increase model complexity, which led to overfitting, the model still tends to produce repetitive and generic responses. This behavior suggests that the model lacks the nuanced understanding of context and language necessary for high-quality summarization, a challenge better addressed by models with attention mechanisms and more sophisticated architectures designed for such tasks.

| Metric  | Recall (r)     | Precision (p)  | F1-Score (f)   |
|---------|----------------|----------------|----------------|
| ROUGE-1 | 0.089110934620 | 0.200886194934 | 0.117976001530 |
| ROUGE-2 | 0.009648981669 | 0.028708064422 | 0.013905789861 |
| ROUGE-L | 0.084793324121 | 0.190175785414 | 0.112003031453 |


## Transformer Model

In [None]:
def get_angles(position, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return position * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model
    )

    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask


def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    output = tf.matmul(attention_weights, v)
    return output, attention_weights


In [None]:
# Multiheaded Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)

        return output, attention_weights

In [None]:
# Feed Forward
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

### Encoder Model

In [None]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x


class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2


### Decoder Model

In [None]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        return x, attention_weights


class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2


In [None]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)

        return final_output, attention_weights


In [None]:
num_layers = 6 #4
d_model = 256 #128
dff = 512
num_heads = 8
EPOCHS = 25

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
      step = tf.cast(step, tf.float32)  # Cast step to float
      arg1 = tf.math.rsqrt(step)
      arg2 = step * (self.warmup_steps ** -1.5)

      return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)


In [None]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [None]:
transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    encoder_vocab_size,
    decoder_vocab_size,
    pe_input=encoder_vocab_size,
    pe_target=decoder_vocab_size,
)

In [None]:
def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
checkpoint_path = '/content/drive/MyDrive/Models/transformer_checkpoints'

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

In [None]:
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp, tar_inp,
            True,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask
        )
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)

In [None]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 429 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.4326
Epoch 1 Batch 429 Loss 4.4695
Epoch 1 Loss 4.4780
Time taken for 1 epoch: 95.59837985038757 secs

Epoch 2 Batch 0 Loss 4.2350
Epoch 2 Batch 429 Loss 4.3521
Epoch 2 Loss 4.3553
Time taken for 1 epoch: 75.5605993270874 secs

Epoch 3 Batch 0 Loss 3.9496
Epoch 3 Batch 429 Loss 4.2479
Epoch 3 Loss 4.2552
Time taken for 1 epoch: 76.45106172561646 secs

Epoch 4 Batch 0 Loss 4.2635
Epoch 4 Batch 429 Loss 4.1641
Epoch 4 Loss 4.1750
Time taken for 1 epoch: 75.2530870437622 secs

Epoch 5 Batch 0 Loss 4.0218
Epoch 5 Batch 429 Loss 4.0562
Saving checkpoint for epoch 5 at /content/drive/MyDrive/Models/transformer_checkpoints/ckpt-1
Epoch 5 Loss 4.0626
Time taken for 1 epoch: 77.6080162525177 secs

Epoch 6 Batch 0 Loss 3.9197
Epoch 6 Batch 429 Loss 3.9246
Epoch 6 Loss 3.9292
Time taken for 1 epoch: 74.70339512825012 secs

Epoch 7 Batch 0 Loss 3.7853
Epoch 7 Batch 429 Loss 3.7961
Epoch 7 Loss 3.8039
Time taken for 1 epoch: 76.68323469161987 secs

Epoch 8 Batch 0 Loss 3.5326

### Inference

In [31]:
import tensorflow as tf
import numpy as np

In [32]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)

        return final_output, attention_weights

class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x


class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model
    )

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

def get_angles(position, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return position * angle_rates

# Multiheaded Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)

        return output, attention_weights

# Feed Forward
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        return x, attention_weights


class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
      step = tf.cast(step, tf.float32)  # Cast step to float
      arg1 = tf.math.rsqrt(step)
      arg2 = step * (self.warmup_steps ** -1.5)

      return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)



In [33]:
encoder_vocab_size = 30698
decoder_vocab_size = 14884
num_layers = 4
d_model = 256
dff = 512
num_heads = 8
EPOCHS = 1

In [34]:
transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    encoder_vocab_size,
    decoder_vocab_size,
    pe_input=encoder_vocab_size,
    pe_target=decoder_vocab_size,
)

In [35]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [36]:
checkpoint_path = '/content/drive/MyDrive/Models/transformer_checkpoints'

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [37]:
def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

In [38]:
dialogue_max_len_90th_percentile = 205
summary_max_len_90th_percentile = 38

In [39]:
def evaluate(input_document):
    input_document = document_tokenizer.texts_to_sequences([input_document])
    input_document = tf.keras.preprocessing.sequence.pad_sequences(input_document, maxlen=dialogue_max_len_90th_percentile, padding='post', truncating='post')

    encoder_input = tf.expand_dims(input_document[0], 0)

    decoder_input = [summary_tokenizer.word_index["<start>"]]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(summary_max_len_90th_percentile):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask
        )

        predictions = predictions[: ,-1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if predicted_id == summary_tokenizer.word_index["<end>"]:
            return tf.squeeze(output, axis=0), attention_weights

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    output = tf.matmul(attention_weights, v)
    return output, attention_weights


In [41]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Load the tokenizers
with open('document_tokenizer.json', 'r', encoding='utf-8') as f:
    document_tokenizer = tokenizer_from_json(f.read())

with open('summary_tokenizer.json', 'r', encoding='utf-8') as f:
    summary_tokenizer = tokenizer_from_json(f.read())

# Assuming dialogue_max_len_90th_percentile and summary_max_len_90th_percentile are defined
# Assuming the transformer model and create_masks function are defined

# Preprocess and remove special characters
def preprocess_text(raw_text):
    raw_text = raw_text.replace('\r\n', "<turn>")
    raw_text = remove_punctuations_and_special_characters(raw_text)
    raw_text = re.sub('\s\s+', " ", raw_text)
    raw_text = raw_text.lower()
    return raw_text

# Function to generate summaries from processed text
def summarize(input_document):
    processed_document = preprocess_text(input_document)
    summary_indices, attention_weights = evaluate(processed_document)
    summary = summary_tokenizer.sequences_to_texts([summary_indices.numpy()])[0]  # Convert to text
    # Clean up the summary text if necessary
    summary = summary.replace("<start>", "").replace("<end>", "").strip()
    return summary


# List to store the summaries
summaries = []
i = 1
for dialogue in test_df['dialogue'][0:819]:
    summary = summarize(dialogue)
    print(i, summary)
    i+=1
    summaries.append(summary)


1 app person2 app person2 and person2 others others others others others others person2 others others others others others others others others others others others others others others others others others others others others others others others others others
2 her new new new time person2 bought a new new time last time person2 bought one 5 time one last time last time person2 her one one for her one for her one for her one person2 her
3 
4 person2 will surprise meeting person2 person2 person2 will bring to person2
5 person2 last new week to last new living last week person2 to visit person2 and person1 to living last week for two week person2 and person2 to last week for friday for person2 and person2 to visit person2
6 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 person2 p

In [48]:
# Filter out the empty summaries and corresponding references
filtered_summaries = [x for x in summaries if x]
filtered_references = [references[i] for i, x in enumerate(summaries) if x]

if filtered_summaries:
    scores = rouge.get_scores(filtered_summaries, filtered_references, avg=True)
    print(scores)
else:
    print("No valid summaries to calculate ROUGE scores.")


{'rouge-1': {'r': 0.023874666821249568, 'p': 0.08058545570488895, 'f': 0.03287304921074305}, 'rouge-2': {'r': 0.0005509991216512956, 'p': 0.0006996336996336996, 'f': 0.0005888052196893191}, 'rouge-l': {'r': 0.023118883941484782, 'p': 0.07910737822681149, 'f': 0.03189706449851201}}


The Transformer model also performed poorly when compared with the vanilla RNN and LSTM Models. If we could have given better hyperparameters or better embeddings like glove or universal sentence encoder similar to that this might have worked well. Lets try BART model which is a state-of-the-art model for various natural language processing tasks, such as summarization, machine translation, and sentiment analysis.

## BART Model

In this approach we will use Pytorch for Model building and carry out different tokenization and preprocessing strategies

In [65]:
!pip install evaluate
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=d27c1286dc69c59876099c350c9b7343698c9e99b5f53f3bf04d2ae5d21bfb7e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [21]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import pipeline
from transformers import DataCollatorForSeq2Seq
import torch
import evaluate
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
dataset = load_dataset("samsum")
train = dataset["train"].to_pandas()
val = dataset["validation"].to_pandas()
test = dataset["test"].to_pandas()

In [5]:
def clean_tags(text):
    clean = re.compile('<.*?>') # Compiling tags
    clean = re.sub(clean, '', text) # Replacing tags text by an empty string

    # Removing empty dialogues
    clean = '\n'.join([line for line in clean.split('\n') if not re.match('.*:\s*$', line)])

    return clean

In [6]:
# Defining function to clean every text in the dataset.
def clean_df(df, cols):
    for col in cols:
        df[col] = df[col].fillna('').apply(clean_tags)
    return df

In [7]:
# Cleaning texts in all datasets
train = clean_df(train,['dialogue', 'summary'])
test = clean_df(test,['dialogue', 'summary'])
val = clean_df(val,['dialogue', 'summary'])

In [8]:
from datasets import Dataset, load_metric

train_ds = Dataset.from_pandas(train)
test_ds = Dataset.from_pandas(test)
val_ds = Dataset.from_pandas(val)

Using the facebook model bartlarge

In [9]:
summarizer = pipeline('summarization', model = 'facebook/bart-large-xsum')

In [10]:
checkpoint = 'facebook/bart-large-xsum'
tokenizer = BartTokenizer.from_pretrained(checkpoint)

In [11]:
model = BartForConditionalGeneration.from_pretrained(checkpoint)

In [12]:
def preprocess_function(examples):
    inputs = [doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
# Applying preprocess_function to the datasets
tokenized_train = train_ds.map(preprocess_function, batched=True,
                               remove_columns=['id', 'dialogue', 'summary'])
tokenized_test = test_ds.map(preprocess_function, batched=True,
                               remove_columns=['id', 'dialogue', 'summary'])

tokenized_val = val_ds.map(preprocess_function, batched=True,
                               remove_columns=['id', 'dialogue', 'summary'])

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [15]:
metric = load_metric('rouge')

  metric = load_metric('rouge')


In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred# Obtaining predictions and true labels

    # Decoding predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    # Computing rouge score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()} # Extracting some results

    # Add mean-generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir = '/content/drive/MyDrive/Models/bart_samsum',
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    seed = 42,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

In [18]:
# Defining Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.3864,1.508298,52.1909,27.5246,43.004,48.4568,28.7717
2,1.0075,1.486831,52.3109,27.4906,43.163,48.1073,30.0208
3,0.7445,1.560792,52.8233,27.7695,43.5337,48.6055,29.7741


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=11049, training_loss=1.0765350691192623, metrics={'train_runtime': 7138.3866, 'train_samples_per_second': 6.191, 'train_steps_per_second': 1.548, 'total_flos': 2.017977753840845e+16, 'train_loss': 1.0765350691192623, 'epoch': 3.0})

In [20]:
directory = "/content/drive/MyDrive/Models/bart_finetuned_samsum"
trainer.save_model(directory)

tokenizer.save_pretrained(directory)

('/content/drive/MyDrive/Models/bart_finetuned_samsum/tokenizer_config.json',
 '/content/drive/MyDrive/Models/bart_finetuned_samsum/special_tokens_map.json',
 '/content/drive/MyDrive/Models/bart_finetuned_samsum/vocab.json',
 '/content/drive/MyDrive/Models/bart_finetuned_samsum/merges.txt',
 '/content/drive/MyDrive/Models/bart_finetuned_samsum/added_tokens.json')

### Loading the trained model which wa deployed to hugging face

In [7]:
from transformers import pipeline

In [8]:
summarizer = pipeline('summarization', model = 'dhanushkumar97/bart_dk')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Evaluating on 100 data points due to longer running times.

In [12]:
summaries = []
for text in test['dialogue'][0:100]:
  print("Text:", text)
  print("Summary:", summarizer(text)[0]['summary_text'])
  summaries.append(summarizer(text)[0]['summary_text'])
  print()

Text: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
Summary: Amanda can't find Betty's number. Larry called Betty the last time they were at the park. Hannah doesn't know him well, but Amanda advises her to text him.

Text: Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MAC

Your max_length is set to 62, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)



Text: Mary: Are you going by car or train?
Tom: Ella rented a car
Ella: this makes all of this much faster
Mary: good decision


Your max_length is set to 62, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


Summary: Ella rented a car. She will go by car.

Text: Luke: are you still looking for someone to join netflix family?
Paul: yes, 1 person :)
Luke: i am the one!
Paul: sure, i will send you the login and password on sunday
Luke: ok we can talk tomorrow
Paul: i don't really remember it now
Luke: send me also the bank account details so I can wire you the money every month. Are you paying for this or someone else?
Paul: I do, and I keep track of everyone accessing so you should not expect any bans :D
Luke: easy mate :D you still on holidays with your girl?
Paul: last dinner :( tomorrow we are out
Luke: how long have you been there?
Paul: less than 8 days :/
Summary: Paul is looking for someone to join Netflix family. Luke is the one. Paul will send him the login and password on Sunday. Luke will wire Paul money every month. Paul is on holidays.

Text: Greg: Hi, honey. I need to stay after hours :-(
Betsy: Again?
Greg: I'm sorry!
Betsy: What about Johnny?
Greg: Well, could you pick him up

Your max_length is set to 62, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)



Text: Myah: <file_photo>
Selah: I can't see the phone number very well. Rewrite it plz
Myah: <file_photo>
Selah: The phone of that person is off


Your max_length is set to 62, but your input_length is only 52. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)


Summary: Selah can't see the phone number of the person, because the phone is off.

Text: Eric: Hey Bella, What happened today in boss's room?? Was he angry??
Bella: NO NO!!! He wasn't angry at all.. He actually appreciated on our brave deccision to dismiss the request of client..
Eric: REALLY!! He appreciated this decision..
Bella: Yeah he really did.. I too was astounded by his reaction...
Eric: What could possibly lead to this?? I mean , they were potential clients...
Bella: What he told me was that he was looking forward to bring in new clients which were our current client's competitor..
Eric: Oh that could possibly be the reason.Well anyways you got appreciation xD congo	
Bella: hahaha Blessing in disguise xD
Summary: Bella and Eric's boss was not angry today. He appreciated their decision to dismiss the request of a client. He was looking forward to bring in new clients.

Text: Ben: Where are you?
Emma: at the rare of the bus
Ben: why?
Emma: there are some free seats here
Emma: 

Your max_length is set to 62, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)



Text: Mary: hey, im kinda broke, lend me a few box
Carter: okay, give me an hour, im at the train station
Mary: cool, thanks


Your max_length is set to 62, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)


Summary: Mary is broke. Carter will lend her a few box in an hour.

Text: Charlotte: Hello Paula, a funny question: how do you pronounce 'Natal lily', the name of the plant? It refers to the region of ZA and not to the word 'natal' as in 'his natal day', right?
Paula: Hi Charlotte, 'nu tell', 'nu' as in 'number'.
Charlotte: And the stress on the second syllable? Or the first?
Paula: 2nd
Charlotte: Thank you dear.
Paula: <file_other>
Charlotte: Lovely to hear your voice!!
Paula: :$
Paula: <file_other>
Charlotte: :X
Summary: Charlotte wants to know how to pronounce the name of the plant called "Natal Lily". Paula explains it to Charlotte.

Text: Jack: Cocktails later?
May: YES!!!
May: You read my mind...
Jack: Possibly a little tightly strung today?
May: Sigh... without question.
Jack: Thought so.
May: A little drink will help!
Jack: Maybe two!
Summary: Jack and May are going to have a drink later.


Your max_length is set to 62, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)



Text: Margaret: Honey, buy me some painkiller.
Jack: What is going on?
Margaret: Terrible headache!
Jack: Maybe you should rest!


Your max_length is set to 62, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)


Summary: Margaret has a terrible headache. Jack will buy her a painkiller.

Text: Andrei: hey, did you pick up the film equipment for tonite's shooting?
Serge: no, im on my way there now.
Andrei: cool. do you happen to have your credit card with you? we have an outstanding bill to pay with the company.
Serge: yeah, i do. not a lot of available credit on it, but we'll see when we get there.
Andrei: OK, thanks. theyll be glad when we pay it. its long overdue.
Serge: ill let you know if it works out. getting of the metro now
Andrei: ok
Summary: Serge is on his way to pick up the film equipment for tonight's shooting. He will let Andrei know if it works out. They have an outstanding bill to pay with the company.

Text: Janice: my son has been asking me to get him a hamster for his birthday
Janice: should i?
Martina: NO! NO! NO! NO! NO!
Martina: i got one for my son and it stank up the whole house
Martina: so don't do it!!!
Summary: Janice's son wants her to get him a hamster for his birthd

Your max_length is set to 62, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)



Text: Mike: Do u have new John's number?
Ann: No, u should ask Mary.
Mike: Ok, thank u :*


Your max_length is set to 62, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)


Summary: Ann doesn't have new John's number for Mike. He should ask Mary.

Text: Joseph: It's fuzzy but I think you can recognize what's that(^_-)-☆
Joseph: <file_photo>
Ella: Ooooo
Ella: Baby cows??(/◕ヮ◕)/(/◕ヮ◕)/(/◕ヮ◕)/
Joseph: Wujek Janek has tween cows:D
Ella: Twins* darling xD
Joseph: Oh yeah, sorry Twins*
Ella: Good for him!! So cool❤️❤️
Ella: Wanna touch them❤️❤️❤️
Summary: Joseph has sent a photo of newborn cows to Ella.

Text: Josh: Stephen, I think you've accidentaly taken my notebook home
Stephen: wait lemme check
Stephen: nope, I don't see it anywhere
Jack: oh shit, I've got it xDDD I don't even know why
Josh: xDDD ok, no problem, cool I know where it is
Jack: I'll bring it tomorow
Summary: Jack has Josh's notebook. Jack will bring it tomorrow.

Text: Lola: hey girlfriend, what's up?
Adele: Oh, hi Lols, not much.
Adele: got a new dog.
Lola: another one?
Adele: Yup. a pup biscuit lab. 4 months. Chewy.
Lola: how did the others react?
Adele: the cats keep their distance, Poppy 

Your max_length is set to 62, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)



Text: Cathy: Just realized I left my sunglasses at yours
Broke: Yes, they are waiting for you to pick them up
Cathy: Might come round at 10 tonight if that's alright
Broke: Yeah okay, see ya


Your max_length is set to 62, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)


Summary: Cathy left her sunglasses at Broke's place. She will come over at 10 tonight to pick them up.

Text: Petra: I need to sleep, I can't stand how sleepy I am
Andy: I know, and it's so boring today, nobody's working at the office
Ezgi: I am working! lazy pigs
Petra: I'm sleeping with my eyes open, kill me
Andy: ask the fat woman from HR
Petra: she would kill me on spot without batting an eye
Andy: she always repeats she has a black belt in karate 
Petra: it's hard to believe she can move, but let her have whatever belt she wants
Andy: LOL
Petra: sooooo sleepy
Summary: Petra needs to sleep because she's sleepy. Nobody is working at the office today.

Text: Nick: You look absolutely gorgeous and have a lovely smile. 
Nick: Would love to get to know you a bit more. How about we meet up for a drink sometime?
Jane: Hmmm... You're shooting a bit above your range aren't you?
Nick: Why would you think that hon?
Jane: Because I'm not that desperate.
Nick: That was a bit below the belt.
Nic

Your max_length is set to 62, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)



Text: Salma: <file_video>
Salma: the latest cat meme
Hugh: oh sweet, I can never get enough of those lol
Hugh: <file_photo>
Salma: hahaha same


Your max_length is set to 62, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


Summary: Hugh and Salma love the latest cat meme.

Text: Matt: results should be announced soon
Matt: probably today 
Oliver: they posted it
Oliver: <file_other>
Peter: I didn't get into Stanford :(
Matt: let me see
Matt: yup, I did
Oliver: me too
Oliver: barely
Peter: I'm happy for you guys
Matt: chin up! there are many other options
Oliver: exactly, don't give up
Peter: thanks guys, that means a lot
Peter: send your documents asap
Peter: otherwise you'll stuck in the queue
Matt: thanks for a heads-up
Oliver: yea, we owe you one
Peter: I have to look for other university
Peter: see ya
Matt: bye
Summary: Matt and Oliver got into Stanford. Peter didn't get into Stanford, so he has to look for another university.


Your max_length is set to 62, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)



Text: Jake: we don't have english today
Nadia: whooooah!
Vanessa: cool! you sure though?
Jake: Smith called in sick, they couldn't find a replacement, oh yeah


Your max_length is set to 62, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)


Summary: Smith called in sick, so they don't have English today.

Text: Brandon: Shit, I've lost my credit card!
Brandon: I blocked it in the bank but it will take time before I get a new one
Brandon: can you lend me $ 100 guys?
Luke: sorry man, Im broke:/
Brandon: I see
Ian: what a misfortune, dude
Ian: I can lend you $$, no problem
Brandon: thanks, dude!
Summary: Brandon lost his credit card. He blocked it in the bank but it will take time before he gets a new one. Ian will lend him $100.

Text: Inez: My dears, our evening inspired me to create this group conversation to plan further Food Evenings :)
Inez: This is my proposal for the next one: <file_photo>
Alicja: Wow, I will actually feel happy going to work thanks to this :D
Gosia: Happy going to work and even happier leaving it haha
Alicja: Just too bad that the time between 9 and 17 will be wasted :P
Patrycja: I really liked our evening, even the pizza was delicious :) How did you girls like it?
Inez: I loved it, Gosia really cho

Your max_length is set to 62, but your input_length is only 40. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)



Text: Joyce: Check this out!
Joyce: <link>
Michael: That's cheap!
Edson: No way! I'm booking my ticket now!! 


Your max_length is set to 62, but your input_length is only 40. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)


Summary: Edson is booking his ticket now.

Text: Jane: google maps says it is at least 3h <file_other>
Steven: I used to make it in 2, trust me :D
Jane: but it's almost 300km..
Steven: the road is new , we will make it ^^
Jane: I don't want  to stress out , let's meet at 4:30 instead of 5, ok?
Steven: ok, if u reaaly want, we can meet at 4:30
Jane: thx! 
Jane: I will wait at the main entrance or where?
Steven: main entrance is good for me;-) 
Steven: cu
Summary: Jane and Steven will meet at 4:30 instead of 5:30. They will wait at the main entrance.

Text: Morgan: Hey gorgeous, how’s your day?
Suzanne: Nothing special, it’s just one of many boring days at work. But… better now though!
Morgan: Are you working at all? 😉
Suzanne: I’m trying 😉 But you aren’t helping me, at all
Suzanne: I’m just taking a well-deserved break 😉
Morgan:  I miss you Suzie
Suzanne: I miss you too Morgan
Morgan: Do you feel like going to a concert next week? Maroon 5 is playing at the Hulu Theater at Madison Squar

Your max_length is set to 62, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)



Text: Steve: BTW, USA won last night!
Gulab: I forgot to check!
Steve: England playing tomorrow at 2:00!
Gulab: That's right, Croatia?
Steve: Yep.


Your max_length is set to 62, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)


Summary: USA won last night. England will play Croatia tomorrow at 2:00.

Text: John: Ela i am coming in 10 mins please give me my walle outside t i forgot it
Ela: yes just saw it when you are here call me
John: but your phone is busy thats why i messaged keep it free please i am getting late
Ela: oh yes was talking to mom ... its free now
John: ok
Summary: John wants Ela to give him his walle outside because he forgot it. Ela's phone is busy.

Text: Mary: Did you tell your sister I am doing online job?
Mark: yes !
Mary: why
Mark: because she keep saying your good for nothing?
Mary: dint I tell you I don’t care?
Mark: what happened?
Mary: see I don’t want to prove anything to anyone..
Mark: I know… but I was just feeling proud so it was kind of show off…
Mary: she is asking everyone… and trying to get to the people I am working for
Mark: really!! I am sorry for that…
Mary: don’t be! I understand your feelings…  but u know how she is…
Mark: I know!! :? 
Mary: don’t be sad now its ok.. 

Your max_length is set to 62, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)



Text: Ashleigh: Looks like we're going to the cinema!! 
Ashleigh: <file_gif>
Peter: You got the job??
Ashleigh: I got hte job! :D
Peter: <file_gif>
Ashleigh: <file_gif>


Your max_length is set to 62, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)


Summary: Ashleigh and Peter are going to the cinema.

Text: Danna: How's your Saturday?
Reed: It was alright thanks
Danna: Good
Reed: Yours ?
Danna: Boring
Reed: Why?
Danna: I'm angry I called maybe 5-8 of my friends and they aren't around or are busy.
Reed: Shame
Danna: So it's is the next boring weekend for me
Reed: That sucks
Danna: The only thing I can do is watching TV -.-
Reed: Haha lucky you
Danna: Yeah haha
Reed: I don't have tv, our subscription expired and they never renewed it. They want us to pay for it so fuck it
Danna: Yeah. What are you doing?
Reed: I'm in bed
Danna: Work tomorrow?
Reed: No. Off tomorrow
Danna: Nice
Reed: Indeed
Summary: Reed had a good Saturday. Danna's Saturday was boring. Reed doesn't have a TV subscription. Reed is in bed. Reed has no work tomorrow.

Text: Antonio: Is everything okay? You've been quiet lately
Alivia: Oh, hi, yeah, I've just been working on my thesis
Alivia: Or rather trying to work, it's not going too well
Antonio: Oh :( Problems fin

Your max_length is set to 62, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)



Text: Maddie: I'm in Asda, do you need anything?
John: could do with a white bread and some apples 
Maddie: ok. Gala?
John: yes please ta


Your max_length is set to 62, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


Summary: Maddie is in Asda. She will buy white bread and apples for John.

Text: Elliot: i can't talk rn, i'm rly busy
Elliot: can i call u back in about 2 hours?
Jordan: Not really, I'm going to a funeral.
Jordan: I'll call you tonight, ok?
Elliot: sure
Elliot: whose funeral is it?
Jordan: My colleague's, Brad.
Jordan: I told you about him, he had a liver cancer.
Elliot: i'm so sorry man, i hope u're ok
Elliot: i'll call u at 8 pm
Summary: Jordan is going to a funeral of his colleague, Brad, who had a liver cancer. Elliot will call Jordan tonight at 8 pm.

Text: Flo: OMG, I can't get into the salon until the 6th!
Gina: What? Why?
Flo: They're just too busy. I'm going to be gray! LOL!
Gina: Get you a touch-up kit at Tesco!
Flo: Gonna have to!
Summary: Flo can't get into the salon until the 6th as she's going to be gray. Gina will get her a touch-up kit at Tesco.

Text: Rob: hey, pick up your phone :)
Ann: can't - meeting :)
Rob: sorry...
Ann: no problem - super boring one :) 
Ann: what

Your max_length is set to 62, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)



Text: Marvin: When's the last time you got laid ?
Melany: I don't even remember..
Marvin: Hmm so there must be lots of cobwebs between your legs now huh hahaha


Your max_length is set to 62, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


Summary: Melany doesn't remember the last time she got laid.

Text: Eric: <file_video>, check it out :D
Samantha: HAHA, what is our favorite professor?
Eric: Talking about this recent scandal on the news :P
Noah: "I am the smartest person alive, I knew this will happen" :D
Samantha: Hahaha, now I don't even need to open the video
Summary: Eric sends Samantha and Noah a video of a professor talking about the recent scandal on the news.

Text: Jacky: I think you were right yesterday. 
David: What about? I'm right about most things :P
Jacky: Yeah, whole you ;)
Jacky: About taking the blame etc. 
David: Okey, I remeber. We'll talk later?
Jacky: With pleasure. I'll call you when I get home.
Summary: Jacky and David will talk later. Jacky will call David when she gets home.



In [14]:
from rouge import Rouge
references = test['summary'][0:100].tolist()

rouge = Rouge()

scores = rouge.get_scores(summaries, references, avg=True)

print(scores)


{'rouge-1': {'r': 0.5128223738277121, 'p': 0.529748691093847, 'f': 0.5010465500349265}, 'rouge-2': {'r': 0.23906727526514193, 'p': 0.2579704599496894, 'f': 0.23766792620324956}, 'rouge-l': {'r': 0.48380389164538606, 'p': 0.5023638123758647, 'f': 0.4739684023756211}}


BART model performed (better rouge scores) well when compared with all the 3 other models that we have tried. This model is also deployed on hugging face spaces. To access this model please click on the following link

https://huggingface.co/spaces/nishanthp/text-summarization-with-bart