In [1]:
!pip install datasets accelerate

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [11]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
import pandas as pd

# Load SAMSUM dataset
data_path = '/content/DiaglougeData.csv'  # Replace with the actual path to your dataset
data = pd.read_csv(data_path)

# Display the first few rows
print(data.head())


         id                                           dialogue  \
0  13862856  Hannah: Hey, do you have Betty's number?\nAman...   
1  13729565  Eric: MACHINE!\nRob: That's so gr8!\nEric: I k...   
2  13680171  Lenny: Babe, can you help me with something?\n...   
3  13729438  Will: hey babe, what do you want for dinner to...   
4  13828600  Ollie: Hi , are you in Warsaw\nJane: yes, just...   

                                             summary  
0  Hannah needs Betty's number but Amanda doesn't...  
1  Eric and Rob are going to watch a stand-up on ...  
2  Lenny can't decide which trousers to buy. Bob ...  
3  Emma will be home soon and she will let Will k...  
4  Jane is in Warsaw. Ollie and Jane has a party....  


In [5]:
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing
data['dialogue'] = data['dialogue'].apply(preprocess_text)
data['summary'] = data['summary'].apply(preprocess_text)

# Tokenization and Encoding
max_words = 10000
max_sequence_length = 100

tokenizer_input = Tokenizer(num_words=max_words)
tokenizer_input.fit_on_texts(data['dialogue'])
sequences_input = tokenizer_input.texts_to_sequences(data['dialogue'])
X = pad_sequences(sequences_input, maxlen=max_sequence_length)

tokenizer_target = Tokenizer(num_words=max_words)
tokenizer_target.fit_on_texts(data['summary'])
sequences_target = tokenizer_target.texts_to_sequences(data['summary'])
y = pad_sequences(sequences_target, maxlen=max_sequence_length)

# Create decoder inputs and targets
decoder_input_data = y[:, :-1]
decoder_target_data = np.expand_dims(y[:, 1:], -1)


In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate

# Define encoder
encoder_inputs = Input(shape=(None,))
embedding_layer = Embedding(max_words, 256)(encoder_inputs)
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(embedding_layer)
encoder_states = [state_h, state_c]

# Define decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(max_words, 256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(max_words, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define model
lstm_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the model
lstm_model.fit([X, decoder_input_data], decoder_target_data, epochs=10, batch_size=64, validation_split=0.2)


Epoch 1/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 116ms/step - loss: 6.9611 - val_loss: 1.9742
Epoch 2/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 86ms/step - loss: 1.9162 - val_loss: 1.8743
Epoch 3/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step - loss: 1.6751 - val_loss: 1.6091
Epoch 4/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 90ms/step - loss: 1.4789 - val_loss: 1.5496
Epoch 5/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 92ms/step - loss: 1.4061 - val_loss: 1.5292
Epoch 6/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 91ms/step - loss: 1.3881 - val_loss: 1.5206
Epoch 7/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 100ms/step - loss: 1.4083 - val_loss: 1.5160
Epoch 8/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 103ms/step - loss: 1.3869 - val_loss: 1.5119
Epoch 9/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e94a02cf6a0>

In [14]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# Load BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize dialogues
def encode_bert(texts):
    return bert_tokenizer(texts, padding=True, truncation=True, return_tensors='tf', max_length=max_sequence_length)

encoded_texts = encode_bert(data['dialogue'].tolist())

# Define BERT-based encoder
bert_encoder_inputs = Input(shape=(None,), dtype=tf.int32)

# Use a Lambda layer to convert KerasTensor to tf.Tensor
# and specify the output shape
bert_encoder_outputs = tf.keras.layers.Lambda(lambda x: bert_model(x)[0], output_shape=(max_sequence_length, 768))(bert_encoder_inputs)

encoder_outputs = tf.keras.layers.GlobalAveragePooling1D()(bert_encoder_outputs)
encoder_dense = Dense(256, activation='relu')(encoder_outputs)

# Define decoder
# The decoder input shape needs to be adjusted to match the expected output from the encoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(max_words, 256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True)(decoder_embedding)
decoder_dense = Dense(max_words, activation='softmax')(decoder_lstm)

# Define model
bert_model = Model([bert_encoder_inputs, decoder_inputs], decoder_dense)
bert_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the model
bert_model.fit([encoded_texts['input_ids'], decoder_input_data], decoder_target_data, epochs=10, batch_size=64, validation_split=0.2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 80ms/step - loss: 6.9010 - val_loss: 1.9897
Epoch 2/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 77ms/step - loss: 1.8914 - val_loss: 1.9287
Epoch 3/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 71ms/step - loss: 1.7541 - val_loss: 1.6336
Epoch 4/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 69ms/step - loss: 1.4824 - val_loss: 1.5658
Epoch 5/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 70ms/step - loss: 1.4255 - val_loss: 1.5397
Epoch 6/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 70ms/step - loss: 1.4049 - val_loss: 1.5300
Epoch 7/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 70ms/step - loss: 1.4321 - val_loss: 1.5251
Epoch 8/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step - loss: 1.4099 - val_loss: 1.5220
Epoch 9/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e94a03df8e0>

In [17]:
from rouge import Rouge

# Define a function to decode the model output into text
def decode_sequence(sequence, tokenizer):
    reverse_word_index = dict((i, word) for word, i in tokenizer.word_index.items())
    return ' '.join([reverse_word_index.get(i, '') for i in sequence])

def predict_summary(model, input_text, tokenizer, decoder_input_data): # Add decoder_input_data as an argument
    if isinstance(model, tf.keras.models.Model): # Check if the model is a BERT model
        encoded_input = encode_bert([input_text])['input_ids']
        # Ensure decoder_input_data has the same number of samples as encoded_input
        prediction = model.predict([encoded_input, decoder_input_data[:1]]) # Slice decoder_input_data to have one sample
    else:
        encoded_input = tokenizer.texts_to_sequences([input_text])
        encoded_input = tf.keras.preprocessing.sequence.pad_sequences(encoded_input, maxlen=max_summary_length, padding='post')
        # Ensure decoder_input_data has the same number of samples as encoded_input
        prediction = model.predict([encoded_input, decoder_input_data[:1]]) # Slice decoder_input_data to have one sample
    predicted_sequence = np.argmax(prediction, axis=-1)[0]
    return decode_sequence(predicted_sequence, tokenizer_target)

# Sample dialogue
dialogue = """Bill: I haven’t seen you for a while. You OK?
Jane: Yeah, I am fine. Basically.
Bill: What d'you mean, basically?
Jane: I'm not too well.
Bill: Meaning?
Jane: I've got this terrible cold.
Bill: You seen a doctor?
Jane: No. I don't feel like going out at all.
Bill: You've got a fever?
Jane: I don't think so.
Bill: You can measure it, you know.
Jane: No such device in my household.
Bill: Right: I'm gonna bring you a thermometer.
Bill: You need anything else?
Jane: Yeah. Would be cool if you got me some bread and aspirin.
Bill: No problem. I'll be over in an hour or so.
Jane: Thanks, Bill. Appreciate it:)!"""

# True summary
true_summary = "Jane has a very bad cold. She didn't see the doctor. She doesn't have a thermometer. Bill will get her a thermometer, some bread and aspirin. He will come over in about an hour."

# Generate summaries
lstm_summary = predict_summary(lstm_model, dialogue, tokenizer_target, decoder_input_data) # Pass decoder_input_data to the function
bert_summary = predict_summary(bert_model, dialogue, tokenizer_target, decoder_input_data) # Pass decoder_input_data to the function


print('lstm_summary:', lstm_summary)
print('bert_summary:', bert_summary)


# ROUGE-1 evaluation
rouge = Rouge()
lstm_scores = rouge.get_scores(lstm_summary, true_summary)
bert_scores = rouge.get_scores(bert_summary, true_summary)

print('LSTM ROUGE-1:', lstm_scores)
print('BERT ROUGE-1:', bert_scores)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
lstm_summary:                                                                                       and and to to the to the to to to to the to
bert_summary:                                                                                       to to to to the to the to to to to the to
LSTM ROUGE-1: [{'rouge-1': {'r': 0.06666666666666667, 'p': 0.6666666666666666, 'f': 0.12121211955922866}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.06666666666666667, 'p': 0.6666666666666666, 'f': 0.12121211955922866}}]
BERT ROUGE-1: [{'rouge-1': {'r': 0.03333333333333333, 'p': 0.5, 'f': 0.062499998828125014}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.03333333333333333, 'p': 0.5, 'f': 0.062499998828125014}}]
