In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]
    return data

In [3]:
train_path = "rl-sentence-compression/data/train-data/gigaword/train.jsonl"
val_path = "rl-sentence-compression/data/train-data/gigaword/val.jsonl"
test_path = "rl-sentence-compression/data/test-data/gigaword.jsonl"

In [4]:
train_data = load_data(train_path)
val_data = load_data(val_path)
test_data = load_data(test_path)

In [5]:
print(len(train_data))
print(len(val_data))
print(len(test_data))

1000000
189651
1951


In [6]:
train_data = train_data[0:10000]
val_data   = val_data[0:10000]
test_data = test_data[0:20]

In [7]:
print(len(train_data))
print(len(val_data))
print(len(test_data))

10000
10000
20


In [8]:
def remove_id(data):
    updated_data = []
    for item in data:
        updated_item = {key:value for key,value in item.items() if key != "id"}
        updated_data.append(updated_item)
    return updated_data

In [9]:
train_data = remove_id(train_data)
test_data = remove_id(test_data)
val_data = remove_id(val_data)

In [10]:
train_df = pd.DataFrame(train_data)
val_df   = pd.DataFrame(val_data)
test_df  = pd.DataFrame(test_data)

In [11]:
train_df.head()

Unnamed: 0,text,summary
0,australia 's current account deficit shrunk by...,australian current account deficit narrows sha...
1,at least two people were killed in a suspected...,at least two dead in southern philippines blast
2,australian shares closed down #.# percent mond...,australian stocks close down #.# percent
3,south korea 's nuclear envoy kim sook urged no...,envoy urges north korea to restart nuclear dis...
4,south korea on monday announced sweeping tax r...,skorea announces tax cuts to stimulate economy


In [12]:
test_df.rename(columns = {'summaries':'summary'}, inplace = True) 

In [13]:
print(type(test_df['text'][0]))
print(type(test_df['summary'][0]))
test_df['summary'] = test_df['summary'].str[0].astype(str)
test_df.head(2)

<class 'str'>
<class 'list'>


Unnamed: 0,text,summary
0,japan 's nec corp. and UNK computer corp. of t...,nec UNK in computer sales tie-up
1,the sri lankan government on wednesday announc...,sri lanka closes schools as war escalates


In [14]:
def is_integer(value):
    try:
        int(value)
        return True
    except ValueError:
        return False

In [15]:
for index, row in train_df.iterrows():
    if any(is_integer(value) for value in row):
        train_df.drop(index, inplace=True)

In [16]:
for index, row in val_df.iterrows():
    if any(is_integer(value) for value in row):
        val_df.drop(index, inplace=True)

In [17]:
for index, row in test_df.iterrows():
    if any(is_integer(value) for value in row):
        test_df.drop(index, inplace=True)

In [18]:
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [19]:
train_df = train_df.head(2000)
val_df   = val_df.head(2000)
test_df  = test_df.head(10)

In [20]:
# Preprocessing: Clean text data
def clean_text(df):
    for col in df.columns:
        df[col] = df[col].str.lower()
        df[col] = df[col].str.lstrip().str.rstrip()
        df[col] = df[col].str.replace(r'[^\w\s]+', '')
    return df

In [21]:
train_df = clean_text(train_df)
val_df = clean_text(val_df)
test_df = clean_text(test_df)

  df[col] = df[col].str.replace(r'[^\w\s]+', '')


In [22]:
from transformers import BertTokenizer, BertModel
import tensorflow as tf

In [23]:
# Maximum sentence length (word count)
max_len_word = 150

In [24]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [25]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [26]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [27]:
# Encode sentences using BERT
def encode_sentences(sentences):
    #print(sentences)
    encoding = tokenizer.batch_encode_plus(sentences,
                                           padding=True,
                                           truncation=True,
                                           return_tensors='pt',
                                           add_special_tokens=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state 
    return word_embeddings,input_ids

In [28]:
# Encode text data
text_embeddings_train_bert, train_ids= encode_sentences(train_df['text'])

In [29]:
text_embeddings_val_bert, val_ids= encode_sentences(val_df['text'])

In [30]:
text_embeddings_test_bert, test_ids= encode_sentences(test_df['text'])

In [31]:
# Encode summary data
summary_embeddings_train_bert,s_train_id= encode_sentences(train_df['summary'])

In [32]:
summary_embeddings_val_bert,s_val_id= encode_sentences(val_df['summary'])

In [33]:
summary_embeddings_test_bert,s_test_id = encode_sentences(test_df['summary'])

In [34]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Padding length
max_sequence_length = 150
max_sequence_length2 = 10

# Pad the sequences to ensure consistent length
text_embeddings_train_bert = pad_sequences(text_embeddings_train_bert, maxlen=max_sequence_length, padding='post')
text_embeddings_val_bert = pad_sequences(text_embeddings_val_bert, maxlen=max_sequence_length, padding='post')
text_embeddings_test_bert = pad_sequences(text_embeddings_test_bert, maxlen=max_sequence_length, padding='post')

summary_embeddings_train_bert = pad_sequences(summary_embeddings_train_bert, maxlen=max_sequence_length2, padding='post')
summary_embeddings_val_bert = pad_sequences(summary_embeddings_val_bert, maxlen=max_sequence_length2, padding='post')
summary_embeddings_test_bert = pad_sequences(summary_embeddings_test_bert, maxlen=max_sequence_length2, padding='post')

# Pad the sequences to ensure consistent length
s_train_id = pad_sequences(s_train_id, maxlen=max_sequence_length2, padding='post')
s_val_id = pad_sequences(s_val_id, maxlen=max_sequence_length2, padding='post')
s_test_id = pad_sequences(s_test_id, maxlen=max_sequence_length2, padding='post')

train_ids = pad_sequences(train_ids, maxlen=max_sequence_length, padding='post')
val_ids = pad_sequences(val_ids, maxlen=max_sequence_length, padding='post')
test_ids = pad_sequences(test_ids, maxlen=max_sequence_length, padding='post')


In [35]:
print(text_embeddings_train_bert.shape)
print(text_embeddings_val_bert.shape)
print(text_embeddings_test_bert.shape)

print(summary_embeddings_train_bert.shape)
print(summary_embeddings_val_bert.shape)
print(summary_embeddings_test_bert.shape)

print(train_ids.shape)
print(val_ids.shape)
print(test_ids.shape)

print(s_train_id.shape)
print(s_val_id.shape)
print(s_test_id.shape)

(2000, 150, 768)
(2000, 150, 768)
(10, 150, 768)
(2000, 10, 768)
(2000, 10, 768)
(10, 10, 768)
(2000, 150)
(2000, 150)
(10, 150)
(2000, 10)
(2000, 10)
(10, 10)


In [36]:
# Tokenize text and summary data
def tokenize_text(text, max_length):
    tokens = tokenizer(text, max_length=max_length, padding="max_length", 
                       truncation=True, return_tensors='tf')
    return tokens['attention_mask']


In [37]:
# Tokenize text data
text_train_mask = tokenize_text(train_df['text'].tolist(), max_len_word)
text_val_mask   = tokenize_text(val_df['text'].tolist(), max_len_word)
text_test_mask  = tokenize_text(test_df['text'].tolist(), max_len_word)

In [38]:
# Tokenize text data
summary_train_mask = tokenize_text(train_df['summary'].tolist(), max_len_word)
summary_val_mask = tokenize_text(val_df['summary'].tolist(), max_len_word)
summary_test_mask = tokenize_text(test_df['summary'].tolist(), max_len_word)

In [39]:
# Define input shape for text and summary inputs
max_sequence_length = 150
max_sequence_length2 = 10
input_shape = (max_sequence_length, 768)
summary_shape = (max_sequence_length2, 768)

In [40]:
# Define input layers for text and summary inputs
from tensorflow.keras.layers import Input, Dropout, Dense, Flatten
from tensorflow.keras.layers import Concatenate, Lambda, LayerNormalization
from tensorflow.keras.models import Model
text_input = Input(shape=input_shape)
summary_input = Input(shape=summary_shape)

In [41]:
print(text_input.shape)
print(summary_input.shape)

(None, 150, 768)
(None, 10, 768)


In [42]:
# Define layer groups
for _ in range(24):
    # Self-attention layer
    self_attention = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=64)(
        query=text_input, value=text_input, attention_mask=text_train_mask)
    self_attention = LayerNormalization()(self_attention + text_input)
    self_attention = Dropout(0.1)(self_attention)

    # Feed forward layer
    feed_forward = Dense(200, activation='relu')(self_attention)
    feed_forward = LayerNormalization()(feed_forward)  # Add LayerNormalization
    feed_forward = Dense(768)(feed_forward)
    feed_forward = LayerNormalization()(feed_forward)  # Add LayerNormalization
    feed_forward = Dropout(0.1)(feed_forward)

    # Update text embeddings
    text_input = feed_forward

print(merged_embeddings.shape)

In [43]:
# Final layers
for _ in range(3):
    text_input = Dense(768, activation="sigmoid")(text_input)
    text_input = LayerNormalization()(text_input)  # Add LayerNormalization
    text_input = Dropout(0.1)(text_input)

In [44]:
output = Dense(768)(text_input)

In [45]:
print(output.shape)

(None, 150, 768)


In [46]:
# Compile the model
model = Model(inputs=text_input, outputs=output)

In [47]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
#optimizer = Adam(learning_rate=0.001, epsilon=1e-8)
optimizer = Adam(learning_rate=0.0001)

In [48]:
# Compile the model with the target tensors
model.compile(optimizer=optimizer, loss="mean_squared_error", metrics=['mean_squared_error'])
model.summary()

In [49]:
# Train the model using the target tensors
model.fit(text_embeddings_train_bert, summary_embeddings_train_bert, 
          validation_data=(text_embeddings_val_bert,summary_embeddings_val_bert),
          epochs=15, batch_size=64)

Epoch 1/15


ValueError: Dimensions must be equal, but are 10 and 150 for '{{node compile_loss/mean_squared_error/sub}} = Sub[T=DT_FLOAT](compile_loss/mean_squared_error/Cast, functional_1_1/dense_51_1/Add)' with input shapes: [?,10,768], [?,150,768].

In [None]:
# Generate predictions for test data
predicted_summary = model.predict(text_embeddings_test_bert)

In [None]:
predicted_summary.shape

def decode_summary(predicted_summary, ids):
    decoded_summaries = []
    for i in predicted_summary:
        decoded_text = tokenizer.decode(i['input_ids'], skip_special_tokens=True)
        decoded_summaries.append(decoded_text)
    return decoded_summaries

# Assuming 'tokenizer' is your BERT tokenizer instance
decoded_predictions=decode_summary(predicted_summary, test_ids)

# Print the decoded summaries
for i, summary_text in enumerate(decoded_predictions):
    print(f"Sample {i+1} Summary: {summary_text} Len: {len(summary_text)}")
    print()

In [None]:
test_input = ["My name is Rhea and I am 19 years old"]
encoding = tokenizer.batch_encode_plus(test_input,
                                      padding=True,
                                      truncation=True,
                                      return_tensors='pt',
                                      add_special_tokens=True)

In [None]:
encoding

In [None]:
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

In [None]:
with torch.no_grad():
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    word_embeddings = outputs.last_hidden_state 
    print(word_embeddings.shape)
    print(len(word_embeddings))

In [None]:
word_embeddings = pad_sequences(word_embeddings, maxlen=max_sequence_length, padding='post')

In [None]:
summary = model.predict(word_embeddings)

In [None]:
print(summary)

In [None]:
summary.shape

In [None]:
dec = bert.get_output_embeddings()(torch.from_numpy(summary.reshape(4,768)).float())
print("Decoded sentence:", tok.decode(dec.softmax(0).argmax(1)))

In [None]:
print(tokenizer.convert_ids_to_tokens(summary['input_ids'],skip_special_tokens=True))

In [None]:
for i in summary:
    decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    decoded_text = tokenizer.tokenize(decoded_text)
print(decoded_text)

from rouge import Rouge

def evaluate_rouge_n(predicted_embeddings_list, reference_embeddings_list, n=1):
    rouge_scores = {'rouge-{}-f'.format(n): 0.0, 'rouge-{}-p'.format(n): 0.0, 'rouge-{}-r'.format(n): 0.0}
    rouge = Rouge()
    
    # Convert embeddings to tex
    predicted_texts = [tokenizer.decode(embedding) for embeddings in predicted_embeddings_list for embedding in embeddings]
    reference_texts = [tokenizer.decode(embedding) for embeddings in reference_embeddings_list for embedding in embeddings]
    
    # Calculate ROUGE-N scores
    valid_scores_count = 0
    for pred_text, ref_text in zip(predicted_texts, reference_texts):
        scores = rouge.get_scores(pred_text, ref_text)
        if len(scores) > 0:
            scores = scores[0]
            rouge_scores['rouge-{}-f'.format(n)] += scores.get('rouge-{}-f'.format(n), 0.0)
            rouge_scores['rouge-{}-p'.format(n)] += scores.get('rouge-{}-p'.format(n), 0.0)
            rouge_scores['rouge-{}-r'.format(n)] += scores.get('rouge-{}-r'.format(n), 0.0)
            valid_scores_count += 1
    
    # Average the scores
    if valid_scores_count > 0:
        for metric in ['f', 'p', 'r']:
            rouge_scores['rouge-{}-{}'.format(n, metric)] /= valid_scores_count
    
    return rouge_scores