In [1]:
import json
import spacy
import numpy as np
import pandas as pd

In [2]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]
    return data

In [3]:
train_path = "rl-sentence-compression/data/train-data/gigaword/train.jsonl"
val_path = "rl-sentence-compression/data/train-data/gigaword/val.jsonl"
test_path = "rl-sentence-compression/data/test-data/gigaword.jsonl"

In [4]:
train_data = load_data(train_path)
val_data = load_data(val_path)
test_data = load_data(test_path)

In [5]:
print(len(train_data))
print(len(val_data))
print(len(test_data))

1000000
189651
1951


In [6]:
train_data = train_data[0:4000]
val_data   = val_data[0:4000]
test_data = test_data[0:20]

In [7]:
print(len(train_data))
print(len(val_data))
print(len(test_data))

4000
4000
20


In [8]:
def remove_id(data):
    updated_data = []
    for item in data:
        updated_item = {key:value for key,value in item.items() if key != "id"}
        updated_data.append(updated_item)
    return updated_data

In [9]:
train_data = remove_id(train_data)
test_data = remove_id(test_data)
val_data = remove_id(val_data)

In [10]:
train_df = pd.DataFrame(train_data)
val_df   = pd.DataFrame(val_data)
test_df  = pd.DataFrame(test_data)

In [11]:
train_df.head()

Unnamed: 0,text,summary
0,australia 's current account deficit shrunk by...,australian current account deficit narrows sha...
1,at least two people were killed in a suspected...,at least two dead in southern philippines blast
2,australian shares closed down #.# percent mond...,australian stocks close down #.# percent
3,south korea 's nuclear envoy kim sook urged no...,envoy urges north korea to restart nuclear dis...
4,south korea on monday announced sweeping tax r...,skorea announces tax cuts to stimulate economy


In [12]:
test_df.rename(columns = {'summaries':'summary'}, inplace = True) 

In [13]:
print(type(test_df['text'][0]))
print(type(test_df['summary'][0]))
test_df['summary'] = test_df['summary'].str[0].astype(str)
test_df.head(2)

<class 'str'>
<class 'list'>


Unnamed: 0,text,summary
0,japan 's nec corp. and UNK computer corp. of t...,nec UNK in computer sales tie-up
1,the sri lankan government on wednesday announc...,sri lanka closes schools as war escalates


In [14]:
def is_integer(value):
    try:
        int(value)
        return True
    except ValueError:
        return False

In [15]:
for index, row in train_df.iterrows():
    if any(is_integer(value) for value in row):
        train_df.drop(index, inplace=True)

In [16]:
for index, row in val_df.iterrows():
    if any(is_integer(value) for value in row):
        val_df.drop(index, inplace=True)

In [17]:
for index, row in test_df.iterrows():
    if any(is_integer(value) for value in row):
        test_df.drop(index, inplace=True)

In [18]:
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [19]:
train_df = train_df.head(2000)
val_df   = val_df.head(2000)
test_df  = test_df.head(3)

In [20]:
for col in train_df.columns:
    train_df[col] = train_df[col].str.lower()
for column in train_df.columns:
    train_df[column] = train_df[column].str.lstrip().str.rstrip()
for column in train_df.columns:
    train_df[column] = train_df[column].str.replace(r'[^\w\s]+', '')

  train_df[column] = train_df[column].str.replace(r'[^\w\s]+', '')


In [21]:
# Maximum sentence length (word count)
max_len_word = 150

In [22]:
for col in val_df.columns:
    val_df[col] = val_df[col].str.lower()
for column in val_df.columns:
    val_df[column] = val_df[column].str.lstrip().str.rstrip()
for column in val_df.columns:
    val_df[column] = val_df[column].str.replace(r'[^\w\s]+', '')

  val_df[column] = val_df[column].str.replace(r'[^\w\s]+', '')


In [23]:
for col in test_df.columns:
    test_df[col] = test_df[col].str.lower()
for column in test_df.columns:
    test_df[column] = test_df[column].str.lstrip().str.rstrip()
for column in test_df.columns:
    test_df[column] = test_df[column].str.replace(r'[^\w\s]+', '')

  test_df[column] = test_df[column].str.replace(r'[^\w\s]+', '')


In [24]:
# Load BERT tokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [25]:
# Tokenize text and summary data
def tokenize_text(text, max_length):
    print(len(text))
    if isinstance(text, pd.Series):
        text = text.tolist()
    #print(text)
    tokens = tokenizer(text, max_length=max_length, padding="max_length", truncation=True, return_tensors='tf')
    #print(tokens)
    return tokens['input_ids'], tokens['attention_mask']


In [26]:
# Tokenize text data
text_train_input, text_train_mask = tokenize_text(train_df['text'], max_len_word)
text_val_input, text_val_mask = tokenize_text(val_df['text'], max_len_word)
text_test_input, text_test_mask = tokenize_text(test_df['text'], max_len_word)


2000
2000
3


In [27]:
print(text_train_mask.shape)
print(text_val_mask.shape)
print(text_test_mask.shape)

(2000, 150)
(2000, 150)
(3, 150)


In [28]:
import numpy as np

# Reshape mask tensors
text_train_mask = np.expand_dims(text_train_mask, axis=-1)
text_val_mask = np.expand_dims(text_val_mask, axis=-1)
text_test_mask = np.expand_dims(text_test_mask, axis=-1)


In [29]:
print(text_train_mask.shape)
print(text_val_mask.shape)
print(text_test_mask.shape)

(2000, 150, 1)
(2000, 150, 1)
(3, 150, 1)


In [30]:
text_train_input[0]

<tf.Tensor: shape=(150,), dtype=int32, numpy=
array([  101,  2660,  1055,  2783,  4070, 15074, 14021, 15532,  2243,
        2011,  1037,  2501,  4551,  6363,  1048, 15185,  4551,  2149,
       25269,  2497,  1999,  1996,  2238,  4284,  2349,  2000, 23990,
       19502,  7597,  4481,  2207,  6928,  3662,   102,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,   

# Load Sentence Transformer model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [31]:
from transformers import BertModel
import torch
model = BertModel.from_pretrained('bert-base-uncased')

In [32]:
# Encode sentences using BERT
def encode_sentences(sentences):
    encoding = tokenizer.batch_encode_plus(sentences,
                                           padding=True,
                                           truncation=True,
                                           return_tensors='pt',
                                           add_special_tokens=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state 
    return word_embeddings,input_ids

In [33]:
# Encode text data
text_embeddings_train_bert,train_ids = encode_sentences(train_df['text'])

In [34]:
text_embeddings_val_bert,val_ids = encode_sentences(val_df['text'])

In [35]:
text_embeddings_test_bert,test_ids = encode_sentences(test_df['text'])

In [36]:
print(text_embeddings_train_bert.shape)
print(len(text_embeddings_train_bert))

print(text_embeddings_val_bert.shape)
print(len(text_embeddings_val_bert))

print(text_embeddings_test_bert.shape)
print(len(text_embeddings_test_bert))

torch.Size([2000, 54, 768])
2000
torch.Size([2000, 56, 768])
2000
torch.Size([3, 33, 768])
3


In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Padding length
max_sequence_length = 150

# Pad the sequences to ensure consistent length
text_embeddings_train_bert = pad_sequences(text_embeddings_train_bert, maxlen=max_sequence_length, padding='post')
text_embeddings_val_bert = pad_sequences(text_embeddings_val_bert, maxlen=max_sequence_length, padding='post')
text_embeddings_test_bert = pad_sequences(text_embeddings_test_bert, maxlen=max_sequence_length, padding='post')

# Now check the shape of the padded sequences
print(text_embeddings_train_bert.shape)
print(text_embeddings_val_bert.shape)
print(text_embeddings_test_bert.shape)


(2000, 150, 768)
(2000, 150, 768)
(3, 150, 768)


In [38]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Dense, LayerNormalization
from tensorflow.keras.layers import Reshape, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.metrics import binary_accuracy
from sklearn.metrics.pairwise import cosine_similarity

# Modify model architecture to accept BERT-encoded inputs
input_text_bert = Input(shape=(text_embeddings_train_bert.shape[1], text_embeddings_train_bert.shape[2]))

In [39]:
# Define input shape
input_shape = (150, 768)  # Assuming BERT model output size is 768

# Define input layer
input_text = Input(shape=input_shape)

In [40]:
print(input_text.shape)

(None, 150, 768)


text_embeddings_bert = input_text_bert

In [41]:
text_embeddings = input_text

In [42]:
# Define model architecture
num_bert_layers = 12
dropout_rate = 0.1

for _ in range(num_bert_layers):
    text_embeddings_bert = Dropout(dropout_rate)(text_embeddings_bert)
    for _ in range(3):
        text_embeddings_bert = Dense(text_embeddings_bert.shape[-1], activation="relu")(text_embeddings_bert)
        text_embeddings_bert = Dropout(dropout_rate)(text_embeddings_bert)
        text_embeddings_bert = Dense(text_embeddings_bert.shape[-1])(text_embeddings_bert)
        
        # Add skip connection
        text_embeddings_bert = text_embeddings_bert + input_text_bert

In [43]:
for _ in range(num_bert_layers):
    text_embeddings = Dropout(dropout_rate)(text_embeddings)
    for _ in range(3):
        text_embeddings = Dense(text_embeddings.shape[-1], activation="relu")(text_embeddings)
        text_embeddings = Dropout(dropout_rate)(text_embeddings)
        text_embeddings = Dense(text_embeddings.shape[-1])(text_embeddings)
        
        # Add skip connection
        text_embeddings = text_embeddings + input_text

text_output_bert = Dense(text_train_mask.shape[-1], activation="sigmoid")(text_embeddings_bert)

In [44]:
# Reshape the output to match the target shape
from tensorflow.keras.layers import Reshape, Lambda, TimeDistributed
text_output = TimeDistributed(Dense(1, activation="sigmoid"))(text_embeddings)

In [45]:
from tensorflow.keras.layers import Flatten

# Remove the extra dimension from the output tensor
text_output = Flatten()(text_output)

# Final feedforward layer with sigmoid activation
text_output = Dense(150, activation="sigmoid")(text_output)


In [46]:
print(text_output.shape)

(None, 150)


model_bert = Model(inputs=input_text_bert, outputs=text_output_bert)

In [47]:
# Define model
model_bert = Model(inputs=input_text, outputs=text_output)

In [48]:
# Compile model
optimizer = Adam(learning_rate=1e-5, epsilon=1e-8)
model_bert.compile(optimizer=optimizer, loss=binary_crossentropy, 
                   metrics=[binary_accuracy])

In [49]:
# Train the model with BERT-encoded inputs
model_bert.fit(text_embeddings_train_bert, text_train_mask, 
               validation_data=(text_embeddings_val_bert, text_val_mask), 
               epochs=10, batch_size=32)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1352s[0m 20s/step - binary_accuracy: 0.5455 - loss: 0.6930 - val_binary_accuracy: 0.5805 - val_loss: 0.6788
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1890s[0m 29s/step - binary_accuracy: 0.6024 - loss: 0.6731 - val_binary_accuracy: 0.5908 - val_loss: 0.6652
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1433s[0m 21s/step - binary_accuracy: 0.6118 - loss: 0.6584 - val_binary_accuracy: 0.6340 - val_loss: 0.6455
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1429s[0m 23s/step - binary_accuracy: 0.6456 - loss: 0.6383 - val_binary_accuracy: 0.6631 - val_loss: 0.6223
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1159s[0m 18s/step - binary_accuracy: 0.6727 - loss: 0.6123 - val_binary_accuracy: 0.7073 - val_loss: 0.5837
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1147s[0m 18s/step - binary_accuracy: 0.7229 - 

<keras.src.callbacks.history.History at 0x1fababe7eb0>

In [50]:
# Generate summaries for test data using the trained model
text_predicted_summary_bert = model_bert.predict(text_embeddings_test_bert)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step


In [65]:
# Decode the generated summaries
def decode_summary_bert1(embeddings, test_ids, tokenizer):
    decoded_summaries = []
    for i in range(len(embeddings)):
        summary_text = tokenizer.decode(test_ids[i], skip_special_tokens=True)
        decoded_summaries.append(summary_text)
    return decoded_summaries

In [66]:
# Assuming 'tokenizer' is your BERT tokenizer instance
decoded_summaries_bert = decode_summary_bert1(text_predicted_summary_bert, 
                                             test_ids,tokenizer)

In [67]:
# Print summary text for each sample
for i, summary_text in enumerate(decoded_summaries_bert):
    print(f"Sample {i+1} Summary: {summary_text} Len: {len(summary_text)}")
    print()

Sample 1 Summary: japan s nec corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales Len: 128

Sample 2 Summary: the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil separatists escalated in the north of the country Len: 191

Sample 3 Summary: police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said Len: 174



In [54]:
# Decode the generated summaries
def decode_summary_bert2(embeddings, tokenizer):
    decoded_summaries = []
    for i in range(len(embeddings)):
        summary_text = tokenizer.decode(test_ids[i], skip_special_tokens=True)
        decoded_summaries.append(summary_text)
    return decoded_summaries

In [55]:
# Assuming 'tokenizer' is your BERT tokenizer instance
decoded_summaries_bert = decode_summary_bert2(text_predicted_summary_bert, 
                                             tokenizer)

In [57]:
# Print summary text for each sample
for i, summary_text in enumerate(decoded_summaries_bert):
    print(f"Sample {i+1} Summary: {summary_text} Len: {len(summary_text)}")
    print()

Sample 1 Summary: japan s nec corp and unk computer corp of the united states said wednesday they had agreed to join forces in supercomputer sales Len: 128

Sample 2 Summary: the sri lankan government on wednesday announced the closure of government schools with immediate effect as a military campaign against tamil separatists escalated in the north of the country Len: 191

Sample 3 Summary: police arrested five antinuclear protesters thursday after they sought to disrupt loading of a french antarctic research and supply vessel a spokesman for the protesters said Len: 174



In [None]:
for i in text_predicted_summary_bert:
    a = tokenizer.encode()

In [48]:
from transformers import BertTokenizer, BertModel
model = BertModel.from_pretrained('bert-base-uncased')

In [49]:
import torch
# Load BERT tokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [50]:
text = ["Rhea is there","Water is there","Fire is there"]
encoding = tokenizer.batch_encode_plus( text,
                                      padding=True,
                                      truncation=True,
                                      return_tensors='pt',
                                      add_special_tokens=True)

In [51]:
print(encoding)

{'input_ids': tensor([[  101, 24775,  2003,  2045,   102],
        [  101,  2300,  2003,  2045,   102],
        [  101,  2543,  2003,  2045,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}


In [55]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0],skip_special_tokens=True))

['rhea', 'is', 'there']


In [41]:
marked_text = []
for i in text:
    marked_text.append("[CLS] "+i+" [SEP]")

In [42]:
print(marked_text)

['[CLS] Rhea is there [SEP]', '[CLS] Water is there [SEP]', '[CLS] Fire is there [SEP]']


In [43]:
tokenized_texts = []
indexed_tokens = []

In [44]:
for i in marked_text:
    tokenized_text = tokenizer.tokenize(i)
    print(tokenized_text)
    indexed_token  = tokenizer.convert_tokens_to_ids(tokenized_text)
    print(indexed_token)
    print()
    tokenized_texts.append(tokenized_text)
    indexed_tokens.append(indexed_token)

['[CLS]', 'rhea', 'is', 'there', '[SEP]']
[101, 24775, 2003, 2045, 102]

['[CLS]', 'water', 'is', 'there', '[SEP]']
[101, 2300, 2003, 2045, 102]

['[CLS]', 'fire', 'is', 'there', '[SEP]']
[101, 2543, 2003, 2045, 102]



In [45]:
print(tokenized_texts)

[['[CLS]', 'rhea', 'is', 'there', '[SEP]'], ['[CLS]', 'water', 'is', 'there', '[SEP]'], ['[CLS]', 'fire', 'is', 'there', '[SEP]']]


In [46]:
print(indexed_tokens)

[[101, 24775, 2003, 2045, 102], [101, 2300, 2003, 2045, 102], [101, 2543, 2003, 2045, 102]]


In [47]:
for i,j in zip(tokenized_texts, indexed_tokens):
    for k,l in zip(i,j):
        print('{:<12} {:>6,}'.format(k,l))

[CLS]           101
rhea         24,775
is            2,003
there         2,045
[SEP]           102
[CLS]           101
water         2,300
is            2,003
there         2,045
[SEP]           102
[CLS]           101
fire          2,543
is            2,003
there         2,045
[SEP]           102


In [21]:
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']


In [14]:
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    word_embeddings = outputs.last_hidden_state 
    print(word_embeddings.shape)
    print(len(word_embeddings))

torch.Size([3, 5, 768])
3


In [15]:
decoded_summary =[]
for i in range(len(input_ids)):
    decoded_text = tokenizer.decode(input_ids[i], skip_special_tokens=True)
    decoded_text = tokenizer.tokenize(decoded_text)
    decoded_summary.append(decoded_text)

In [16]:
for i in decoded_summary:
    print(i)

['rhea', 'is', 'there']
['water', 'is', 'there']
['fire', 'is', 'there']


In [17]:
for token,i in zip(decoded_text,word_embeddings[0]):
    print("Token : ",token)
    

Token :  fire
Token :  is
Token :  there
