In [1]:
import json
import spacy
import numpy as np
import pandas as pd

In [2]:
def load_data(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded file holding one JSON document per line.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. an extra newline at the end of
        # the file) are skipped; json.loads would raise on them.
        return [json.loads(line) for line in file if line.strip()]

In [3]:
# Locations of the Gigaword JSONL splits (relative paths, resolved against the
# kernel's current working directory).
train_path = "rl-sentence-compression/data/train-data/gigaword/train.jsonl"
val_path = "rl-sentence-compression/data/train-data/gigaword/val.jsonl"
test_path = "rl-sentence-compression/data/test-data/gigaword.jsonl"

In [4]:
# Read every split fully into memory as a list of parsed JSON records.
train_data, val_data, test_data = (
    load_data(split_path) for split_path in (train_path, val_path, test_path)
)

In [5]:
# Record counts, one per line, in the order train / val / test.
for split in (train_data, val_data, test_data):
    print(len(split))

1000000
189651
1951


In [6]:
# Work on a small prefix of each split to keep the notebook fast.
train_data = train_data[:4000]
val_data = val_data[:4000]
test_data = test_data[:20]

In [7]:
# Confirm the sizes after truncation (train / val / test).
for split in (train_data, val_data, test_data):
    print(len(split))

4000
4000
20


In [8]:
def remove_id(data):
    """Return new record dicts equal to those in ``data`` minus the "id" key.

    ``data`` is an iterable of dicts. The input records are not modified;
    records without an "id" key are copied unchanged.
    """
    return [
        {field: content for field, content in record.items() if field != "id"}
        for record in data
    ]

In [9]:
# Drop the "id" field from the records of every split.
train_data, test_data, val_data = map(
    remove_id, (train_data, test_data, val_data)
)

In [10]:
# One DataFrame per split; columns come from the record keys.
train_df, val_df, test_df = (
    pd.DataFrame(records) for records in (train_data, val_data, test_data)
)

In [11]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,text,summary
0,australia 's current account deficit shrunk by...,australian current account deficit narrows sha...
1,at least two people were killed in a suspected...,at least two dead in southern philippines blast
2,australian shares closed down #.# percent mond...,australian stocks close down #.# percent
3,south korea 's nuclear envoy kim sook urged no...,envoy urges north korea to restart nuclear dis...
4,south korea on monday announced sweeping tax r...,skorea announces tax cuts to stimulate economy


In [12]:
# The test split names its reference column 'summaries'; align it with the
# 'summary' column shown for the training frame.
test_df = test_df.rename(columns={'summaries': 'summary'})

In [13]:
print(type(test_df['text'][0]))
print(type(test_df['summary'][0]))


def _first_summary(refs):
    """Return the first reference summary as a string.

    Lists/tuples yield their first element ('' when empty); any other value is
    returned as ``str(value)``. Because strings pass through unchanged, the
    cell can be re-run safely -- the previous ``.str[0]`` would reduce an
    already-converted summary to its first character on a second run.
    """
    if isinstance(refs, (list, tuple)):
        return str(refs[0]) if refs else ''
    return str(refs)


# Each test record holds a list of reference summaries; keep only the first.
test_df['summary'] = test_df['summary'].map(_first_summary)
test_df.head(2)

<class 'str'>
<class 'list'>


Unnamed: 0,text,summary
0,japan 's nec corp. and UNK computer corp. of t...,nec UNK in computer sales tie-up
1,the sri lankan government on wednesday announc...,sri lanka closes schools as war escalates


In [14]:
def is_integer(value):
    """Return True when ``int(value)`` succeeds, False otherwise.

    Parameters
    ----------
    value : object
        Any value; typically a cell of a DataFrame row.

    Returns
    -------
    bool
        True if ``value`` can be converted with ``int()``. Values that
        ``int()`` rejects with ``ValueError`` (e.g. "abc", NaN), ``TypeError``
        (e.g. None, lists) or ``OverflowError`` (infinities) give False
        instead of propagating the exception.
    """
    try:
        int(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [15]:
# Keep only the rows in which no cell parses as an integer. The mask is built
# in a single pass and applied once, instead of dropping rows from the frame
# one by one while it is being iterated.
rows_to_keep = [
    not any(is_integer(value) for value in row)
    for row in train_df.itertuples(index=False, name=None)
]
train_df = train_df.loc[rows_to_keep].copy()

In [16]:
# Keep only the rows in which no cell parses as an integer. The mask is built
# in a single pass and applied once, instead of dropping rows from the frame
# one by one while it is being iterated.
rows_to_keep = [
    not any(is_integer(value) for value in row)
    for row in val_df.itertuples(index=False, name=None)
]
val_df = val_df.loc[rows_to_keep].copy()

In [17]:
# Keep only the rows in which no cell parses as an integer. The mask is built
# in a single pass and applied once, instead of dropping rows from the frame
# one by one while it is being iterated.
rows_to_keep = [
    not any(is_integer(value) for value in row)
    for row in test_df.itertuples(index=False, name=None)
]
test_df = test_df.loc[rows_to_keep].copy()

In [18]:
# Renumber rows 0..n-1 after the filtering above; the old index is discarded.
train_df, val_df, test_df = (
    frame.reset_index(drop=True) for frame in (train_df, val_df, test_df)
)

In [19]:
# Final working-set sizes. `.copy()` detaches each result from its parent
# frame so that the column assignments performed during cleaning do not raise
# SettingWithCopyWarning.
train_df = train_df.head(2000).copy()
val_df = val_df.head(2000).copy()
test_df = test_df.head(3).copy()

In [None]:
# Preprocessing: Clean text data
def clean_text(df):
    """Return a cleaned copy of ``df``: lower-cased, punctuation-free, trimmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all hold strings (every column is processed
        through the ``.str`` accessor).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.
    """
    cleaned = df.copy()
    for col in cleaned.columns:
        cleaned[col] = (
            cleaned[col]
            .str.lower()
            # regex=True must be explicit: from pandas 2.0 on, str.replace
            # treats the pattern as a literal string by default, which would
            # leave all punctuation in place.
            .str.replace(r'[^\w\s]+', '', regex=True)
            # Trim last, so whitespace exposed by the removed punctuation
            # (e.g. a trailing " .") is stripped as well.
            .str.strip()
        )
    return cleaned

In [None]:
train_df = clean_text(train_df)
val_df = clean_text(val_df)
test_df = clean_text(test_df)

# This cell used to repeat clean_text's three steps (lower / strip / replace)
# inline for every frame. After clean_text has run, a second lower-casing and
# a second replacement cannot change anything; only the second trim can still
# have an effect (whitespace exposed by removed punctuation), so that is the
# single step kept here.
for frame in (train_df, val_df, test_df):
    for column in frame.columns:
        frame[column] = frame[column].str.strip()

In [25]:
from transformers import BertTokenizer, BertModel
import torch
import tensorflow as tf
from tensorflow.keras.layers import Input, Dropout, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from sklearn.metrics import classification_report

In [23]:
# Maximum sequence length.
# NOTE(review): despite the name, this value is passed as `max_length` to the
# BERT tokenizer by the tokenisation cells, so it caps tokens rather than
# whitespace-separated words -- confirm that 150 is the intended token budget.
max_len_word = 150

In [24]:
# Load BERT tokenizer (the 'bert-base-uncased' vocabulary; fetched by
# `from_pretrained`, so the first run presumably needs network access or a
# local cache).
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [26]:
# Tokenize text and summary data
def tokenize_text(text, max_length):
    """Tokenize ``text`` with the module-level BERT ``tokenizer``.

    Parameters
    ----------
    text : str or iterable of str
        A single sentence, or a batch (list, tuple, pandas Series, ...).
    max_length : int
        Every sequence is padded or truncated to exactly this many tokens.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask)`` as returned by the tokenizer with
        ``return_tensors='pt'``.
    """
    # The tokenizer validates its input type and accepts only str or
    # list/tuple; the callers in this notebook pass pandas Series, so any
    # non-string iterable is materialised as a list first.
    if not isinstance(text, str):
        text = list(text)
    tokens = tokenizer(text, max_length=max_length, padding="max_length",
                       truncation=True, return_tensors='pt')
    return tokens['input_ids'], tokens['attention_mask']

In [None]:
# Token ids and attention masks for the 'text' column of each split, every
# sequence padded/truncated to max_len_word tokens.
text_train_input_ids, text_train_mask = tokenize_text(train_df['text'], 
                                                      max_len_word)
text_val_input_ids, text_val_mask = tokenize_text(val_df['text'], 
                                                  max_len_word)
text_test_input_ids, text_test_mask = tokenize_text(test_df['text'], 
                                                    max_len_word)

In [None]:
def tokenize_summary(summary, max_length):
    """Tokenize summaries exactly as ``tokenize_text`` tokenizes source text.

    Parameters
    ----------
    summary : str or iterable of str
        A single summary, or a batch (list, tuple, pandas Series, ...).
    max_length : int
        Every sequence is padded or truncated to exactly this many tokens.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask)``.
    """
    # The tokenizer rejects pandas Series, so hand it a plain list.
    if not isinstance(summary, str):
        summary = list(summary)
    # The previous body duplicated tokenize_text; its only extra argument,
    # add_special_tokens=True, is already the tokenizer's default.
    return tokenize_text(summary, max_length)

In [None]:
# Encode summary data
# Token ids and attention masks for the 'summary' column of each split, padded
# or truncated to the same max_len_word budget as the source text.
summary_train_input_ids, summary_train_mask = tokenize_summary(train_df['summary'], max_len_word)
summary_val_input_ids, summary_val_mask = tokenize_summary(val_df['summary'], max_len_word)
summary_test_input_ids, summary_test_mask = tokenize_summary(test_df['summary'], max_len_word)

In [None]:
def encode_sentences(sentences, encoder=None, batch_size=32):
    """Embed sentences with BERT and return per-token hidden states.

    Parameters
    ----------
    sentences : str or iterable of str
        One sentence, or a batch (list, tuple, pandas Series, ...).
    encoder : transformers.BertModel, optional
        Model used for the forward pass. When omitted, a 'bert-base-uncased'
        ``BertModel`` is loaded on first use and cached on this function.
    batch_size : int, default 32
        Number of sentences sent through the encoder per forward pass.

    Returns
    -------
    tuple
        ``(word_embeddings, input_ids)``: the encoder's ``last_hidden_state``
        for all sentences concatenated along the first dimension, and the
        token ids the tokenizer produced (padded to the longest sentence).

    Raises
    ------
    ValueError
        If ``sentences`` is empty or ``batch_size`` is not positive.
    """
    sentences = [sentences] if isinstance(sentences, str) else list(sentences)
    if not sentences:
        raise ValueError("sentences must not be empty")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if encoder is None:
        # The original body called the global `model`, which is undefined when
        # the encoding cells run and is later bound to a Keras model. Load a
        # dedicated BERT encoder once and reuse it across calls.
        encoder = getattr(encode_sentences, "_cached_encoder", None)
        if encoder is None:
            encoder = BertModel.from_pretrained('bert-base-uncased')
            encode_sentences._cached_encoder = encoder

    # Tokenize everything in one call so that all rows share the same padded
    # length and the per-batch outputs can be concatenated.
    encoding = tokenizer.batch_encode_plus(sentences,
                                           padding=True,
                                           truncation=True,
                                           return_tensors='pt',
                                           add_special_tokens=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    hidden_states = []
    with torch.no_grad():
        # Run the encoder in slices instead of one pass over the whole corpus
        # to keep peak memory bounded.
        for start in range(0, input_ids.shape[0], batch_size):
            stop = start + batch_size
            outputs = encoder(input_ids[start:stop],
                              attention_mask=attention_mask[start:stop])
            hidden_states.append(outputs.last_hidden_state)
    word_embeddings = torch.cat(hidden_states, dim=0)
    return word_embeddings, input_ids

In [None]:
# Encode text data
# BERT embeddings and token ids for the 'text' column of each split.
# NOTE(review): encode_sentences pads with padding=True, so the sequence length
# presumably differs between splits -- confirm downstream shape expectations.
text_embeddings_train_bert,train_ids = encode_sentences(train_df['text'])
text_embeddings_val_bert,val_ids = encode_sentences(val_df['text'])
text_embeddings_test_bert,test_ids = encode_sentences(test_df['text'])

In [None]:
# Encode summary data
# BERT embeddings and token ids for the 'summary' column of each split.
# (Fixes the misspelled `summay_train_id` so the three id names are parallel.)
summary_embeddings_train_bert, summary_train_id = encode_sentences(train_df['summary'])
summary_embeddings_val_bert, summary_val_id = encode_sentences(val_df['summary'])
summary_embeddings_test_bert, summary_test_id = encode_sentences(test_df['summary'])

In [None]:
# Pad the summary embeddings to a fixed sequence length.
# Fixes relative to the original cell:
#   * `pad_sequences` was never imported -> reached through the `tf` import.
#   * `max_sequence_length` was never defined -> use `max_len_word`.
#   * dtype='float32' is explicit because pad_sequences defaults to int32,
#     which would truncate the floating-point embeddings.
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre') -- confirm that is acceptable for over-long sequences.
summary_embeddings_train_bert = tf.keras.preprocessing.sequence.pad_sequences(
    np.asarray(summary_embeddings_train_bert), maxlen=max_len_word,
    padding='post', dtype='float32')
summary_embeddings_val_bert = tf.keras.preprocessing.sequence.pad_sequences(
    np.asarray(summary_embeddings_val_bert), maxlen=max_len_word,
    padding='post', dtype='float32')
summary_embeddings_test_bert = tf.keras.preprocessing.sequence.pad_sequences(
    np.asarray(summary_embeddings_test_bert), maxlen=max_len_word,
    padding='post', dtype='float32')

In [None]:
# Combine text and summary embeddings for training.
# NOTE(review): axis=1 joins the arrays along their second dimension, whereas
# the input shape declared for the combined model uses 768 * 2 as its last
# dimension -- these look inconsistent; confirm which axis is intended.
combined_train_input = np.concatenate((text_embeddings_train_bert, summary_embeddings_train_bert), axis=1)
combined_val_input = np.concatenate((text_embeddings_val_bert, summary_embeddings_val_bert), axis=1)
combined_test_input = np.concatenate((text_embeddings_test_bert, summary_embeddings_test_bert), axis=1)

In [None]:
# Define input shape for combined input: (sequence length, feature width).
# `max_sequence_length` was never defined in this notebook; `max_len_word` is
# the sequence-length constant the rest of the notebook uses.
# NOTE(review): 768 * 2 presumably stands for two BERT hidden vectors side by
# side -- confirm against how the embeddings are concatenated.
combined_input_shape = (max_len_word, 768 * 2)

In [None]:
# Define input layer for combined input.
combined_input = Input(shape=combined_input_shape)
# Running tensor that later layers are applied to; starts as the raw input.
combined_embeddings = combined_input

In [None]:
# Define model architecture hyper-parameters.
num_bert_layers = 12  # number of outer iterations of the dense stack built below
dropout_rate = 0.1    # rate passed to every Dropout layer

In [None]:
# Keras inputs for integer sequences of length max_len_word, one for the
# source text and one for the summary.
input_text = Input(shape=(max_len_word,), dtype='int32')
input_summary = Input(shape=(max_len_word,), dtype='int32')

In [None]:
# Load the pretrained 'bert-base-uncased' encoder.
# NOTE(review): `BertModel` is imported from `transformers` next to `torch`,
# while the surrounding cells build a tf.keras graph -- confirm whether the
# TensorFlow variant (`TFBertModel`) was intended here.
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Take element 1 of the BERT output for each input (per the Hugging Face docs
# this is the pooled sentence representation).
# The first line previously had a stray `Concatenate(...)` expression pasted
# into the middle of `bert_model`, which made the whole cell a syntax error.
text_embeddings = bert_model(input_text)[1]
summary_embeddings = bert_model(input_summary)[1]

In [None]:
# Join the text and summary representations along axis 1.
# `Concatenate` is not among the layers imported by this notebook, so it is
# referenced through the existing `tf` import.
merged_embeddings = tf.keras.layers.Concatenate(axis=1)([text_embeddings, summary_embeddings])

In [None]:
# Feed-forward stack: each outer iteration applies dropout and then three
# sub-blocks of Dense(relu) -> Dropout -> Dense. Every Dense keeps the feature
# width unchanged, so the width is read once up front.
hidden_width = merged_embeddings.shape[-1]
for _layer in range(num_bert_layers):
    merged_embeddings = Dropout(dropout_rate)(merged_embeddings)
    for _sub_block in range(3):
        merged_embeddings = Dense(hidden_width, activation="relu")(merged_embeddings)
        merged_embeddings = Dropout(dropout_rate)(merged_embeddings)
        merged_embeddings = Dense(hidden_width)(merged_embeddings)

In [None]:
# Define output layer: a single sigmoid unit on top of the dense stack.
# The `Model(...)` construction that had been pasted above this line ran before
# `output` existed (NameError); the identical statement lives in the next cell,
# so it is dropped from this one.
output = Dense(1, activation='sigmoid')(merged_embeddings)

In [None]:
# Build the model from the two token-id inputs and the sigmoid output
# (compilation happens in a separate cell).
model = Model(inputs=[input_text, input_summary], outputs=output)

In [None]:
# Compile the model.
# `optimizer` was not defined before this cell (its only assignment is in a
# later cell), so it is created here with the same settings.
optimizer = Adam(learning_rate=1e-5, epsilon=1e-8)
model.compile(optimizer=optimizer, loss=binary_crossentropy,
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the two-input model.
model.summary()

In [None]:
# Training
# The original call used `text_*_input` (never defined) and `summary_*_input`
# (only assigned in a later cell); the token-id tensors produced by the
# tokenisation cells are named `*_input_ids`.
# np.asarray converts the tokenizer's return_tensors='pt' outputs for Keras.
# NOTE(review): the attention mask is used as the training target -- confirm
# that this matches the model's output shape and the intended objective.
model.fit([np.asarray(text_train_input_ids), np.asarray(summary_train_input_ids)],
          np.asarray(text_train_mask),
          validation_data=([np.asarray(text_val_input_ids),
                            np.asarray(summary_val_input_ids)],
                           np.asarray(text_val_mask)),
          epochs=10, batch_size=32)

In [None]:
# Evaluation
# Evaluate the model on test data. The original call used `text_test_input`
# (never defined) and `summary_test_input` (only assigned in a later cell);
# the tokenisation cells name these tensors `*_input_ids`.
loss, accuracy = model.evaluate(
    [np.asarray(text_test_input_ids), np.asarray(summary_test_input_ids)],
    np.asarray(text_test_mask))
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [None]:
# Predict on the test inputs. Uses the `*_input_ids` tensors from the
# tokenisation cells; the original `text_test_input` is never defined and
# `summary_test_input` is only assigned in a later cell.
predicted_summary = model.predict(
    [np.asarray(text_test_input_ids), np.asarray(summary_test_input_ids)])

In [None]:
# NOTE(review): `decode_summary` is not defined anywhere in the visible
# notebook (only `decode_combined_summary` is, in a later cell) -- this cell
# presumably fails on a fresh kernel; confirm which helper was intended.
decoded_summaries = decode_summary(predicted_summary, tokenizer)

In [None]:
# Show every decoded summary with its 1-based sample number and its length in
# characters, followed by an empty line.
for i, summary_text in enumerate(decoded_summaries):
    print(f"Sample {i+1} Summary: {summary_text} Len: {len(summary_text)}",
          end="\n\n")

In [None]:
# Tokenize and encode summary data.
# These calls produce the same kind of ids/masks as the earlier
# tokenize_summary cell, under the names `summary_*_input`; note that
# `summary_*_mask` is overwritten here.
summary_train_input, summary_train_mask = tokenize_text(train_df['summary'], max_len_word)
summary_val_input, summary_val_mask = tokenize_text(val_df['summary'], max_len_word)
summary_test_input, summary_test_mask = tokenize_text(test_df['summary'], max_len_word)


In [None]:
# Encode summary data (BERT embeddings only; the returned token ids are
# discarded). This recomputes and overwrites `summary_embeddings_*_bert`.
summary_embeddings_train_bert, _ = encode_sentences(train_df['summary'])
summary_embeddings_val_bert, _ = encode_sentences(val_df['summary'])
summary_embeddings_test_bert, _ = encode_sentences(test_df['summary'])

In [None]:
# Pad the summary embeddings to a fixed sequence length.
# Fixes relative to the original cell:
#   * `pad_sequences` was never imported -> reached through the `tf` import.
#   * `max_sequence_length` was never defined -> use `max_len_word`.
#   * dtype='float32' is explicit because pad_sequences defaults to int32,
#     which would truncate the floating-point embeddings.
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre') -- confirm that is acceptable for over-long sequences.
summary_embeddings_train_bert = tf.keras.preprocessing.sequence.pad_sequences(
    np.asarray(summary_embeddings_train_bert), maxlen=max_len_word,
    padding='post', dtype='float32')
summary_embeddings_val_bert = tf.keras.preprocessing.sequence.pad_sequences(
    np.asarray(summary_embeddings_val_bert), maxlen=max_len_word,
    padding='post', dtype='float32')
summary_embeddings_test_bert = tf.keras.preprocessing.sequence.pad_sequences(
    np.asarray(summary_embeddings_test_bert), maxlen=max_len_word,
    padding='post', dtype='float32')

In [None]:
# Combine text and summary embeddings for training.
# NOTE(review): axis=1 joins the arrays along their second dimension, whereas
# the input shape declared for the combined model uses 768 * 2 as its last
# dimension -- these look inconsistent; confirm which axis is intended.
combined_train_input = np.concatenate((text_embeddings_train_bert, summary_embeddings_train_bert), axis=1)
combined_val_input = np.concatenate((text_embeddings_val_bert, summary_embeddings_val_bert), axis=1)
combined_test_input = np.concatenate((text_embeddings_test_bert, summary_embeddings_test_bert), axis=1)

In [None]:
# Define input shape for combined input: (sequence length, feature width).
# `max_sequence_length` was never defined in this notebook; `max_len_word` is
# the sequence-length constant the rest of the notebook uses.
# NOTE(review): 768 * 2 presumably stands for two BERT hidden vectors side by
# side -- confirm against how the embeddings are concatenated.
combined_input_shape = (max_len_word, 768 * 2)

In [None]:
# Define input layer for combined input.
combined_input = Input(shape=combined_input_shape)
# Running tensor that the dense stack below is applied to; starts as the raw
# input, which is also kept for the skip connections.
combined_embeddings = combined_input

In [None]:
# Model architecture hyper-parameters for the combined-input model.
num_bert_layers = 12  # number of outer iterations of the dense stack built below
dropout_rate = 0.1    # rate passed to every Dropout layer

In [None]:
# Dense stack over the combined input: each outer iteration applies dropout,
# then three sub-blocks of Dense(relu) -> Dropout -> Dense, each followed by a
# skip connection.
for _ in range(num_bert_layers):
    combined_embeddings = Dropout(dropout_rate)(combined_embeddings)
    for _ in range(3):
        # Both Dense layers keep the current feature width.
        combined_embeddings = Dense(combined_embeddings.shape[-1], activation="relu")(combined_embeddings)
        combined_embeddings = Dropout(dropout_rate)(combined_embeddings)
        combined_embeddings = Dense(combined_embeddings.shape[-1])(combined_embeddings)
        
        # Add skip connection. Note that it always adds the original model
        # input (`combined_input`), not the input of the current sub-block.
        combined_embeddings = combined_embeddings + combined_input

In [None]:
# Final feedforward layer with sigmoid activation: 150 units on top of the
# dense stack.
# NOTE(review): 150 equals max_len_word -- presumably one score per token
# position; confirm, and consider using the constant instead of the literal.
combined_output = Dense(150, activation="sigmoid")(combined_embeddings)

In [None]:
# Build the single-input model that maps the combined embeddings to the
# sigmoid output.
model_combined = Model(inputs=combined_input, outputs=combined_output)

In [None]:
# Compile the model.
# `binary_accuracy` was referenced as a bare name but never imported; Keras
# resolves the metric from its string identifier instead.
optimizer = Adam(learning_rate=1e-5, epsilon=1e-8)
model_combined.compile(optimizer=optimizer, loss=binary_crossentropy,
                       metrics=['binary_accuracy'])

In [None]:
# Print the layer-by-layer summary of the combined-input model.
model_combined.summary()

In [None]:
# Train the model with combined inputs, using the text attention masks as
# targets.
# NOTE(review): `text_*_mask` comes from the tokenizer with
# return_tensors='pt' -- presumably torch tensors; confirm Keras accepts them
# and that their shape matches the model output.
model_combined.fit(combined_train_input, text_train_mask, 
                   validation_data=(combined_val_input, text_val_mask), 
                   epochs=10, batch_size=16)

In [None]:
# Generate summaries for test data using the trained model
combined_predicted_summary = model_combined.predict(combined_test_input)

In [None]:
# Decode the generated summaries
def decode_combined_summary(embeddings, tokenizer):
    """Decode every entry of ``embeddings`` into text with ``tokenizer``.

    Each entry is passed to ``tokenizer.decode(..., skip_special_tokens=True)``
    and the resulting strings are returned as a list in input order.
    """
    return [
        tokenizer.decode(entry, skip_special_tokens=True)
        for entry in embeddings
    ]

In [None]:
# Decode the predictions with the BERT tokenizer instance loaded earlier.
# NOTE(review): `combined_predicted_summary` holds sigmoid outputs of the
# model, while `tokenizer.decode` presumably expects integer token ids --
# confirm how predictions are meant to be mapped back to tokens.
decoded_combined_summaries=decode_combined_summary(combined_predicted_summary,
                                                   tokenizer)

In [None]:
# Show every decoded summary with its 1-based sample number and its length in
# characters, followed by an empty line.
for i, summary_text in enumerate(decoded_combined_summaries):
    print(f"Sample {i+1} Summary: {summary_text} Len: {len(summary_text)}",
          end="\n\n")