In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
def load_data(file_path):
    """Read a JSON-lines file into a list of decoded records.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (for example a trailing newline at the end of the file)
        # are not JSON documents, so skip them instead of failing on them.
        data = [json.loads(line) for line in file if line.strip()]
    return data

In [3]:
# Relative paths of the Gigaword JSON-lines files used for the train, validation and test splits.
train_path = "rl-sentence-compression/data/train-data/gigaword/train.jsonl"
val_path = "rl-sentence-compression/data/train-data/gigaword/val.jsonl"
test_path = "rl-sentence-compression/data/test-data/gigaword.jsonl"

In [4]:
# Read the three splits in train, validation, test order.
train_data, val_data, test_data = [
    load_data(split_path) for split_path in (train_path, val_path, test_path)
]

In [5]:
# Report how many records each split holds, one count per line.
print(len(train_data), len(val_data), len(test_data), sep="\n")

1000000
189651
1951


In [6]:
# Keep only the leading records of each split so the rest of the notebook stays cheap to run.
train_data = train_data[:10000]
val_data = val_data[:10000]
test_data = test_data[:20]

In [7]:
# Confirm the sizes after subsetting, one count per line.
print(len(train_data), len(val_data), len(test_data), sep="\n")

10000
10000
20


In [8]:
def remove_id(data):
    """Return copies of the records in ``data`` without their ``"id"`` entry.

    Args:
        data: Iterable of dict records.

    Returns:
        A new list of new dicts; the input records are left untouched.
    """
    return [
        {field: content for field, content in record.items() if field != "id"}
        for record in data
    ]

In [9]:
# Strip the "id" field from every split.
train_data, test_data, val_data = [
    remove_id(split) for split in (train_data, test_data, val_data)
]

In [10]:
# Turn each list of records into a DataFrame (one row per record).
train_df, val_df, test_df = [
    pd.DataFrame(records) for records in (train_data, val_data, test_data)
]

In [11]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,text,summary
0,australia 's current account deficit shrunk by...,australian current account deficit narrows sha...
1,at least two people were killed in a suspected...,at least two dead in southern philippines blast
2,australian shares closed down #.# percent mond...,australian stocks close down #.# percent
3,south korea 's nuclear envoy kim sook urged no...,envoy urges north korea to restart nuclear dis...
4,south korea on monday announced sweeping tax r...,skorea announces tax cuts to stimulate economy


In [12]:
# The test split names its reference column 'summaries'; align it with the other splits.
test_df.rename(columns={'summaries': 'summary'}, inplace=True)

In [13]:
print(type(test_df['text'][0]))
print(type(test_df['summary'][0]))

# Test summaries arrive as lists; keep the first entry of each list.
# Only list-valued cells are touched, so re-running this cell leaves the
# already-flattened strings alone instead of cutting them down to their first character.
summary_is_list = test_df['summary'].map(lambda value: isinstance(value, list))
test_df.loc[summary_is_list, 'summary'] = test_df.loc[summary_is_list, 'summary'].str[0]
test_df['summary'] = test_df['summary'].astype(str)
test_df.head(2)

<class 'str'>
<class 'list'>


Unnamed: 0,text,summary
0,japan 's nec corp. and UNK computer corp. of t...,nec UNK in computer sales tie-up
1,the sri lankan government on wednesday announc...,sri lanka closes schools as war escalates


In [14]:
def is_integer(value):
    """Report whether ``int(value)`` succeeds.

    Args:
        value: Any object.

    Returns:
        True when ``int(value)`` does not raise, False otherwise.
    """
    try:
        int(value)
        return True
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None, lists and other
        # unsupported types; OverflowError: infinite floats.
        return False

In [15]:
# Drop every training row in which at least one cell parses as an integer.
# The offending labels are collected first and removed in a single call, so the
# frame is not modified while it is being iterated and is not rebuilt per row.
train_has_integer = np.array(
    [any(is_integer(value) for value in row) for _, row in train_df.iterrows()],
    dtype=bool,
)
train_df.drop(index=train_df.index[train_has_integer], inplace=True)

In [16]:
# Drop every validation row in which at least one cell parses as an integer.
# The offending labels are collected first and removed in a single call, so the
# frame is not modified while it is being iterated and is not rebuilt per row.
val_has_integer = np.array(
    [any(is_integer(value) for value in row) for _, row in val_df.iterrows()],
    dtype=bool,
)
val_df.drop(index=val_df.index[val_has_integer], inplace=True)

In [17]:
# Drop every test row in which at least one cell parses as an integer.
# The offending labels are collected first and removed in a single call, so the
# frame is not modified while it is being iterated and is not rebuilt per row.
test_has_integer = np.array(
    [any(is_integer(value) for value in row) for _, row in test_df.iterrows()],
    dtype=bool,
)
test_df.drop(index=test_df.index[test_has_integer], inplace=True)

In [18]:
# Renumber the rows of every split from zero after the row filtering above.
for frame in (train_df, val_df, test_df):
    frame.reset_index(drop=True, inplace=True)

In [19]:
# Work with the first rows of each split only.
train_df, val_df, test_df = train_df.head(2000), val_df.head(2000), test_df.head(10)

In [20]:
# Preprocessing: Clean text data
def clean_text(df):
    """Lower-case, trim and strip punctuation from every column of ``df``.

    Args:
        df: DataFrame whose columns all hold strings (the ``.str`` accessor is
            applied to each column).

    Returns:
        The same DataFrame object, modified in place.
    """
    for col in df.columns:
        df[col] = df[col].str.lower()
        df[col] = df[col].str.strip()
        # regex=True is required: without it newer pandas treats the pattern as
        # a literal string and leaves all punctuation in place.
        df[col] = df[col].str.replace(r'[^\w\s]+', '', regex=True)
    return df

In [21]:
# Normalise the text of every split (train, then validation, then test).
train_df, val_df, test_df = [
    clean_text(frame) for frame in (train_df, val_df, test_df)
]

  df[col] = df[col].str.replace(r'[^\w\s]+', '')


In [22]:
from transformers import BertTokenizer, BertModel
import tensorflow as tf

In [23]:
# Maximum sentence length in words; used below as the sequence axis of the Keras input shape.
max_len_word = 100
# Maximum number of wordpieces per sentence; read by the embedding post-processing cells.
max_wordpiece_len = 150

In [24]:
# Load the pretrained 'bert-base-uncased' tokenizer used by the encoding and label helpers.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [25]:
# Load the pretrained 'bert-base-uncased' encoder that produces the token embeddings.
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [26]:
import torch
# Pick the GPU when torch can see one, otherwise the CPU, and display the choice.
# NOTE(review): no visible cell moves bert_model or its inputs to this device — confirm whether that is intended.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [27]:
# Function to encode sentences with padding and truncation
def encode_sentences(sentences, max_length=None, batch_size=32):
    """Tokenize ``sentences`` and return their BERT token embeddings.

    Args:
        sentences: Iterable of strings (a list or a pandas Series).
        max_length: Length every sequence is padded/truncated to. ``None``
            (default) lets the tokenizer fall back to the model's maximum length.
        batch_size: Number of sentences sent through BERT per forward pass, so
            a large corpus does not have to fit in memory at once.

    Returns:
        A tuple ``(word_embeddings, input_ids)`` of torch tensors on the CPU:
        the encoder's ``last_hidden_state`` for every sentence and the matching
        token ids.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    sentences = list(sentences)
    if not sentences:
        raise ValueError("encode_sentences needs at least one sentence")

    # Run on whatever device the encoder currently lives on.
    model_device = next(bert_model.parameters()).device
    embedding_batches = []
    id_batches = []

    # Only the embeddings are needed, so do not record gradients.
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            encoding = tokenizer.batch_encode_plus(sentences[start:start + batch_size],
                                                   padding='max_length',
                                                   truncation=True,
                                                   max_length=max_length,
                                                   return_tensors='pt',
                                                   add_special_tokens=True)
            input_ids = encoding['input_ids']
            # The tokenizer's mask (1 = real token, 0 = padding) is what the model
            # expects; -100 is a label ignore-index, not a valid mask value.
            attention_mask = encoding['attention_mask']

            hidden_states = bert_model(input_ids.to(model_device),
                                       attention_mask=attention_mask.to(model_device)).last_hidden_state
            embedding_batches.append(hidden_states.cpu())
            id_batches.append(input_ids)

    # padding='max_length' gives every batch the same sequence length, so the
    # batches can be concatenated along the first axis.
    word_embeddings = torch.cat(embedding_batches, dim=0)
    input_ids = torch.cat(id_batches, dim=0)
    return word_embeddings, input_ids

In [None]:
# Encode the training articles with BERT.
text_embeddings_train_bert, train_ids = encode_sentences(train_df['text'])

# encode_sentences returns torch tensors, while the post-processing below uses
# TensorFlow ops, so hand the values over as NumPy-backed TF tensors first.
text_embeddings_train_bert = tf.convert_to_tensor(text_embeddings_train_bert.detach().cpu().numpy())
train_is_padding = tf.convert_to_tensor(train_ids.cpu().numpy() == tokenizer.pad_token_id)

# Set padding to zero vectors; the extra trailing axis lets the per-token mask
# broadcast across the embedding dimension.
text_embeddings_train_bert = tf.where(train_is_padding[:, :, None], tf.zeros_like(text_embeddings_train_bert), text_embeddings_train_bert)
text_embeddings_train_bert = tf.clip_by_value(text_embeddings_train_bert, clip_value_min=0.0, clip_value_max=1.0)  # Clip for stability

# Zero every position at or beyond max_wordpiece_len (optional, adjust as needed).
# NOTE(review): implemented as masking with the sequence length unchanged — confirm that slicing was not intended.
if max_wordpiece_len:
    train_seq_len = text_embeddings_train_bert.shape[1]
    train_keep = tf.sequence_mask(min(max_wordpiece_len, train_seq_len), maxlen=train_seq_len, dtype=tf.float32)
    text_embeddings_train_bert = train_keep[None, :, None] * text_embeddings_train_bert

In [None]:
# Encode the validation articles with BERT.
text_embeddings_val_bert, val_ids = encode_sentences(val_df['text'])

# encode_sentences returns torch tensors, while the post-processing below uses
# TensorFlow ops, so hand the values over as NumPy-backed TF tensors first.
text_embeddings_val_bert = tf.convert_to_tensor(text_embeddings_val_bert.detach().cpu().numpy())
val_is_padding = tf.convert_to_tensor(val_ids.cpu().numpy() == tokenizer.pad_token_id)

# Set padding to zero vectors; the extra trailing axis lets the per-token mask
# broadcast across the embedding dimension.
text_embeddings_val_bert = tf.where(val_is_padding[:, :, None], tf.zeros_like(text_embeddings_val_bert), text_embeddings_val_bert)
text_embeddings_val_bert = tf.clip_by_value(text_embeddings_val_bert, clip_value_min=0.0, clip_value_max=1.0)  # Clip for stability

# Zero every position at or beyond max_wordpiece_len (optional, adjust as needed).
# NOTE(review): implemented as masking with the sequence length unchanged — confirm that slicing was not intended.
if max_wordpiece_len:
    val_seq_len = text_embeddings_val_bert.shape[1]
    val_keep = tf.sequence_mask(min(max_wordpiece_len, val_seq_len), maxlen=val_seq_len, dtype=tf.float32)
    text_embeddings_val_bert = val_keep[None, :, None] * text_embeddings_val_bert

In [None]:
# Encode the test articles with BERT.
text_embeddings_test_bert, test_ids = encode_sentences(test_df['text'])

# encode_sentences returns torch tensors, while the post-processing below uses
# TensorFlow ops, so hand the values over as NumPy-backed TF tensors first.
text_embeddings_test_bert = tf.convert_to_tensor(text_embeddings_test_bert.detach().cpu().numpy())
test_is_padding = tf.convert_to_tensor(test_ids.cpu().numpy() == tokenizer.pad_token_id)

# Set padding to zero vectors; the extra trailing axis lets the per-token mask
# broadcast across the embedding dimension.
text_embeddings_test_bert = tf.where(test_is_padding[:, :, None], tf.zeros_like(text_embeddings_test_bert), text_embeddings_test_bert)
text_embeddings_test_bert = tf.clip_by_value(text_embeddings_test_bert, clip_value_min=0.0, clip_value_max=1.0)  # Clip for stability

# Zero every position at or beyond max_wordpiece_len (optional, adjust as needed).
# NOTE(review): implemented as masking with the sequence length unchanged — confirm that slicing was not intended.
if max_wordpiece_len:
    test_seq_len = text_embeddings_test_bert.shape[1]
    test_keep = tf.sequence_mask(min(max_wordpiece_len, test_seq_len), maxlen=test_seq_len, dtype=tf.float32)
    text_embeddings_test_bert = test_keep[None, :, None] * text_embeddings_test_bert

In [None]:
# Encode the training summaries with BERT.
summary_embeddings_train_bert, strain_ids = encode_sentences(train_df['summary'])

# encode_sentences returns torch tensors, while the post-processing below uses
# TensorFlow ops, so hand the values over as NumPy-backed TF tensors first.
summary_embeddings_train_bert = tf.convert_to_tensor(summary_embeddings_train_bert.detach().cpu().numpy())
strain_is_padding = tf.convert_to_tensor(strain_ids.cpu().numpy() == tokenizer.pad_token_id)

# Set padding to zero vectors; the extra trailing axis lets the per-token mask
# broadcast across the embedding dimension.
summary_embeddings_train_bert = tf.where(strain_is_padding[:, :, None], tf.zeros_like(summary_embeddings_train_bert), summary_embeddings_train_bert)
summary_embeddings_train_bert = tf.clip_by_value(summary_embeddings_train_bert, clip_value_min=0.0, clip_value_max=1.0)  # Clip for stability

# Zero every position at or beyond max_wordpiece_len (optional, adjust as needed).
# NOTE(review): implemented as masking with the sequence length unchanged — confirm that slicing was not intended.
if max_wordpiece_len:
    strain_seq_len = summary_embeddings_train_bert.shape[1]
    strain_keep = tf.sequence_mask(min(max_wordpiece_len, strain_seq_len), maxlen=strain_seq_len, dtype=tf.float32)
    summary_embeddings_train_bert = strain_keep[None, :, None] * summary_embeddings_train_bert

In [None]:
# Encode the validation summaries with BERT.
summary_embeddings_val_bert, sval_ids = encode_sentences(val_df['summary'])

# encode_sentences returns torch tensors, while the post-processing below uses
# TensorFlow ops, so hand the values over as NumPy-backed TF tensors first.
summary_embeddings_val_bert = tf.convert_to_tensor(summary_embeddings_val_bert.detach().cpu().numpy())
sval_is_padding = tf.convert_to_tensor(sval_ids.cpu().numpy() == tokenizer.pad_token_id)

# Set padding to zero vectors; the extra trailing axis lets the per-token mask
# broadcast across the embedding dimension.
summary_embeddings_val_bert = tf.where(sval_is_padding[:, :, None], tf.zeros_like(summary_embeddings_val_bert), summary_embeddings_val_bert)
summary_embeddings_val_bert = tf.clip_by_value(summary_embeddings_val_bert, clip_value_min=0.0, clip_value_max=1.0)  # Clip for stability

# Zero every position at or beyond max_wordpiece_len (optional, adjust as needed).
# NOTE(review): implemented as masking with the sequence length unchanged — confirm that slicing was not intended.
if max_wordpiece_len:
    sval_seq_len = summary_embeddings_val_bert.shape[1]
    sval_keep = tf.sequence_mask(min(max_wordpiece_len, sval_seq_len), maxlen=sval_seq_len, dtype=tf.float32)
    summary_embeddings_val_bert = sval_keep[None, :, None] * summary_embeddings_val_bert

In [None]:
# Encode the test summaries with BERT.
summary_embeddings_test_bert, stest_ids = encode_sentences(test_df['summary'])

# encode_sentences returns torch tensors, while the post-processing below uses
# TensorFlow ops, so hand the values over as NumPy-backed TF tensors first.
summary_embeddings_test_bert = tf.convert_to_tensor(summary_embeddings_test_bert.detach().cpu().numpy())
stest_is_padding = tf.convert_to_tensor(stest_ids.cpu().numpy() == tokenizer.pad_token_id)

# Set padding to zero vectors; the extra trailing axis lets the per-token mask
# broadcast across the embedding dimension.
summary_embeddings_test_bert = tf.where(stest_is_padding[:, :, None], tf.zeros_like(summary_embeddings_test_bert), summary_embeddings_test_bert)
summary_embeddings_test_bert = tf.clip_by_value(summary_embeddings_test_bert, clip_value_min=0.0, clip_value_max=1.0)  # Clip for stability

# Zero every position at or beyond max_wordpiece_len (optional, adjust as needed).
# NOTE(review): implemented as masking with the sequence length unchanged — confirm that slicing was not intended.
if max_wordpiece_len:
    stest_seq_len = summary_embeddings_test_bert.shape[1]
    stest_keep = tf.sequence_mask(min(max_wordpiece_len, stest_seq_len), maxlen=stest_seq_len, dtype=tf.float32)
    summary_embeddings_test_bert = stest_keep[None, :, None] * summary_embeddings_test_bert

In [None]:
# Function to generate per-token binary labels (modify based on your label format)
def generate_labels(sentences, summaries, max_length=None):
    """Mark which tokens of each sentence also occur in its summary.

    Args:
        sentences: Iterable of sentences, each either a raw string or a
            sequence/tensor of token ids.
        summaries: Iterable of summaries in the same order, in either form.
        max_length: Width of the returned array. ``None`` (default) uses the
            longest label row.

    Returns:
        An integer array of shape ``(number of pairs, width)``. Entry ``[i, j]``
        is 1 when token ``j`` of sentence ``i`` occurs in summary ``i``; shorter
        rows are right-padded with 0 and longer rows are truncated.
    """
    # Special tokens ([CLS], [SEP], [PAD], ...) occur in every encoded sequence
    # and must not count as words shared with the summary.
    special_tokens = set(tokenizer.all_special_tokens)

    def _as_tokens(item):
        # Turn a raw string or a sequence/tensor of token ids into wordpiece tokens.
        if isinstance(item, str):
            return tokenizer.tokenize(item)
        ids = item.tolist() if hasattr(item, 'tolist') else list(item)
        return tokenizer.convert_ids_to_tokens(ids)

    labels = []
    for sentence, summary in zip(sentences, summaries):
        # A set makes each membership test constant time.
        summary_vocab = set(_as_tokens(summary)) - special_tokens
        labels.append([1 if token in summary_vocab else 0 for token in _as_tokens(sentence)])

    # Pad to a rectangle so the result is a regular 2-D array even when the
    # sentences differ in length.
    if max_length is not None:
        width = max_length
    else:
        width = max((len(row) for row in labels), default=0)
    padded = np.zeros((len(labels), width), dtype=int)
    for row_index, row in enumerate(labels):
        row = row[:width]
        padded[row_index, :len(row)] = row
    return padded

In [None]:
# Build the labels from the token ids returned by encode_sentences:
# generate_labels converts ids to tokens, and using the padded ids keeps each
# label row aligned position by position with the corresponding embedding row.
train_labels = generate_labels(train_ids, strain_ids)

In [None]:
# Build the labels from the token ids returned by encode_sentences:
# generate_labels converts ids to tokens, and using the padded ids keeps each
# label row aligned position by position with the corresponding embedding row.
val_labels = generate_labels(val_ids, sval_ids)

In [None]:
# Build the labels from the token ids returned by encode_sentences:
# generate_labels converts ids to tokens, and using the padded ids keeps each
# label row aligned position by position with the corresponding embedding row.
test_labels = generate_labels(test_ids, stest_ids)

In [None]:
# Define input shape for text embeddings and labels (per example, without the batch axis).
# 768 is presumably the hidden size of bert-base-uncased — TODO confirm.
# NOTE(review): encode_sentences pads with padding='max_length' and passes no max_length,
# so the sequence axis of the embeddings may not equal max_len_word — confirm before fitting.
input_shape = (max_len_word, 768)
label_shape = (max_len_word,)  # Adjust based on your label format

In [None]:
from tensorflow.keras.layers import Input, Dropout, Dense, Flatten
from tensorflow.keras.layers import Concatenate, Lambda, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

In [None]:
# Define input layers for text and labels
text_input = Input(shape=input_shape)
label_input = Input(shape=label_shape)

In [None]:
# Define layer groups (example structure with 12 groups)
for _ in range(12):
    # Self-attention layer
    self_attention = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=64)(
    query=text_input, value=text_input, attention_mask=text_input)  # Use text_input for self-attention
    self_attention = LayerNormalization()(self_attention + text_input)
    self_attention = Dropout(0.1)(self_attention)

    # Feed forward layer
    feed_forward = Dense(200, activation='relu')(self_attention)
    feed_forward = LayerNormalization()(feed_forward)
    feed_forward = Dense(768)(feed_forward)
    feed_forward = LayerNormalization()(feed_forward)
    feed_forward = Dropout(0.1)(feed_forward)

    # Update text embeddings
    text_input = feed_forward

In [None]:
# Final layers for predicting deletion probability
deletion_prob = Dense(1, activation="sigmoid")(text_input)

In [None]:
# Model definition
model = Model(inputs=[text_input, label_input], outputs=deletion_prob)

In [None]:
# Compile the model: Adam at learning rate 1e-4, binary cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss=binary_crossentropy,
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Fit the model on training data for 15 epochs with batches of 64, validating on the validation split.
# The label arrays are passed twice: as the second model input and as the targets.
model.fit([text_embeddings_train_bert, train_labels], train_labels,
          validation_data=([text_embeddings_val_bert, val_labels], val_labels),
          epochs=15, batch_size=64)

In [None]:
# Function for prediction on unseen sentences (consider batching for efficiency)
def predict_deletion_probs(sentence):
    """Predict the per-token probabilities for a single sentence.

    Args:
        sentence: The raw sentence text.

    Returns:
        The model output for that sentence (first element of the prediction batch).
    """
    # Encode the sentence; Keras needs a plain array rather than a torch tensor.
    sentence_embeddings, _ = encode_sentences([sentence])
    embeddings = sentence_embeddings.detach().cpu().numpy()

    # The model was built with two inputs (embeddings and labels). No labels
    # exist at prediction time, so feed zeros of the matching shape.
    placeholder_labels = np.zeros(embeddings.shape[:2], dtype=embeddings.dtype)

    # Predict deletion probabilities
    deletion_probs = model.predict([embeddings, placeholder_labels])[0]  # Access the first element for single sentence

    # Return deletion probabilities
    return deletion_probs

In [None]:
# Example usage: score a single hand-written sentence with the trained model.
sentence = " A mariachi band has serenaded Donald Trump on the sidewalk outside Trump Tower in New York City."
deletion_probs = predict_deletion_probs(sentence)

In [None]:
# Process deletion probabilities to generate the compressed sentence (logic based on your needs)
# The tokens come from the ids of this very sentence, re-encoded the same way as
# in predict_deletion_probs so the positions line up with the probabilities.
_, sentence_ids = encode_sentences([sentence])
sentence_tokens = tokenizer.convert_ids_to_tokens(sentence_ids[0].tolist())
special_tokens = set(tokenizer.all_special_tokens)

kept_tokens = [
    token
    for token, prob in zip(sentence_tokens, np.ravel(deletion_probs))
    # Adjust threshold as needed; [CLS]/[SEP]/[PAD] never belong in the output text.
    if prob > 0.5 and token not in special_tokens
]
compressed_sentence = " ".join(kept_tokens)

In [None]:
# Show the input sentence next to its compressed form, one per line.
print(
    f"Original Sentence: {sentence}",
    f"Compressed Sentence: {compressed_sentence.strip()}",
    sep="\n",
)