In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModel

### Loading train & test data

In [None]:
# Locations of the raw input files; each is read line by line and split on tabs
# into a (Sentence_A, Sentence_B) pair by the loading cells.
train_data_path = "./train.txt"
test_data_path = "./test.rand.txt"

In [None]:
def safe_read(file_path):
    """Read a UTF-8 text file and return only its cleanly decoded lines.

    The file is opened with ``errors='replace'``, so any byte sequence that is
    not valid UTF-8 is decoded as U+FFFD. Every line containing that character
    is dropped; the remaining lines are returned unchanged (line endings
    included) in file order.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: The lines that contain no U+FFFD replacement character.
    """
    replacement_char = '\ufffd'
    with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
        return [row for row in handle if replacement_char not in row]

In [None]:
# Parse the tab-separated training pairs, give every pair the label 0, and
# shuffle the rows with a fixed seed so the order is reproducible.
lines = safe_read(train_data_path)
train_data_df = (
    pd.DataFrame(
        [raw_line.strip().split('\t') for raw_line in lines],
        columns=['Sentence_A', 'Sentence_B'],
    )
    .assign(labels=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
train_data_df

In [None]:
# Parse the tab-separated test pairs; no label column is added here.
lines = safe_read(test_data_path)
test_data_df = pd.DataFrame(
    [raw_line.strip().split('\t') for raw_line in lines],
    columns=['Sentence_A', "Sentence_B"],
)
test_data_df

### Tokenizing sentences and computing their BERT embeddings

In [None]:
# Load the pretrained "bert-base-uncased" tokenizer and model once; both are
# used as module-level globals by compute_embeddings.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
embedding_model = AutoModel.from_pretrained("bert-base-uncased")

def compute_embeddings(sentences, batch_size=32):
    """Encode sentences with ``embedding_model`` and return one vector each.

    Sentences are processed in mini-batches so that a large incoming batch
    (``Dataset.map(batched=True)`` hands over many rows at once) is never
    pushed through the model in a single forward pass. Each mini-batch is
    tokenized with truncation at 512 tokens and padding to its longest member,
    then run without gradient tracking. The last-layer hidden state at token
    position 0 is used as the sentence embedding.

    Args:
        sentences: A single string or a sequence of strings.
        batch_size: Maximum number of sentences per forward pass. Must be >= 1.

    Returns:
        torch.Tensor: One row per input sentence, in input order, with the
        model's hidden size as the second dimension.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string is treated as a batch of one sentence.
    sentences = [sentences] if isinstance(sentences, str) else list(sentences)

    if not sentences:
        # Nothing to encode: return an empty (0, hidden_size) tensor.
        return torch.empty((0, embedding_model.config.hidden_size))

    chunks = []
    for start in range(0, len(sentences), batch_size):
        inputs = tokenizer(
            sentences[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        with torch.no_grad():
            outputs = embedding_model(**inputs)
        # Keep only the hidden state of the first token of every sentence.
        chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(chunks, dim=0)

def tokenize_function(examples):
    """Embed both sentence columns of a batch.

    Args:
        examples: A batch mapping with "Sentence_A" and "Sentence_B" entries,
            each passed as-is to ``compute_embeddings``.

    Returns:
        dict: "embeddings_A" and "embeddings_B", each the NumPy conversion of
        the tensor returned by ``compute_embeddings`` for that column.
    """
    vectors_a = compute_embeddings(examples["Sentence_A"])
    vectors_b = compute_embeddings(examples["Sentence_B"])
    batch_output = {}
    batch_output["embeddings_A"] = vectors_a.numpy()
    batch_output["embeddings_B"] = vectors_b.numpy()
    return batch_output

In [None]:
# Embed only the first 50,000 rows of the training frame, then persist the
# embedded dataset to disk.
train_dataset = Dataset.from_pandas(train_data_df.head(50000))
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
)
tokenized_train_dataset.save_to_disk("tokenized_train_dataset")

In [None]:
# Embed every test pair and persist the result to disk.
# The source frame is `test_data_df` (built in the loading cell); the previous
# name `testdata` was never defined and raised NameError on a fresh kernel.
test_dataset = Dataset.from_pandas(test_data_df)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset.save_to_disk("tokenized_test_dataset")

### Generating training samples with the opposite label by swapping the sentence order

In [None]:
def duplicate_and_flip(dataset):
    """Build a mirrored copy of ``dataset`` with A/B swapped and labels flipped.

    For every example the sentence and embedding of side B become side A and
    vice versa, and the label is replaced by ``1 - label``. Embeddings are
    stored as float32 NumPy arrays.

    Args:
        dataset: An iterable of examples, each providing "Sentence_A",
            "Sentence_B", "labels", "embeddings_A" and "embeddings_B".

    Returns:
        A ``Dataset`` built from the swapped columns, in input order.
    """
    swapped_sentences_a = []
    swapped_sentences_b = []
    flipped_labels = []
    swapped_vectors_a = []
    swapped_vectors_b = []

    for row in dataset:
        # What used to be side B is stored as side A, and the other way round.
        swapped_sentences_a.append(row["Sentence_B"])
        swapped_vectors_a.append(np.array(row["embeddings_B"], dtype=np.float32))
        swapped_sentences_b.append(row["Sentence_A"])
        swapped_vectors_b.append(np.array(row["embeddings_A"], dtype=np.float32))
        # 0 becomes 1 and 1 becomes 0.
        flipped_labels.append(1 - row["labels"])

    return Dataset.from_dict({
        "Sentence_A": swapped_sentences_a,
        "Sentence_B": swapped_sentences_b,
        "labels": flipped_labels,
        "embeddings_A": swapped_vectors_a,
        "embeddings_B": swapped_vectors_b,
    })

# Build the swapped-order copy of the embedded training set; every label
# becomes 1 - label.
flipped_train_dataset = duplicate_and_flip(tokenized_train_dataset)

In [None]:
# Combine the original examples and their flipped counterparts into a single
# training dataset.
concat_train_data = concatenate_datasets([tokenized_train_dataset, flipped_train_dataset])

### Computing difference in embeddings of Sentences for model input

In [None]:
# Function to compute the difference of embeddings
def compute_difference(examples):
    """Compute the element-wise difference between the two embedding columns.

    Args:
        examples: A batch mapping with "embeddings_A" and "embeddings_B",
            each convertible to a numeric array of the same shape.

    Returns:
        dict: {"difference": embeddings_A - embeddings_B} as a float32
        NumPy array.
    """
    # Plain NumPy is enough for a subtraction; this avoids converting to a
    # torch tensor and back, and pins the result dtype to float32.
    embeddings_A = np.asarray(examples["embeddings_A"], dtype=np.float32)
    embeddings_B = np.asarray(examples["embeddings_B"], dtype=np.float32)
    return {"difference": embeddings_A - embeddings_B}

In [None]:
# Add the "difference" column (embeddings_A - embeddings_B) to every training
# example and persist the resulting dataset to disk.
final_train_dataset = concat_train_data.map(compute_difference, batched=True)
final_train_dataset.save_to_disk("final_train_dataset")

In [None]:
# Add the "difference" column (embeddings_A - embeddings_B) to every test
# example and persist the resulting dataset to disk.
final_test_dataset = tokenized_test_dataset.map(compute_difference, batched=True)
final_test_dataset.save_to_disk("final_test_dataset")