In [None]:
%pip install transformers datasets pandas

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

In [None]:
# Load the pre-trained checkpoint together with its matching tokenizer.
# num_labels=2 configures the sequence-classification model for two classes.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load each CSV split as a single `Dataset`.
# `Dataset` has no `load_dataset` method (that is a module-level function in
# `datasets`, and for CSV files it returns a `DatasetDict`). `Dataset.from_csv`
# returns a plain `Dataset`, so the later `len()`, `.map()` and
# `.set_format()` calls operate directly on rows.
train_dataset = Dataset.from_csv("train_pair.csv")
val_dataset = Dataset.from_csv("val_pair.csv")
test_dataset = Dataset.from_csv("test_pair.csv")

# tokenize the dataset
def tokenize(batch):
    """Tokenize a batch of text pairs into fixed-length model inputs.

    Each row's ``text1`` and ``text2`` are encoded together as one sequence
    pair, so row ``i`` of the output corresponds to row ``i`` of the input.
    Encoding the two columns separately and joining the results with ``+``
    would concatenate the two *batches* (yielding twice as many rows) instead
    of joining the texts of each row.

    Args:
        batch: Mapping with ``'text1'`` and ``'text2'`` entries, each a list
            of strings of equal length (as supplied by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer's batch encoding (for BERT: ``input_ids``,
        ``token_type_ids`` and ``attention_mask``), with every row truncated
        or padded to the model's maximum sequence length.
    """
    # The longest sequence the model's position embeddings can represent.
    max_length = model.config.max_position_embeddings

    return tokenizer(
        batch['text1'],
        batch['text2'],
        # Trim the pair (longest text first) so the combined sequence,
        # including special tokens, fits within max_length.
        truncation=True,
        max_length=max_length,
        # Pad every row to max_length so all rows share one shape.
        padding='max_length',
    )


# Tokenize every split. Each split is processed as one single batch.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

# The Trainer only forwards columns that match the model's forward() arguments
# (plus its known label names), so a label column called 'bin_label' would be
# dropped and the model would return no loss. Rename it to 'labels'.
train_dataset = train_dataset.rename_column('bin_label', 'labels')
val_dataset = val_dataset.rename_column('bin_label', 'labels')
test_dataset = test_dataset.rename_column('bin_label', 'labels')

# Set the format for the PyTorch tensors. Only columns that actually exist are
# requested, so 'token_type_ids' is included when the tokenizer produced it.
model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
train_dataset.set_format(
    'torch', columns=[c for c in model_columns if c in train_dataset.column_names]
)
val_dataset.set_format(
    'torch', columns=[c for c in model_columns if c in val_dataset.column_names]
)
test_dataset.set_format(
    'torch', columns=[c for c in model_columns if c in test_dataset.column_names]
)

# Define the training arguments.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 optimizer steps
    load_best_model_at_end=True,     # reload the best checkpoint after training
    metric_for_best_model='accuracy',
    evaluation_strategy="epoch",     # evaluate once per epoch
    # load_best_model_at_end requires checkpoints to be saved on the same
    # schedule as evaluation; the default step-based saving raises a ValueError.
    save_strategy="epoch",
)

# create a function to compute metrics
def compute_metrics(pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (an array of per-class scores
            with the class axis last) and ``label_ids`` (the matching integer
            class labels).

    Returns:
        A dict with a single ``'accuracy'`` entry: the fraction of rows whose
        highest-scoring class equals the label, as a Python float.
    """
    labels = pred.label_ids
    # The predicted class is the index of the highest score on the last axis.
    preds = pred.predictions.argmax(-1)
    # Computed with array operations so no scikit-learn dependency is needed
    # (`accuracy_score` is not imported anywhere in this notebook).
    accuracy = float((preds == labels).mean())
    return {
        'accuracy': accuracy,
    }

# Wire the model, training arguments, data splits and metric callback
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune on the training split; the validation split is used for the
# evaluations scheduled by the training arguments.
trainer.train()

# Score the held-out test split. The reported metric names carry an
# "eval_" prefix, hence the 'eval_accuracy' key.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test set accuracy: {metrics['eval_accuracy']}")
