In [230]:
import os

import wandb

# Read the Weights & Biases key from the environment instead of hardcoding it
# in the notebook, so the secret never ends up in a saved or shared copy of
# this file. `os` is imported here because this cell runs before the main
# import cell.
WANDB_API = os.environ.get("WANDB_API_KEY", "")
# An unset/empty key is passed as None rather than as an empty string, leaving
# credential resolution to wandb itself.
wandb.login(key=WANDB_API or None)

# Tab-separated SI-NLI files, relative to the notebook's working directory.
TRAIN_PATH = 'SI-NLI/train.tsv'
TEST_PATH = 'SI-NLI/test.tsv'



In [231]:
import torch
import time
import pandas as pd
import numpy as np  
import os
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split
from datasets import Dataset
from datasets import load_dataset, load_metric
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PrefixTuningConfig, IA3Config
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [232]:
def load_data():
    """Read the SI-NLI TSV files and hold out part of train for validation.

    Both files are read as tab-separated tables from ``TRAIN_PATH`` and
    ``TEST_PATH``. 10% of the training rows are split off (seed 42) as a
    validation frame.

    Returns:
        dict: ``{"train": ..., "val": ..., "test": ...}`` mapping each split
        name to a pandas DataFrame.
    """
    full_train_df = pd.read_csv(TRAIN_PATH, sep='\t')
    test_df = pd.read_csv(TEST_PATH, sep='\t')
    train_df, val_df = train_test_split(
        full_train_df,
        test_size=0.1,
        random_state=42,
    )
    return dict(train=train_df, val=val_df, test=test_df)


In [233]:
# Load the raw splits once; `dataset` maps "train"/"val"/"test" to DataFrames.
dataset = load_data()

In [234]:
def encode_labels(examples):
    """Return a copy of ``examples`` with string NLI labels mapped to class ids.

    The mapping is ``entailment -> 0``, ``neutral -> 1``,
    ``contradiction -> 2``. The input is left untouched, so the cell that
    calls this function can be re-run and a frame produced by an earlier split
    is never modified behind the caller's back.

    Args:
        examples: A pandas DataFrame or a dict-like batch with a ``"label"``
            entry holding the label strings.

    Returns:
        An object of the same kind as ``examples`` whose ``"label"`` entry is
        a list/column of integer class ids.

    Raises:
        KeyError: If a label is not one of the three known strings.
    """
    label_dict = {"entailment": 0, "neutral": 1, "contradiction": 2}
    # Sanity check: show which label strings are present before encoding.
    print(np.unique(examples["label"]))
    # Encode into a copy rather than assigning into the caller's object.
    encoded = examples.copy() if hasattr(examples, "copy") else dict(examples)
    encoded["label"] = [label_dict[label] for label in examples["label"]]
    return encoded

In [235]:
# Carve a 20% held-out evaluation frame out of the training data first, then
# split 10% of what remains off for validation. Both splits use seed 42.
train_val_df, test_dataset = train_test_split(
    dataset["train"], test_size=0.2, random_state=42
)
train_dataset, val_dataset = train_test_split(
    train_val_df, test_size=0.1, random_state=42
)

In [236]:
def _to_hf_dataset(frame):
    """Encode the string labels of ``frame`` and wrap the result as a ``Dataset``."""
    return Dataset.from_pandas(encode_labels(frame))


# Replace each pandas split with its label-encoded `Dataset` counterpart.
train_dataset = _to_hf_dataset(train_dataset)
val_dataset = _to_hf_dataset(val_dataset)
test_dataset = _to_hf_dataset(test_dataset)

# Show the columns and row count of every split.
for split in (train_dataset, val_dataset, test_dataset):
    print(split)

['contradiction' 'entailment' 'neutral']
['contradiction' 'entailment' 'neutral']
['contradiction' 'entailment' 'neutral']
Dataset({
    features: ['pair_id', 'premise', 'hypothesis', 'annotation_1', 'comment_1', 'annotator1_id', 'annotation_2', 'comment_2', 'annotator2_id', 'annotation_3', 'comment_3', 'annotator3_id', 'annotation_FINAL', 'label', '__index_level_0__'],
    num_rows: 2844
})
Dataset({
    features: ['pair_id', 'premise', 'hypothesis', 'annotation_1', 'comment_1', 'annotator1_id', 'annotation_2', 'comment_2', 'annotator2_id', 'annotation_3', 'comment_3', 'annotator3_id', 'annotation_FINAL', 'label', '__index_level_0__'],
    num_rows: 317
})
Dataset({
    features: ['pair_id', 'premise', 'hypothesis', 'annotation_1', 'comment_1', 'annotator1_id', 'annotation_2', 'comment_2', 'annotator2_id', 'annotation_3', 'comment_3', 'annotator3_id', 'annotation_FINAL', 'label', '__index_level_0__'],
    num_rows: 791
})


In [237]:
# For reference only: candidate Hugging Face checkpoints for this task.
# The cells shown here hardcode "EMBEDDIA/sloberta" and do not read this list.
models = ["EMBEDDIA/sloberta", "bert-base-multilingual-cased"]

In [238]:
# Tokenizers already loaded in this kernel session, keyed by model name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the fast tokenizer for ``model_name``, loading it at most once."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(
            model_name, use_fast=True
        )
    return _TOKENIZER_CACHE[model_name]


def preprocess_function(
    examples, model_name, test=False, max_length=512, padding="max_length"
):
    """Tokenize premise/hypothesis pairs for sequence classification.

    Meant to be used with ``Dataset.map(..., batched=True)``. The tokenizer is
    looked up through a per-model cache, so it is not re-instantiated for
    every mapped batch.

    Args:
        examples: Batch with ``"premise"`` and ``"hypothesis"`` entries, plus
            ``"label"`` unless ``test`` is true.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        test: When true, labels are not copied into the tokenized output.
        max_length: Truncation (and, with the default ``padding``, padding)
            length in tokens. Defaults to 512.
        padding: Padding strategy forwarded to the tokenizer. The default
            ``"max_length"`` pads every example to ``max_length``.

    Returns:
        The tokenizer output for the batch, with a ``"label"`` entry added
        when ``test`` is false.
    """
    tokenizer = _get_tokenizer(model_name)
    # NOTE(review): padding every example to 512 tokens is memory-hungry on
    # the GPU; a shorter max_length may be worth trying if memory is tight.
    tokenized = tokenizer(
        examples["premise"],
        examples["hypothesis"],
        padding=padding,
        truncation=True,
        max_length=max_length,
    )
    if not test:
        tokenized["label"] = examples["label"]
    return tokenized

In [239]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict: Scores under the keys ``accuracy``, ``f1``, ``precision`` and
        ``recall``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {'accuracy': accuracy_score(labels, predicted_classes)}
    macro_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in macro_scorers:
        scores[metric_name] = scorer(labels, predicted_classes, average='macro')
    return scores


In [240]:
def fine_tune_model(model_name, dataset, model, training_args):
    """Tokenize the splits, train ``model`` with ``Trainer`` and evaluate it.

    Args:
        model_name: Hub identifier used to load the tokenizer and to build the
            directory the trained model is saved to.
        dataset: If this is a dict whose ``"train"``, ``"val"`` and ``"test"``
            entries are all ``datasets.Dataset`` objects, those splits are
            used. Anything else (for example the raw DataFrame dict returned
            by ``load_data``, whose labels are still strings) falls back to
            the notebook-level ``train_dataset`` / ``val_dataset`` /
            ``test_dataset``.
        model: Model instance handed to ``Trainer``.
        training_args: ``TrainingArguments`` for the run.

    Returns:
        tuple: ``(model, metrics, elapsed_training)`` where ``metrics`` is the
        dict returned by ``trainer.evaluate`` on the test split and
        ``elapsed_training`` is the wall-clock duration of ``trainer.train()``
        in seconds.
    """
    split_names = ("train", "val", "test")
    if isinstance(dataset, dict) and all(
        isinstance(dataset.get(name), Dataset) for name in split_names
    ):
        train_split, val_split, test_split = (dataset[name] for name in split_names)
    else:
        # Callers in this notebook pass the un-encoded DataFrame dict, so keep
        # using the label-encoded Dataset objects prepared at notebook level.
        train_split, val_split, test_split = train_dataset, val_dataset, test_dataset

    def tokenize(split):
        # All three splits are labelled, so each is tokenized with its labels;
        # the held-out split needs them to compute the test metrics.
        return split.map(
            lambda examples: preprocess_function(examples, model_name),
            batched=True,
        )

    tokenized_train_dataset = tokenize(train_split)
    tokenized_val_dataset = tokenize(val_split)
    tokenized_test_dataset = tokenize(test_split)

    data_collator = DataCollatorWithPadding(
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        padding="max_length",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Time only the training loop, not tokenization or evaluation.
    start = time.time()
    trainer.train()
    elapsed_training = time.time() - start

    metrics = trainer.evaluate(tokenized_test_dataset)

    print(f"model: {model_name}, Dataset: SI_NLI, Test Metrics: {metrics}")

    # NOTE(review): there is no separator between the model name and
    # "si_nli", and the path does not depend on the tuning method, so runs
    # that share a model name save to the same directory. Path left unchanged
    # in case something already loads from it.
    model.save_pretrained(f"models/{model_name}si_nli")

    return model, metrics, elapsed_training

In [241]:
def run_lora_sloberta():
    """Fine-tune SloBERTa on SI-NLI with LoRA adapters and log the outcome.

    LoRA (r=16, alpha=32, dropout=0.05) is applied to the query, key, value
    and attention-output projections of every encoder layer. After training,
    one line with the timestamp, model name, dataset name, test metrics and
    training time is appended to ``results.csv``.
    """
    model_name = "EMBEDDIA/sloberta"
    task_type = TaskType.SEQ_CLS
    training_args = TrainingArguments(
        # Method-specific directory so checkpoints of the LoRA, prefix-tuning
        # and IA3 runs do not overwrite each other.
        output_dir=f"{model_name}-lora-si-nli",
        learning_rate=1e-4,
        per_device_train_batch_size=24,
        per_device_eval_batch_size=24,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    # Pass the flag by keyword: the second positional parameter of
    # prepare_model_for_kbit_training is use_gradient_checkpointing, not a
    # task type.
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    attention_projections = (
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
    )
    # One fully-qualified module name per (projection, encoder layer) pair.
    target_modules = [
        f"roberta.encoder.layer.{layer}.{projection}"
        for projection in attention_projections
        for layer in range(model.config.num_hidden_layers)
    ]

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        task_type=task_type,
        bias="none",
        target_modules=target_modules,
    )

    model = get_peft_model(model, lora_config)

    _, metrics, elapsed_training = fine_tune_model(
        model_name, dataset, model, training_args
    )

    current_time = time.strftime("%Y-%m-%d-%H-%M-%S")
    with open("results.csv", "a") as f:
        # The dataset column names the corpus actually used (SI-NLI).
        f.write(
            f"{current_time},{model_name},SI-NLI, {metrics},{elapsed_training}\n"
        )

In [242]:
def run_prefix_tune_sloberta():
    """Fine-tune SloBERTa on SI-NLI with prefix tuning and log the outcome.

    Prefix tuning uses 20 virtual tokens. After training, one line with the
    timestamp, model name, dataset name, test metrics and training time is
    appended to ``results.csv``.
    """
    model_name = "EMBEDDIA/sloberta"
    task_type = TaskType.SEQ_CLS
    training_args = TrainingArguments(
        # Method-specific directory so checkpoints of the LoRA, prefix-tuning
        # and IA3 runs do not overwrite each other.
        output_dir=f"{model_name}-prefix-si-nli",
        learning_rate=1e-4,
        per_device_train_batch_size=24,
        per_device_eval_batch_size=24,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    # Pass the flag by keyword: the second positional parameter of
    # prepare_model_for_kbit_training is use_gradient_checkpointing, not a
    # task type.
    # NOTE(review): prefix tuning may not be compatible with gradient
    # checkpointing in every PEFT release - confirm against the installed
    # version and switch this to False if it is rejected.
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    # Prefix tuning takes no target-module list, so none is built here.
    prefix_config = PrefixTuningConfig(task_type=task_type, num_virtual_tokens=20)

    model = get_peft_model(model, prefix_config)

    _, metrics, elapsed_training = fine_tune_model(
        model_name, dataset, model, training_args
    )

    current_time = time.strftime("%Y-%m-%d-%H-%M-%S")
    with open("results.csv", "a") as f:
        # The dataset column names the corpus actually used (SI-NLI).
        f.write(
            f"{current_time},{model_name},SI-NLI, {metrics},{elapsed_training}\n"
        )

In [243]:
def run_ia3_sloberta():
    """Fine-tune SloBERTa on SI-NLI with IA3 and log the outcome.

    IA3 is applied to the query, key, value and attention-output projections
    and to the intermediate feed-forward dense layer of every encoder layer.
    After training, one line with the timestamp, model name, dataset name,
    test metrics and training time is appended to ``results.csv``.
    """
    model_name = "EMBEDDIA/sloberta"
    task_type = TaskType.SEQ_CLS
    training_args = TrainingArguments(
        # Method-specific directory so checkpoints of the LoRA, prefix-tuning
        # and IA3 runs do not overwrite each other.
        output_dir=f"{model_name}-ia3-si-nli",
        learning_rate=1e-4,
        per_device_train_batch_size=24,
        per_device_eval_batch_size=24,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    # Pass the flag by keyword: the second positional parameter of
    # prepare_model_for_kbit_training is use_gradient_checkpointing, not a
    # task type.
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    layer_indices = range(model.config.num_hidden_layers)

    feed_forward_modules = [
        f"roberta.encoder.layer.{layer}.intermediate.dense"
        for layer in layer_indices
    ]

    attention_projections = (
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
    )
    attention_modules = [
        f"roberta.encoder.layer.{layer}.{projection}"
        for projection in attention_projections
        for layer in layer_indices
    ]
    # The feed-forward modules are listed in target_modules as well as in
    # feedforward_modules.
    target_modules = attention_modules + feed_forward_modules

    ia3_config = IA3Config(
        task_type=task_type,
        feedforward_modules=feed_forward_modules,
        target_modules=target_modules,
    )

    model = get_peft_model(model, ia3_config)

    _, metrics, elapsed_training = fine_tune_model(
        model_name, dataset, model, training_args
    )

    current_time = time.strftime("%Y-%m-%d-%H-%M-%S")
    with open("results.csv", "a") as f:
        # The dataset column names the corpus actually used (SI-NLI).
        f.write(
            f"{current_time},{model_name},SI-NLI, {metrics},{elapsed_training}\n"
        )

In [244]:
# Launch the LoRA run; on success it appends one line to results.csv.
# NOTE(review): the recorded output of this cell ends in a CUDA
# out-of-memory error, so no result line was written for that execution.
run_lora_sloberta()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2844/2844 [00:01<00:00, 2205.10 examples/s]
Map: 100%|██████████| 317/317 [00:00<00:00, 906.67 examples/s]
Map: 100%|██████████| 791/791 [00:00<00:00, 2163.68 examples/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 94.00 MiB. GPU 