In [197]:
import torch
import time
import os
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split
from datasets import Dataset
from datasets import load_dataset, load_metric
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import wandb
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
wandb.login(key=WANDB_API_KEY)



True

In [198]:
dataset = load_dataset("cjvt/sentinews", "sentence_level")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['nid', 'content', 'sentiment', 'pid', 'sid'],
        num_rows: 168899
    })
})

In [None]:
train_dataset = dataset["train"].to_pandas()
train_dataset, test_dataset = train_test_split(
    train_dataset, test_size=0.2, random_state=42
)
train_dataset, val_dataset = train_test_split(
    train_dataset, test_size=0.1, random_state=42
)

In [None]:
train_dataset = Dataset.from_pandas(train_dataset)
val_dataset = Dataset.from_pandas(val_dataset)
test_dataset = Dataset.from_pandas(test_dataset)


print(train_dataset)
print(val_dataset)
print(test_dataset)

Dataset({
    features: ['nid', 'content', 'sentiment', 'pid', 'sid', '__index_level_0__'],
    num_rows: 121607
})
Dataset({
    features: ['nid', 'content', 'sentiment', 'pid', 'sid', '__index_level_0__'],
    num_rows: 13512
})
Dataset({
    features: ['nid', 'content', 'sentiment', 'pid', 'sid', '__index_level_0__'],
    num_rows: 33780
})


In [None]:
# For reference
models = ["EMBEDDIA/sloberta", "bert-base-multilingual-cased"]

In [None]:
def encode_labels(batch_labels):
    label_map = {"negative": 0, "neutral": 1, "positive": 2}
    return [label_map[label] for label in batch_labels]

In [None]:
def preprocess_function(examples, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    texts = examples["content"]
    labels = examples["sentiment"]
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, max_length=512)
    tokenized_inputs["labels"] = encode_labels(labels)
    return tokenized_inputs

In [None]:
def fine_tune_model(model_name, model, training_args):
    tokenized_train_dataset = train_dataset.map(
        lambda examples: preprocess_function(examples, model_name),
        batched=True,
    )
    tokenized_val_dataset = val_dataset.map(
        lambda examples: preprocess_function(examples, model_name),
        batched=True,
    )
    tokenized_test_dataset = test_dataset.map(
        lambda examples: preprocess_function(examples, model_name),
        batched=True,
    )

    data_collator = DataCollatorWithPadding(
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        padding="max_length",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        data_collator=data_collator,
    )

    start = time.time()
    trainer.train()
    elapsed_training = time.time() - start

    metrics = trainer.evaluate(tokenized_test_dataset)

    print(f"model: {model_name}, Dataset: Sentinews, Test Metrics: {metrics}")

    model.save_pretrained(f"models/{model_name}_sentinews")

    return model, metrics, elapsed_training

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
def run_lora_sloberta():
    model_name = "EMBEDDIA/sloberta"
    task_type = TaskType.SEQ_CLS
    training_args = TrainingArguments(
        output_dir=f"{model_name}-sentinews",
        learning_rate=1e-4,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
    model = prepare_model_for_kbit_training(model, task_type)

    target_modules = (
        [
            "roberta.encoder.layer." + str(i) + ".attention.self.query"
            for i in range(model.config.num_hidden_layers)
        ]
        + [
            "roberta.encoder.layer." + str(i) + ".attention.self.key"
            for i in range(model.config.num_hidden_layers)
        ]
        + [
            "roberta.encoder.layer." + str(i) + ".attention.self.value"
            for i in range(model.config.num_hidden_layers)
        ]
        + [
            "roberta.encoder.layer." + str(i) + ".attention.output.dense"
            for i in range(model.config.num_hidden_layers)
        ]
    )

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        task_type=task_type,
        bias="none",
        target_modules=target_modules,
    )

    model = get_peft_model(model, lora_config)

    print_trainable_parameters(model)

    _, metrics, elapsed_training = fine_tune_model(
        model_name, model, training_args
    )

    current_time = time.strftime("%Y-%m-%d-%H-%M-%S")
    with open("results.csv", "a") as f:
        f.write(
            f"{current_time},{model_name},Sentinews,{metrics['f1']},{metrics['accuracy']},{elapsed_training}\n"
        )

In [None]:
run_lora_sloberta()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at EMBEDDIA/sloberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1772547 || all params: 112396806 || trainable%: 1.577043924184109


TypeError: fine_tune_model() takes 3 positional arguments but 4 were given