In [1]:
from datasets import load_dataset, Dataset
from setfit import SetFitModel, Trainer, TrainingArguments
import torch

from data.dataset_config import DatasetConfig
from train.active_learning import ActiveTrainer, create_random_subset
from train.active_learning_config import ActiveLearningConfig
from train.reporter import Reporter
from train.metrics import camprehesive_metrics

In [2]:
# Placeholder only: the model loop in the training cell rebinds `model_name` before it is read.
model_name: str = ""
# Used both as the size of the initial random subset and as `samples_per_cycle`
# in the ActiveLearningConfig built in the training cell.
samples_per_cycle: int = 12

In [3]:
# Load the "dair-ai/emotion" dataset and pick the splits used for training / evaluation.
dataset = load_dataset("dair-ai/emotion")
dataset_config = DatasetConfig(text_column="text", num_classes=6)
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One reporter for the end-of-run results, one for the per-cycle callback;
# both are told which label column the dataset config declares.
final_reporter = Reporter(
    "final_balancing.csv",
    label_column=dataset_config.label_column,
)
cycle_reporter = Reporter(
    "cycle_balancing.csv",
    report_train_args=False,
    label_column=dataset_config.label_column,
)

def after_train_callback(trainer: Trainer, dataset: Dataset, run_id: int) -> None:
    """Forward the arguments of a finished training step to ``cycle_reporter``.

    Args:
        trainer: Trainer handed to the callback.
        dataset: Dataset handed to the callback.
        run_id: Identifier handed to the callback; passed through unchanged.
    """
    cycle_reporter.report(
        trainer=trainer,
        dataset=dataset,
        run_id=run_id,
    )


In [4]:
def run_train(args, initial_train_subset, active_learning_config, model_init, **kwargs):
    """Run one active-learning training and report its outcome.

    Builds an ``ActiveTrainer`` on the module-level ``train_dataset`` /
    ``eval_dataset``, calls ``train()`` on it and hands the result to
    ``final_reporter``.

    Args:
        args: Training arguments, passed to ``ActiveTrainer`` as ``train_args``.
        initial_train_subset: Subset the active trainer starts from.
        active_learning_config: Passed to the trainer and also to the final report.
        model_init: Callable passed to ``ActiveTrainer`` as ``model_init``.
        **kwargs: Extra keyword arguments forwarded verbatim to
            ``final_reporter.report`` (e.g. ``run_id``, ``dataset_id``, ``setting``).
    """
    active_trainer = ActiveTrainer(
        model_init=model_init,
        full_train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        initial_train_subset=initial_train_subset,
        train_args=args,
        active_learning_config=active_learning_config,
        dataset_config=dataset_config,
        after_train_callback=after_train_callback,
        metric=camprehesive_metrics,
    )
    # Whatever train() returns is what gets reported as `trainer`.
    train_result = active_trainer.train()
    final_reporter.report(
        trainer=train_result,
        dataset=active_trainer.train_subset,
        active_learning_config=active_learning_config,
        dataset_name="emotion",
        **kwargs,  # kwargs, so callers can attach any extra fields to the report
    )

In [5]:
# Experiment sweep: 3 random initial subsets x 2 embedding models x 5 sampling settings.
run_id = 0
train_args = TrainingArguments(num_iterations=20, num_epochs=(1, 16))

# Setting index -> (active_sampling_strategy, balancing_factor).
# The index is reported as `setting`, so the order of this table must stay fixed.
SETTINGS = [
    ("random", None),        # 0: random sampling
    ("max_entropy", None),   # 1: max entropy, no balancing factor
    ("max_entropy", 0.1),    # 2
    ("max_entropy", 0.25),   # 3
    ("max_entropy", 0.5),    # 4
]


def _make_model_init(name):
    """Return a zero-argument factory that loads a fresh SetFit model for `name`.

    The name is bound when the factory is created, so the returned callable does
    not depend on the current value of the loop variable when it is invoked later.
    """
    def _model_init():
        return SetFitModel.from_pretrained(
            name,
            use_differentiable_head=True,
            head_params={"out_features": dataset_config.num_classes},
        ).to(device)

    return _model_init


for dataset_id in range(3):
    # The same initial subset is shared by every model / setting for this dataset_id.
    initial_subset = create_random_subset(dataset=train_dataset, num_samples=samples_per_cycle)
    for model_name in ["sentence-transformers/all-mpnet-base-v2", "WhereIsAI/UAE-Large-V1"]:
        model_init = _make_model_init(model_name)
        for setting, (strategy, balancing) in enumerate(SETTINGS):
            active_config = ActiveLearningConfig(
                samples_per_cycle=samples_per_cycle,
                model_name=model_name,
                unlabeled_samples=600,
                active_sampling_strategy=strategy,
                balancing_factor=balancing,
            )
            run_train(
                train_args,
                initial_subset,
                active_config,
                model_init,
                run_id=run_id,
                dataset_id=dataset_id,
                setting=setting,
            )
            # Give every run its own id; previously all runs were reported as run 0.
            run_id += 1

Filter:   0%|          | 0/12 [00:00<?, ? examples/s]

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/64.1k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]