In [None]:
# Colab-only setup. The commands are wrapped in a triple-quoted string so this
# cell is a no-op when executed; remove the surrounding quotes to run it on
# Google Colab. It mounts Google Drive, changes into the project directory,
# pulls the latest repository state and installs `setfit`.
'''
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd /content/drive/MyDrive/aml_final/aml_final/
! git pull
! pip install setfit
'''

In [None]:
from datasets import load_dataset
from setfit import TrainingArguments
import gc

from data.dataset_config import DatasetConfig
from train.active_learning import ActiveTrainer, create_random_subset
from train.active_learning_config import ActiveLearningConfig
from train.reporter import Reporter
from train.metrics import comprehensive_metrics

In [None]:
# Number of samples per active-learning cycle.
# 12 * 4 = 8 * 6 = 48, which keeps the setup comparable to the SetFit paper.
samples_per_cycle = 12

# Emotion dataset from the Hugging Face hub; the "train" split serves as the
# pool for training subsets and the "validation" split is used for evaluation.
dataset = load_dataset("dair-ai/emotion")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

# Dataset description shared by the trainer, the subset sampler and the reporter.
dataset_config = DatasetConfig(text_column="text", num_classes=6)

# Reporter shared by all runs in this notebook.
# NOTE(review): the first argument is presumably the output CSV path - confirm in train.reporter.
final_reporter = Reporter(
    "final_balancing.csv",
    label_column=dataset_config.label_column,
)

In [None]:
# Two SetFit training presets: "slow" uses twice the iterations and twice the
# second `num_epochs` entry of "fast".
slow_train_args = TrainingArguments(
    num_iterations=20,
    num_epochs=(1, 16),
)
fast_train_args = TrainingArguments(
    num_iterations=10,
    num_epochs=(1, 8),
)

def run_train(args, initial_train_subset, active_learning_config, **kwargs):
    """Run one active-learning experiment and record it with ``final_reporter``.

    Relies on the notebook-level ``train_dataset``, ``eval_dataset``,
    ``dataset_config``, ``slow_train_args`` and ``final_reporter``.

    Args:
        args: Training arguments handed to ``ActiveTrainer`` as ``train_args``.
        initial_train_subset: Starting subset handed to ``ActiveTrainer``.
        active_learning_config: Configuration passed to both the trainer and
            the reporter.
        **kwargs: Must contain ``run_id``, which is also given to the trainer.
            Every entry (including ``run_id``) is forwarded unchanged to
            ``final_reporter.report``, so arbitrary extra fields can be
            supplied here.

    Raises:
        KeyError: If ``run_id`` is not present in ``kwargs``.
    """
    active_trainer = ActiveTrainer(
        full_train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        initial_train_subset=initial_train_subset,
        train_args=args,
        active_learning_config=active_learning_config,
        dataset_config=dataset_config,
        metric=comprehensive_metrics,
        run_id=kwargs["run_id"],
        # The final model always uses the slow preset, whatever `args` is.
        final_model_train_args=slow_train_args,
    )

    # Whatever `train()` returns is what the reporter receives as `trainer`.
    train_result = active_trainer.train()

    final_reporter.report(
        trainer=train_result,
        dataset=active_trainer.train_subset,
        active_learning_config=active_learning_config,
        dataset_name="emotion",
        **kwargs,  # kwargs, so callers can attach anything to the report
    )

In [None]:
# (active_sampling_strategy, balancing_factor) for each setting index:
#   0 -> random sampling, no balancing
#   1 -> max entropy, no balancing
#   2 -> max entropy, balancing factor 0.25
#   3 -> max entropy, balancing factor 0.5
setting_table = [
    ("random", None),
    ("max_entropy", None),
    ("max_entropy", 0.25),
    ("max_entropy", 0.5),
]

run_id = 0
for dataset_id in range(3):
    # dataset_id doubles as the seed, so each id yields its own initial subset
    # that is shared by every run for that id.
    initial_subset = create_random_subset(
        dataset=train_dataset,
        num_samples=samples_per_cycle,
        seed=dataset_id,
        dataset_config=dataset_config,
    )
    for model_name in ["sentence-transformers/all-mpnet-base-v2"]:
        for speed in ["fast", "slow"]:
            train_args = fast_train_args if speed == "fast" else slow_train_args
            for setting, (strategy, balancing) in enumerate(setting_table):
                active_config = ActiveLearningConfig(
                    samples_per_cycle=samples_per_cycle,
                    model_name=model_name,
                    unlabeled_samples=600,
                    active_sampling_strategy=strategy,
                    balancing_factor=balancing,
                )
                run_train(
                    train_args,
                    initial_subset,
                    active_config,
                    run_id=run_id,
                    dataset_id=dataset_id,
                    setting=setting,
                    speed=speed,
                )
                # Collect garbage between runs before starting the next one.
                gc.collect()
                run_id += 1