In [1]:
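# Colab setup (disabled): the string below is never executed. When running on Google Colab, paste its lines into a separate cell.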
'''
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd /content/drive/MyDrive/aml_final/aml_final/
! git pull
! pip install setfit
'''
from datasets import load_dataset, Dataset
from setfit import SetFitModel, Trainer, TrainingArguments
import torch, gc

from data.dataset_config import DatasetConfig
from train.active_learning import ActiveTrainer, create_random_subset, add_random_samples
from train.active_learning_config import ActiveLearningConfig
from train.reporter import Reporter
from train.metrics import camprehesive_metrics
from data.load_datasets import load_spanish_dataset

In [3]:
# Experiment configuration. Lines marked "change" are the per-dataset knobs.
num_classes = 4  # change
# Used below both as the size of the initial labelled subset and as the number
# of samples added per active-learning cycle.
samples_per_cycle = num_classes * 2
dataset_name = "twitter_humor"  # change: single source of truth for loading and for report file names
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"  # change

# Load via `dataset_name` (before it is sanitised below) so that the data and
# the report file names can never refer to different datasets.
dataset = load_spanish_dataset(dataset_name)
dataset_config = DatasetConfig(text_column="tweet", num_classes=num_classes)  # change?


train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# From here on `dataset_name` is only used for file names and report columns;
# "/" would be interpreted as a directory separator.
dataset_name = dataset_name.replace("/", "_")  # prevent file system errors
final_reporter = Reporter(dataset_name + "_final.csv", label_column=dataset_config.label_column)
# Only referenced by after_train_callback, which is currently not handed to ActiveTrainer.
cycle_reporter = Reporter(dataset_name + "_cycle.csv", report_train_args=False, label_column=dataset_config.label_column)
def after_train_callback(trainer: Trainer, dataset: Dataset, run_id: int):
    """Write one entry to the per-cycle report.

    Args:
        trainer: Trainer handed over by the caller of this callback.
        dataset: Dataset handed over by the caller of this callback.
        run_id: Identifier of the run the entry belongs to.
    """
    # Everything is forwarded untouched to the cycle reporter.
    report_kwargs = {"trainer": trainer, "dataset": dataset, "run_id": run_id}
    cycle_reporter.report(**report_kwargs)

In [5]:
# Settings shared by every active-learning run started from this notebook.
# SetFit reads a tuple-valued argument as (embedding phase, classifier phase).

# Passed to ActiveTrainer as `train_args`.
cycle_train_args = TrainingArguments(
    num_iterations=10,
    num_epochs=(1, 8),
)
# Passed to ActiveTrainer as `final_model_train_args`.
final_train_args = TrainingArguments(
    num_iterations=20,
    num_epochs=(1, 16),
)
# Global run counter; run_train() increments it after each reported run.
run_id = 0

def run_train(initial_train_subset: Dataset, active_learning_config: ActiveLearningConfig, **kwargs):
    """Run one experiment and write its result to the final report.

    Args:
        initial_train_subset: Labelled samples the run starts from.
        active_learning_config: Sampling strategy and labelling budget of the run.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``final_reporter.report``.

    Side effects:
        Reads the module-level ``run_id`` (used as run identifier and as seed
        for the random baseline) and increments it after the report call.
    """
    global run_id
    trainer = ActiveTrainer(
        full_train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        initial_train_subset=initial_train_subset,
        train_args=cycle_train_args,
        active_learning_config=active_learning_config,
        dataset_config=dataset_config,
        metric=camprehesive_metrics,
        run_id=run_id,
        final_model_train_args=final_train_args,
        #after_train_callback=after_train_callback, #just slows down training
    )
    if active_learning_config.active_sampling_strategy == "random":
        # Speed-up for the random baseline: draw the complete budget up front
        # and train a single final model instead of going through the cycles.
        # The budget is taken from the config (not from the notebook global) so
        # the baseline always gets the same number of samples as this config
        # would give an active strategy.
        total_samples = len(initial_train_subset) + (
            active_learning_config.samples_per_cycle * active_learning_config.active_learning_cycles
        )
        trainer.train_subset = add_random_samples(
            train_dataset,
            initial_train_subset,
            total_samples,
            dataset_config,
            seed=run_id,
        )
        trained = trainer.run_training(final_model=True)
    else:
        trained = trainer.train()
    final_reporter.report(
        trainer=trained,
        dataset=trainer.train_subset,
        active_learning_config=active_learning_config,
        dataset_name=dataset_name,
        run_id=run_id,
        **kwargs,
    )
    run_id += 1

In [6]:
def run_hyperparam_search(initial_train_subset: Dataset):
    """Train once per sampling setting, each time from the same initial subset.

    Args:
        initial_train_subset: Labelled samples every setting starts from.
    """
    # (setting label, active_sampling_strategy, balancing_factor)
    settings = (
        ("random", "random", 0.0),
        ("max_entropy", "max_entropy", None),
        ("max_entropy_balanced", "max_entropy", 0.25),
    )
    for label, sampling_strategy, balancing_factor in settings:
        al_config = ActiveLearningConfig(
            samples_per_cycle=samples_per_cycle,
            active_sampling_strategy=sampling_strategy,
            balancing_factor=balancing_factor,
            unlabeled_samples=10 * samples_per_cycle,
            model_name=model_name,
        )
        # The label is forwarded as an extra keyword argument of the run.
        run_train(initial_train_subset, al_config, setting=label)
    

In [7]:
# Repeat the comparison of all settings on 7 differently seeded initial subsets.
for dataset_seed in range(7):
    initial_train_subset = create_random_subset(
        train_dataset,
        dataset_config,
        num_samples=samples_per_cycle,
        seed=dataset_seed,
    )
    run_hyperparam_search(initial_train_subset)

# Disabled follow-up experiment with a 4x larger per-cycle budget.
# NOTE: once enabled, the first line rescales samples_per_cycle on every
# execution of this cell; re-run the configuration cell before repeating it.
#samples_per_cycle = 4* samples_per_cycle # larger dataset
#for dataset_seed in range(7):
#    initial_train_subset = create_random_subset(train_dataset, dataset_config, num_samples=samples_per_cycle, seed=dataset_seed)
#    run_hyperparam_search(initial_train_subset)

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/8 [00:00<?, ? examples/s]

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 80
  Num epochs = 1
  Total optimization steps = 80
  Total train batch size = 16


  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]