This notebook fine-tunes the Llama-3.1-8B model using either standard fine-tuning or fine-tuning through curriculum learning with BS or BBS. To fine-tune the model we use QLoRA with 4-bit quantization. This notebook contributes to the results shown in Section 5.2.2.

This notebook is inspired by: https://github.com/unslothai/unsloth

# Packages

In [None]:
%%writefile requirements.txt
unsloth==2025.6.2
unsloth_zoo==2025.6.1
trl==0.15.2
xformers==0.0.29.post3
bitsandbytes==0.46.0
peft==0.15.2
accelerate==1.7.0
torch==2.6.0+cu124
transformers==4.52.4
datasets==3.6.0
pandas==2.2.2
scikit-learn==1.6.1
sentencepiece==0.2.0
huggingface-hub
hf_transfer

In [None]:
%%capture
!pip install --no-cache-dir -r requirements.txt

from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
import random
from datasets import Dataset, concatenate_datasets
import xml.etree.ElementTree as ET
import json
import ast
from tqdm.auto import tqdm
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding
from transformers.trainer_utils import EvalPrediction
import transformers.utils.logging
from sklearn.metrics import accuracy_score, f1_score, classification_report
import wandb

In [None]:
# Set seed for remainder of the code
# The same value seeds Python's `random`, NumPy and PyTorch, plus every CUDA device when one is available
seed_val = 546297
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)

# Functions

Load data

In [None]:
def load_data(file_path):
    """
    Load and preprocess the XML file into a pandas DataFrame.

    Every <Opinion> found under a <sentence> of a <Review> becomes one row.

    Args:
        file_path: Source handed to ``ET.parse`` (path of the XML file).

    Returns:
        A tuple ``(df, y)``. ``df`` has the columns 'sentence', 'aspect', 'category',
        'aspect_term_category', 'sentiment' and 'sentiment_label' (0 = negative,
        1 = neutral, 2 = positive; NaN for any other polarity). ``y`` is the array of
        raw 'sentiment' strings. A file without opinions yields an empty DataFrame
        with the same columns instead of raising a KeyError.
    """

    # Cleaned-up category names that are replaced by a more informative wording for the prompt
    category_aliases = {
        'food general': 'food style and options',
        'service general': 'service',
        'restaurant general': 'restaurant',
        'restaurant miscellaneous': 'restaurant',
        'ambience general': 'ambience',
        'location general': 'location',
    }
    columns = ["sentence", "aspect", "category", "aspect_term_category", "sentiment"]

    # Load and parse XML file
    root = ET.parse(file_path).getroot()

    # Iterate through the XML structure and extract information from the reviews
    data = []
    for review in root.findall('Review'):
        for sentence in review.findall('.//sentence'):
            text = sentence.find('text').text

            for opinion in sentence.findall('.//Opinion'):
                # Show nothing rather than 'NULL' in the prompt
                aspect = opinion.get('target')
                if aspect == 'NULL':
                    aspect = ''

                # 'ENTITY#ATTR_X' -> 'entity attr and x', then apply the aliases above
                category = opinion.get('category').lower().replace('#', ' ').replace('_', ' and ')
                category = category_aliases.get(category, category)

                data.append({
                    "sentence": text,
                    "aspect": aspect,
                    "category": category,
                    # Represent the aspect as 'term (category entity)'
                    "aspect_term_category": aspect + ' (' + category + ')',
                    "sentiment": opinion.get('polarity'),
                })

    # Passing the column names keeps the frame well-formed even when no opinion was found
    df = pd.DataFrame(data, columns=columns)
    y = df['sentiment'].values
    label_to_idx = {"negative": 0, "neutral": 1, "positive": 2}
    df['sentiment_label'] = df['sentiment'].map(label_to_idx)

    # Return both the features DataFrame and the sentiment labels
    return df, y

In [None]:
def zero_shot_classification(dataset):
    """
    Perform zero-shot classification by predicting the most likely sentiment.

    Relies on the notebook-level ``model`` and ``tokenizer``. For every row the prompt in
    'zero_shot_prompt' is run through the model and the class whose first token has the
    highest next-token probability after the prompt is selected.

    Args:
        dataset: Iterable of rows exposing a 'zero_shot_prompt' string.

    Returns:
        List with one predicted class name ('negative', 'neutral' or 'positive') per row.
    """

    # Define the classes and get their token IDs (only the first token of each class is scored)
    classes = ["negative", "neutral", "positive"]
    class_token_ids = [tokenizer.encode(c, add_special_tokens=False)[0] for c in classes]

    # Create list to store the predicted classes
    pred_sent = []

    # Run on whatever device the model lives on instead of assuming 'cuda'
    device = model.device

    # Inference must not use dropout; remember the current mode so it can be restored afterwards
    was_training = model.training
    model.eval()

    try:
        # Iterate over the dataset
        for row in tqdm(dataset):
            prompt = row['zero_shot_prompt']
            inputs = tokenizer(prompt, return_tensors="pt").to(device)

            # Perform inference without computing gradients to save memory
            with torch.no_grad():
                # This returns the outputs for all input tokens and the next one
                outputs = model(**inputs)

            # The predicted class should be at the last token (= logit.size(1) - 1)
            logits = outputs.logits
            prediction_position = logits.size(1) - 1

            # Retrieve the logits for the classes at the specified position (end of prompt)
            logits_for_prediction = logits[:, prediction_position, :]
            class_logits = logits_for_prediction[:, class_token_ids]

            # Obtain the class probabilities with softmax and the predicted class with argmax
            class_probs = F.softmax(class_logits, dim=-1)
            probs = class_probs.cpu().detach().float().numpy()
            pred_sent.append(classes[np.argmax(probs)])

            # To save memory
            del inputs, outputs, logits, logits_for_prediction, class_logits, class_probs
            torch.cuda.empty_cache()
    finally:
        # Leave the model in the mode the caller had it in (training continues between curriculum phases)
        if was_training:
            model.train()

    return pred_sent

In [None]:
class DataCollatorForLabelOnlyLoss(DataCollatorForLanguageModeling):
    """
    Collator that restricts the language-modeling loss to the sentiment label token.

    The parent collator builds the batch; afterwards every label position except the
    second-to-last unmasked one is set to ``ignore_index``.
    """

    def __init__(self, tokenizer, label_texts, ignore_index=-100, *args, **kwargs):
        super().__init__(tokenizer=tokenizer, mlm=False, *args, **kwargs)

        # Label value that the loss skips
        self.ignore_index = ignore_index

        # Token IDs of every sentiment label, keyed by the label text
        self.label_token_ids = {}
        for label in label_texts:
            self.label_token_ids[label] = tokenizer.encode(label, add_special_tokens=False)

    def torch_call(self, examples):

        # `examples` is a list of (tokenized) single data instances; let the parent collate them
        batch = super().torch_call(examples)
        all_labels = batch["labels"]

        for row_idx, row_labels in enumerate(all_labels):

            # Positions the parent left unmasked (presumably the non-padding tokens)
            kept_positions = (row_labels != self.ignore_index).nonzero(as_tuple=True)[0]

            # The sentiment label sits at the penultimate kept position
            # (assumes the last kept token is EOS and the label is a single token; confirm)
            label_pos = kept_positions[-2].item()

            # Keep only the label position: everything before and after it, EOS included, is masked
            all_labels[row_idx, :label_pos] = self.ignore_index
            all_labels[row_idx, label_pos + 1:] = self.ignore_index

        return batch

In [None]:
# Reduce log noise: only messages of level ERROR and above from the "transformers.trainer" logger are shown
transformers.utils.logging.get_logger("transformers.trainer").setLevel("ERROR")

class CustomSFTTrainer(SFTTrainer):
    """
    A custom SFTTrainer that allows us to compute the accuracy, f1-score and convergence metric of the validation set during training.
    This allows us to monitor the performance convergence of the model during training.

    Relies on the notebook-level ``CLASSES`` and ``max_seq_length``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Store the ID of the first token of every sentiment class, in the order of CLASSES
        self.class_token_ids = [
            self.tokenizer.encode(c, add_special_tokens=False)[0] for c in CLASSES
        ]

    def get_eval_dataloader(self, eval_dataset: Dataset = None) -> DataLoader:
        """
        Override the default dataloader to manually tokenize the input and format the data for the custom prediction_step function.

        Args:
            eval_dataset: Dataset with a "text" (prompt) and a "labels" (class index) column;
                falls back to ``self.eval_dataset`` when None.

        Returns:
            A DataLoader yielding dynamically padded batches with input_ids, attention_mask and labels.
        """

        # Use the provided evaluation dataset, or fall back to the one defined in the Trainer.
        current_eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset

        # Extract a list of the prompts ("text") and labels in the eval set and tokenize the prompts
        texts = current_eval_dataset["text"]
        metric_labels = current_eval_dataset["labels"]
        tokenized_inputs = self.tokenizer(texts, truncation=True, max_length=max_seq_length, padding=False)

        # Reconstruct the tokenized data from a columnar format (dict of lists) to a row format (list of dicts), where each dict is a single example.
        # This has to be done to format the data in the way the function prediction_step expects it
        processed_examples = []
        for i in range(len(texts)):
            example = {key: tokenized_inputs[key][i] for key in tokenized_inputs.keys()}
            example["labels"] = metric_labels[i]
            processed_examples.append(example)

        # Convert back to a huggingface dataset
        tokenized_eval_dataset = Dataset.from_list(processed_examples)

        # Initialize the new dataloader with our custom dataset; batches are padded to their longest sequence
        eval_loader = DataLoader(
            tokenized_eval_dataset,
            batch_size=self.args.eval_batch_size,
            collate_fn=DataCollatorWithPadding(self.tokenizer, padding="longest"),
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
        )

        return eval_loader

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """
        A custom prediction_step function that returns the predicted and true classes for every instance, allowing us to compute our custom evaluation metrics.
        Only the logits at the last prompt token are used, and no evaluation loss is returned, since we are less interested in it.

        Args:
            model: Model to evaluate.
            inputs: Batch with "input_ids", "attention_mask" and, optionally, "labels".
            prediction_loss_only: Unused; kept for interface compatibility.
            ignore_keys: Unused; kept for interface compatibility.

        Returns:
            Tuple ``(None, predicted_class_indices, true_class_indices)``. True labels are -100
            when the batch has no "labels" entry or its shape is neither (batch,) nor (batch, 1).
        """

        # Move the input tensors from the dataloader batch to the device of the model
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs["attention_mask"].to(model.device)

        # Safely extract the ground truth integer labels from the batch, regardless of the shape
        true_metric_labels = torch.full((input_ids.shape[0],), -100, device=model.device, dtype=torch.long)
        if "labels" in inputs:
            labels_from_input = inputs["labels"].to(model.device)
            if labels_from_input.ndim == 1:
                true_metric_labels = labels_from_input
            elif labels_from_input.ndim == 2 and labels_from_input.shape[1] == 1: # Squeeze if (batch_size, 1)
                true_metric_labels = labels_from_input.squeeze(1)

        # Perform inference without computing gradients to save memory
        with torch.no_grad():
            # This returns the outputs for all input tokens and the next one
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

        # Get the number of examples in current batch
        batch_size = input_ids.shape[0]

        # Locate the last real (attended) token of every prompt from the attention mask.
        # Counting tokens that differ from the pad id would give a wrong index with left padding
        # or when the pad id also occurs as a regular token in the prompt.
        positions = torch.arange(input_ids.shape[1], device=attention_mask.device)
        indices_of_last_prompt_token = (attention_mask.long() * positions).argmax(dim=-1)

        # The logits at the last prompt token are the prediction for the next token, i.e. the sentiment
        next_token_prediction_logits = logits[torch.arange(batch_size, device=model.device), indices_of_last_prompt_token, : ]

        # Compute the predicted class given the logits of the sentiment labels
        class_specific_logits = next_token_prediction_logits[:, self.class_token_ids]
        preds_class_indices = torch.argmax(class_specific_logits, dim=-1)

        return (None, preds_class_indices, true_metric_labels)


In [None]:
def compute_eval_metrics(eval_preds: EvalPrediction):
    """
    Compute the accuracy, macro f1-score, weighted f1-score, and convergence metric of the validation set.

    Args:
        eval_preds: Object whose ``predictions`` hold the predicted class indices and whose
            ``label_ids`` hold the true class indices.

    Returns:
        Dictionary with 'eval_accuracy', 'eval_f1_macro', 'eval_f1_weighted' and
        'convergence_metric' (accuracy plus 0.001 times the macro f1-score, so the macro
        f1-score only breaks ties between equal accuracies).
    """

    # Extract the indices of the predicted and true classes
    true_classes = eval_preds.label_ids
    predicted_classes = eval_preds.predictions

    # Real classes are non-negative. Drop every negative placeholder: -100 (missing label / padding)
    # as well as -1, which format_eval_prompt assigns to unknown sentiment words.
    valid_mask = true_classes >= 0
    filtered_true = true_classes[valid_mask]
    filtered_preds = predicted_classes[valid_mask]

    # Compute the evaluation metrics
    accuracy = accuracy_score(filtered_true, filtered_preds)
    f1_macro = f1_score(filtered_true, filtered_preds, average='macro', zero_division=0)
    f1_weighted = f1_score(filtered_true, filtered_preds, average='weighted', zero_division=0)

    # Create a dictionary with the obtained metrics
    metrics_dict = {
        "eval_accuracy": accuracy,
        "eval_f1_macro": f1_macro,
        "eval_f1_weighted": f1_weighted,
        "convergence_metric": accuracy + 0.001*f1_macro
    }

    return metrics_dict

In [None]:
def BS(dataset, num_subsets, cur_order):
    """
    This function partitions the data into buckets to perform curriculum learning with BS.

    The data is sorted on the complexity column and the buckets are cumulative: bucket i
    contains the first (i + 1) * ceil(n / num_subsets) sorted instances, capped at n, and
    the last bucket always contains the full dataset.

    Args:
        dataset: Dataset offering ``sort(column)``, ``select(indices)`` and ``len()``.
        num_subsets: Number of buckets to create.
        cur_order: Name of the column holding the complexity score to sort on.

    Returns:
        List of ``num_subsets`` datasets of non-decreasing size.
    """

    sorted_data = dataset.sort(cur_order)
    total = len(sorted_data)

    # Ceiling division without going through floats
    subset_size = -(-total // num_subsets)

    # Create the buckets one by one
    subsets = []
    for i in range(num_subsets):
        if i < num_subsets - 1:
            # Cap the end index: with rounding up it can run past the data (e.g. 5 instances, 4 buckets)
            end_index = min((i + 1) * subset_size, total)
        else:
            end_index = total
        subsets.append(sorted_data.select(range(0, end_index)))

    return subsets

In [None]:
def BBS(dataset, num_subsets, cur_order):
    """
    This function partitions the data into proportional buckets to perform curriculum learning with BBS.

    The data is split per sentiment class and each class is sorted on the complexity column.
    Bucket i combines, for every class, its first (i + 1) * ceil(n_class / num_subsets)
    instances (capped at n_class), so each bucket keeps a class distribution proportional
    to the full dataset. The last bucket contains all data. Every bucket is shuffled with
    its index as seed.

    Args:
        dataset: Huggingface dataset with a 'sentiment' column and the ``cur_order`` column.
        num_subsets: Number of buckets to create.
        cur_order: Name of the column holding the complexity score to sort on.

    Returns:
        List of ``num_subsets`` shuffled datasets of non-decreasing size.
    """

    def _sorted_class(label):
        # All instances of one sentiment class, ordered from low to high complexity
        return dataset.filter(lambda example: example['sentiment'] == label).sort(cur_order)

    # Partition the data based on their sentiment classes
    class_data = [_sorted_class(label) for label in ('negative', 'neutral', 'positive')]

    # Number of instances each class adds per bucket (ceiling division)
    class_steps = [-(-len(part) // num_subsets) for part in class_data]

    # Create the buckets one by one
    subsets = []
    for i in range(num_subsets):
        is_last = i == num_subsets - 1

        selected = []
        for part, step in zip(class_data, class_steps):
            # Cap the end index: with rounding up it can run past a small class
            # (e.g. a class with 1 instance and 3 buckets); the last bucket takes everything
            end_index = len(part) if is_last else min((i + 1) * step, len(part))
            selected.append(part.select(range(0, end_index)))

        # Concatenate the subsets of the different classes back together
        subsets.append(concatenate_datasets(selected).shuffle(seed = i))

    return subsets

# Prompts

The prompt used for zero-shot classification

In [None]:
# Prompt template: first placeholder is the sentence, second the aspect; the model continues after '### Sentiment:'
zero_shot_prompt = """Classify the sentiment expressed towards the given aspect within the provided sentence as 'negative', 'neutral' or 'positive'.

### Sentence:
{}

### Aspect:
{}

### Sentiment:
"""

def format_zero_shot_prompts(data):
    """
    Build the zero-shot classification prompt for every instance of a batch.

    Reads the 'sentence' and 'aspect_term_category' columns of `data` and returns
    the filled-in prompts under the key 'zero_shot_prompt'.
    """

    # Pair every sentence with its aspect and fill the template
    filled = [
        zero_shot_prompt.format(sentence, aspect)
        for sentence, aspect in zip(data["sentence"], data["aspect_term_category"])
    ]

    return {"zero_shot_prompt": filled}

The prompt used for model evaluation during training

In [None]:
# Prompt template: first placeholder is the sentence, second the aspect; no label is appended
eval_prompt_template = """Classify the sentiment expressed towards the given aspect within the provided sentence as 'negative', 'neutral' or 'positive'.

### Sentence:
{}

### Aspect:
{}

### Sentiment:
"""

CLASSES = ["negative", "neutral", "positive"]
# Class name -> class index, following the order of CLASSES
word_to_id = dict(zip(CLASSES, range(len(CLASSES))))

def format_eval_prompt(data):
    """
    Prepare a batch of the validation set for evaluation during training.

    Returns the filled-in prompts under 'text' and the index of the true sentiment
    class under 'labels'. A sentiment that is not in CLASSES gets label -1 and a warning is printed.
    """

    texts = []
    label_indices = []

    rows = zip(data["sentence"], data["aspect_term_category"], data["sentiment"])
    for sentence, aspect, sentiment in rows:

        # Fill the template
        texts.append(eval_prompt_template.format(sentence, aspect))

        # Translate the sentiment word into its class index (-1 when unknown)
        class_idx = word_to_id.get(sentiment, -1)
        if class_idx == -1:
            print(f"Warning: Unknown sentiment word '{sentiment}' found in eval data. Assigning label -1.")
        label_indices.append(class_idx)

    return {"text": texts, "labels": label_indices}

The prompt used during the fine-tuning of the model

In [None]:
# Prompt template: sentence, aspect and, directly after '### Sentiment:', the true sentiment
fine_tune_prompt = """Classify the sentiment expressed towards the given aspect within the provided sentence as 'negative', 'neutral' or 'positive'.

### Sentence:
{}

### Aspect:
{}

### Sentiment:
{}"""

def format_fine_tune_prompts(data):
    """
    Build the training prompt for every instance of a batch.

    Reads the 'sentence', 'aspect_term_category' and 'sentiment' columns of `data`, fills the
    template and appends the EOS token of the notebook-level `tokenizer`.
    The prompts are returned under the key 'text'.
    """

    # The EOS token marks the end of every training example
    eos = tokenizer.eos_token

    rows = zip(data["sentence"], data["aspect_term_category"], data["sentiment"])
    texts = [fine_tune_prompt.format(sentence, aspect, sentiment) + eos for sentence, aspect, sentiment in rows]

    return {"text": texts}

# Main

To run this code, first upload the files:

*   '201x_Restaurants_Train.xml'
*   '201x_Restaurants_Test.xml'
*   'swn_complexity_scores_201x.csv'
*   'sentence_lengths_201x.csv'
*   'ce_llama_201x.csv'

with 201x being either 2015 or 2016.

Once you have completed the fine-tuning, computed the evaluation metrics, and saved the results, reconnect the runtime before running another case (different dataset/training procedure).

## Load the model

Load model

In [None]:
# Checkpoint identifier that is passed to FastLanguageModel.from_pretrained in the next cell
model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

In [None]:
# Settings shared by the model loading below and the trainers further down (max_seq_length)
max_seq_length = 1024
dtype = None
load_in_4bit = True
CLASSES = ["negative", "neutral", "positive"]
#wandb.login()
#os.environ["WANDB_PROJECT"] = "project_name" # Specify wandb project name (requires `import os`)

# Pass the variables defined above rather than repeating the literals, so that changing a
# setting in one place cannot leave the model and the trainers out of sync
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# Wrap the loaded model with LoRA adapters; the result replaces the notebook-level `model`
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRA rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",], # Modules that receive an adapter
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 546297, # Same value as seed_val set at the top of the notebook
    use_rslora = False,
    loftq_config = None,
)

## Load the data

Load either the 2015 or 2016 data, perform the train-validation split and format the prompts

### 2015

In [None]:
# Load train data
df_train, y_train = load_data('2015_Restaurants_Train.xml')
df_train['original_indices'] = df_train.index
# Attach the complexity scores that BS/BBS can sort on ('sl', 'swn', 'ce')
# NOTE(review): each CSV is assigned as a whole DataFrame to one column; this assumes a
# single-column file whose rows line up with df_train - confirm against the CSV files
df_train['sl'] = pd.read_csv('sentence_lengths_2015.csv')
df_train['swn'] = pd.read_csv('swn_complexity_scores_2015.csv')
df_train['ce'] = pd.read_csv('ce_llama_2015.csv')

# Load test data
df_test, y_test = load_data('2015_Restaurants_Test.xml')
df_test['original_indices'] = df_test.index

# Convert to huggingface Datasets
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

### 2016

In [None]:
# Load train data
df_train, y_train = load_data('2016_Restaurants_Train.xml')
df_train['original_indices'] = df_train.index
# Attach the complexity scores that BS/BBS can sort on ('sl', 'swn', 'ce')
# NOTE(review): each CSV is assigned as a whole DataFrame to one column; this assumes a
# single-column file whose rows line up with df_train - confirm against the CSV files
df_train['sl'] = pd.read_csv('sentence_lengths_2016.csv')
df_train['swn'] = pd.read_csv('swn_complexity_scores_2016.csv')
df_train['ce'] = pd.read_csv('ce_llama_2016.csv')

# Load test data
df_test, y_test = load_data('2016_Restaurants_Test.xml')
df_test['original_indices'] = df_test.index

# Convert to huggingface Datasets
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

### Split data and format the prompts

In [None]:
# Perform the train-validation split (80% train, 20% validation, fixed seed)
split = dataset_train.train_test_split(test_size=0.2, seed = 123)

# Get the sentiment labels for the train, validation and test set
# (this overwrites the y_train and y_test returned by load_data; split['test'] is the validation set)
y_train = split['train']['sentiment']
y_val = split['test']['sentiment']
y_test = df_test['sentiment']

# Format the prompts for every set; the original columns are dropped so only the prompt columns remain
# train: prompt + label + EOS ("text"), validation: prompt + class index ("text", "labels"), test: prompt only ("zero_shot_prompt")
train_set = split['train'].map(format_fine_tune_prompts, batched=True, remove_columns=split['train'].column_names)
eval_set = split['test'].map(format_eval_prompt, batched=True, remove_columns=split['test'].column_names)
test_set = dataset_test.map(format_zero_shot_prompts, batched=True, remove_columns=dataset_test.column_names)

## Baseline

Train the model with standard fine-tuning

In [None]:
# Initialize the custom SFFTrainer
trainer = CustomSFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_set,
    eval_dataset = eval_set,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    data_collator = DataCollatorForLabelOnlyLoss(tokenizer=tokenizer, label_texts=CLASSES), # The custom data collator
    compute_metrics = compute_eval_metrics, # The custom convergence metrics
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)], # Perform early stopping
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 10,
        learning_rate = 2e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 25,
        eval_steps = 25,
        eval_strategy='steps',
        save_steps = 25,
        save_strategy='steps',
        load_best_model_at_end=True,
        metric_for_best_model='convergence_metric', # Early stopping based on the convergence metric
        greater_is_better=True,
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "cosine",
        seed = 546297,
        output_dir = "output_dir",
        run_name="run_name", # Specify wandb run name
        report_to = "none", # Switch to "wandb" if desired
    ),
)

# Train the model
trainer_stats = trainer.train()
print(trainer_stats.metrics)

Compute the evaluation metrics for the test, train and validation set

In [None]:
# Compute the evaluation metrics and save the results for the test set
y_pred_test = zero_shot_classification(test_set)
print(classification_report(y_test, y_pred_test, digits = 4))
pd.Series(y_pred_test).to_csv('llama2016_baseline_test.csv', index=False)

In [None]:
# Compute the evaluation metrics and save the results for the train set
train_class = split['train'].map(format_zero_shot_prompts, batched=True,)
y_pred_train = zero_shot_classification(train_class)
print(classification_report(y_train, y_pred_train, digits = 4))
pd.Series(y_pred_test).to_csv('llama2016_baseline_train.csv', index=False)

In [None]:
# Compute the evaluation metrics and save the results for the validation set
eval_class = split['test'].map(format_zero_shot_prompts, batched=True,)
y_pred_val = zero_shot_classification(eval_class)
print(classification_report(y_val, y_pred_val, digits = 4))
pd.Series(y_pred_test).to_csv('llama2016_baseline_val.csv', index=False)

## BS

Create training buckets for BS and use 'sl', 'swn', or 'ce' as a complexity measure in the last argument

In [None]:
# Three cumulative buckets of the training split, ordered on the 'ce' complexity column
train_subsets = BS(split['train'], 3, 'ce')

Train the model with curriculum learning using BS

In [None]:
# Initialize the settings for every training phase, note that we train on the easy data for 1 epoch, while we train on the medium and hard data until convergence
phase_configs = [
    {"name": "easy", "num_train_epochs": 1,  "learning_rate": 2e-5,  "warmup_ratio": 0.1, "eval_steps": 20, "callbacks": [EarlyStoppingCallback(early_stopping_patience=100)], "load_model": False},
    {"name": "medium", "num_train_epochs": 7, "learning_rate": 2e-5, "warmup_ratio": 0.06, "eval_steps": 25, "callbacks": [EarlyStoppingCallback(early_stopping_patience=5)], "load_model": True},
    {"name": "hard", "num_train_epochs": 10, "learning_rate": 2e-5,"warmup_ratio": 0.03, "eval_steps": 30, "callbacks" : [EarlyStoppingCallback(early_stopping_patience=5)], "load_model": True}
]

# Iterate over the training phases
for phase_idx, (subset, config) in enumerate(zip(train_subsets, phase_configs)):
    print(f"\n=== Starting {config['name']} phase ===")
    train_subset = subset.map(format_fine_tune_prompts, batched=True, remove_columns=subset.column_names)

    # Initialize the custom SFFTrainer for each phase
    trainer = CustomSFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_subset,
        eval_dataset = eval_set,
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False,
        data_collator = DataCollatorForLabelOnlyLoss(tokenizer=tokenizer, label_texts=CLASSES), # The custom data collator
        compute_metrics = compute_eval_metrics, # The custom evaluation metrics
        callbacks= config["callbacks"], # Perform early stopping (for medium and hard)
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_ratio = config["warmup_ratio"],
            num_train_epochs = config["num_train_epochs"],
            learning_rate = config["learning_rate"],
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = config["eval_steps"],
            eval_steps = config["eval_steps"],
            eval_strategy='steps',
            save_steps = config["eval_steps"],
            save_strategy='steps',
            load_best_model_at_end=config["load_model"],
            metric_for_best_model='convergence_metric', # Early stopping based on the convergence metric
            greater_is_better=True,
            optim = "adamw_8bit",
            weight_decay = 0.1,
            lr_scheduler_type = "cosine",
            seed = 546297,
            output_dir = "output-dir",
            run_name=f"run_name", # Specify wandb run name
            report_to = "none", # Switch to "wandb" if desired
        ),
    )

    # Train the model
    trainer_stats = trainer.train()
    print(trainer_stats)

    # Compute the performance on the test set to get insight into model behaviour for every phase
    y_pred_test = zero_shot_classification(test_set)
    print(classification_report(y_test, y_pred_test, digits = 4))

# Save the predictions of the final model
pd.Series(y_pred_test).to_csv('llama2015_obs_ce_test.csv', index=False)

In [None]:
# Compute the evaluation metrics and save the results for the train set
train_class = split['train'].map(format_zero_shot_prompts, batched=True,)
y_pred_train = zero_shot_classification(train_class)
print(classification_report(y_train, y_pred_train, digits = 4))
pd.Series(y_pred_train).to_csv('llama2015_obs_ce_train.csv', index=False)

In [None]:
# Compute the evaluation metrics and save the results for the validation set
eval_class = split['test'].map(format_zero_shot_prompts, batched=True,)
y_pred_val = zero_shot_classification(eval_class)
print(classification_report(y_val, y_pred_val, digits = 4))
pd.Series(y_pred_val).to_csv('llama2015_obs_ce_val.csv', index=False)

## BBS

Create training buckets for BBS and use 'sl', 'swn', or 'ce' as a complexity measure in the last argument

In [None]:
# Three cumulative, class-proportional buckets of the training split, ordered on the 'swn' complexity column
train_subsets = BBS(split['train'], 3, 'swn')

Train the model with curriculum learning using BBS

In [None]:
# Initialize the settings for every training phase, note that we train on the easy data for 1 epoch, while we train on the medium and hard data until convergence
phase_configs = [
    {"name": "easy", "num_train_epochs": 1,  "learning_rate": 2e-5,  "warmup_ratio": 0.1, "eval_steps": 20, "callbacks": [EarlyStoppingCallback(early_stopping_patience=100)], "load_model": False},
    {"name": "medium", "num_train_epochs": 7, "learning_rate": 2e-5, "warmup_ratio": 0.06, "eval_steps": 25, "callbacks": [EarlyStoppingCallback(early_stopping_patience=5)], "load_model": True},
    {"name": "hard", "num_train_epochs": 10, "learning_rate": 2e-5,"warmup_ratio": 0.03, "eval_steps": 30, "callbacks" : [EarlyStoppingCallback(early_stopping_patience=5)], "load_model": True}
]

# Iterate over the training phases
for phase_idx, (subset, config) in enumerate(zip(train_subsets, phase_configs)):
    print(f"\n=== Starting {config['name']} phase ===")
    train_subset = subset.map(format_fine_tune_prompts, batched=True, remove_columns=subset.column_names)

    # Initialize the custom SFFTrainer for each phase
    trainer = CustomSFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_subset,
        eval_dataset = eval_set,
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False,
        data_collator = DataCollatorForLabelOnlyLoss(tokenizer=tokenizer, label_texts=CLASSES), # The custom data collator
        compute_metrics = compute_eval_metrics, # The custom evaluation metrics
        callbacks= config["callbacks"], # Perform early stopping (for medium and hard)
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_ratio = config["warmup_ratio"],
            num_train_epochs = config["num_train_epochs"],
            learning_rate = config["learning_rate"],
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = config["eval_steps"],
            eval_steps = config["eval_steps"],
            eval_strategy='steps',
            save_steps = config["eval_steps"],
            save_strategy='steps',
            load_best_model_at_end=config["load_model"],
            metric_for_best_model='convergence_metric', # Early stopping based on the convergence metric
            greater_is_better=True,
            optim = "adamw_8bit",
            weight_decay = 0.1,
            lr_scheduler_type = "cosine",
            seed = 546297,
            output_dir = "output-dir",
            run_name=f"run_name", # Specify wandb run name
            report_to = "none", # Switch to "wandb" if desired
        ),
    )

    # Train the model
    trainer_stats = trainer.train()
    print(trainer_stats)

    # Compute the performance on the test set to get insight into model behaviour for every phase
    y_pred_test = zero_shot_classification(test_set)
    print(classification_report(y_test, y_pred_test, digits = 4))

# Save the predictions of the final model
pd.Series(y_pred_test).to_csv('llama2016_bs_ce_test.csv', index=False)

In [None]:
# Compute the evaluation metrics and save the results for the train set
train_class = split['train'].map(format_zero_shot_prompts, batched=True,)
y_pred_train = zero_shot_classification(train_class)
print(classification_report(y_train, y_pred_train, digits = 4))
pd.Series(y_pred_train).to_csv('llama2016_bs_ce_train.csv', index=False)

In [None]:
# Compute the evaluation metrics and save the results for the validation set
eval_class = split['test'].map(format_zero_shot_prompts, batched=True,)
y_pred_val = zero_shot_classification(eval_class)
print(classification_report(y_val, y_pred_val, digits = 4))
pd.Series(y_pred_val).to_csv('llama2016_bs_ce_val.csv', index=False)