<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/Training/3_Training_2_DPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training for critical question

## Setup
First we define some constant values and also install all needed libraries



### Installation

In [1]:
!pip install --no-deps xformers triton unsloth_zoo
!pip install sentencepiece protobuf huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U bitsandbytes

Collecting xformers
  Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.5.7-py3-none-any.whl.metadata (8.0 kB)
Downloading xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl (31.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.5/31.5 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading unsloth_zoo-2025.5.7-py3-none-any.whl (138 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.1/138.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, unsloth_zoo
Successfully installed unsloth_zoo-2025.5.7 xformers-0.0.30
Collecting unsloth
  Downloading unsloth-2025.5.6-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading unsloth-2025.5.6-py3-none-any.whl (265 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from unsloth import FastLanguageModel
import shutil
import os
import torch
from datasets import load_dataset
from trl import DPOTrainer
import logging
from transformers import TrainingArguments, EarlyStoppingCallback, IntervalStrategy, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.7.0+cu126 with CUDA 1206 (you have 2.6.0+cu124)
    Python  3.11.12 (you have 3.11.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


### Colab
This part is only relevant when using the notebook in google colab

In [3]:
from google.colab import userdata, drive

In [4]:
drive.mount('/content/drive')
token = userdata.get('GITHUB')

Mounted at /content/drive


Clone GitHub Repository to directly push generated files

In [5]:
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 1039, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 1039 (delta 91), reused 67 (delta 60), pack-reused 913 (from 1)[K
Receiving objects: 100% (1039/1039), 47.11 MiB | 24.71 MiB/s, done.
Resolving deltas: 100% (567/567), done.
Updating files: 100% (98/98), done.


### Path Variables and Logger

In [6]:
################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

TRAINING_NUMBER = 3
BASE_MODEL_REPO = "unsloth/Meta-Llama-3.1-8B-Instruct"
MODEL_NAME = "Meta-Llama-3.1-1B-Instruct_DPO"

################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

train_dataset_path = "/content/NLP2025_CQG/Data/Processed/processed_train_data_filtered_dpo.json" #"/content/NLP2025_CQG/Data/Processed/example_train.json"

log_base_path = f"/content/NLP2025_CQG/Training/Logs/Traing_{TRAINING_NUMBER}/Tensorboard/"
os.makedirs(log_base_path, exist_ok=True)

log_file_path = f"/content/NLP2025_CQG/Logs/training_{TRAINING_NUMBER}.log"

model_save_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Model/{MODEL_NAME}_finetuned/"
os.makedirs(model_save_path, exist_ok=True)

model_lora_adapter_save_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Model/{MODEL_NAME}_lora_adapters/"
os.makedirs(model_lora_adapter_save_path, exist_ok=True)


checkpoint_dir = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Checkpoints/"
os.makedirs(checkpoint_dir, exist_ok=True)


################################################################################
#######################   LOGGER                ################################
################################################################################

# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [7]:
logger.info("--------  Start with Training  -------------")
logger.info(f'Device selected: {device}')
logger.info(f'Model: {MODEL_NAME}')
logger.info(f'Training number: {TRAINING_NUMBER}')

INFO:__main__:--------  Start with Training  -------------
INFO:__main__:Device selected: cuda
INFO:__main__:Model: Meta-Llama-3.1-1B-Instruct_DPO
INFO:__main__:Training number: 3


## Training Parameters

In [8]:
################################################################################
#######################   Unlsoth Parameters    ################################
################################################################################

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


################################################################################
#######################   PEFT Parameters       ################################
################################################################################

r = 16 # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj",]
lora_alpha = 16
lora_dropout = 0 # Supports any, but = 0 is optimized
bias = "none"    # Supports any, but = "none" is optimized
use_gradient_checkpointing = "unsloth" # True or "unsloth" for very long context
random_state = 3407
use_rslora = False  # Unsloth supports rank stabilized LoRA
loftq_config = None # And LoftQ


################################################################################
#######################   SFT Trainer Parameters   #############################
################################################################################

dataset_text_field = "input_ids"
dataset_num_proc = 2
packing = False
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
warmup_steps = 5
max_steps = 10
learning_rate = 2e-4
fp16 = not is_bfloat16_supported()
bf16 = is_bfloat16_supported()
logging_steps = 1
save_strategy = IntervalStrategy.STEPS
save_steps = 1
save_total_limit = 1
optim = "adamw_8bit"
weight_decay = 0.01
lr_scheduler_type = "linear"
seed = 3407
output_dir = checkpoint_dir
report_to = "tensorboard"
logging_dir = log_base_path
evaluation_strategy="steps"
eval_steps=1

################################################################################
#######################   Log Parameters            ############################
################################################################################

logger.info("------ Unlsoth Parameters ---------------")
logger.info(f"max_seq_length: {max_seq_length}")
logger.info(f"dtype: {dtype}")
logger.info(f"load_in_4bit: {load_in_4bit}")

logger.info("------ PEFT Parameters ------------------")
logger.info(f"r: {r}")
logger.info(f"target_modules: {target_modules}")
logger.info(f"lora_alpha: {lora_alpha}")
logger.info(f"lora_dropout: {lora_dropout}")
logger.info(f"bias: {bias}")
logger.info(f"use_gradient_checkpointing: {use_gradient_checkpointing}")
logger.info(f"random_state: {random_state}")
logger.info(f"use_rslora: {use_rslora}")

logger.info("------  SFT Trainer Parameters ----------")
logger.info(f"dataset_text_field: {dataset_text_field}")
logger.info(f"dataset_num_proc: {dataset_num_proc}")
logger.info(f"packing: {packing}")
logger.info(f"per_device_train_batch_size: {per_device_train_batch_size}")
logger.info(f"gradient_accumulation_steps: {gradient_accumulation_steps}")
logger.info(f"warmup_steps: {warmup_steps}")
logger.info(f"max_steps: {max_steps}")
logger.info(f"learning_rate: {learning_rate}")
logger.info(f"fp16: {fp16}")
logger.info(f"bf16: {bf16}")
logger.info(f"logging_steps: {logging_steps}")
logger.info(f"save_strategy: {save_strategy}")
logger.info(f"save_steps: {save_steps}")
logger.info(f"save_total_limit: {save_total_limit}")
logger.info(f"optim: {optim}")
logger.info(f"weight_decay: {weight_decay}")
logger.info(f"lr_scheduler_type: {lr_scheduler_type}")
logger.info(f"seed: {seed}")
logger.info(f"output_dir: {output_dir}")
logger.info(f"report_to: {report_to}")
logger.info(f"logging_dir: {logging_dir}")
logger.info(f"evaluation_strategy: {evaluation_strategy}")
logger.info(f"eval_steps: {eval_steps}")

INFO:__main__:------ Unlsoth Parameters ---------------
INFO:__main__:max_seq_length: 2048
INFO:__main__:dtype: None
INFO:__main__:load_in_4bit: True
INFO:__main__:------ PEFT Parameters ------------------
INFO:__main__:r: 16
INFO:__main__:target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
INFO:__main__:lora_alpha: 16
INFO:__main__:lora_dropout: 0
INFO:__main__:bias: none
INFO:__main__:use_gradient_checkpointing: unsloth
INFO:__main__:random_state: 3407
INFO:__main__:use_rslora: False
INFO:__main__:------  SFT Trainer Parameters ----------
INFO:__main__:dataset_text_field: input_ids
INFO:__main__:dataset_num_proc: 2
INFO:__main__:packing: False
INFO:__main__:per_device_train_batch_size: 2
INFO:__main__:gradient_accumulation_steps: 4
INFO:__main__:warmup_steps: 5
INFO:__main__:max_steps: 10
INFO:__main__:learning_rate: 0.0002
INFO:__main__:fp16: False
INFO:__main__:bf16: True
INFO:__main__:logging_steps: 1
INFO:__main__:save_strategy: IntervalS

### Unsloth

In [9]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_REPO,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.5.6: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = r,
    target_modules = target_modules,
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    bias = bias,
    use_gradient_checkpointing = use_gradient_checkpointing,
    random_state = random_state,
    use_rslora = use_rslora,
    loftq_config = loftq_config,
)

Unsloth 2025.5.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Load and preprocess dataset
The raw dataset [SocratiQ](https://github.com/NUS-IDS/eacl23_soqg/tree/main) has a label at the begining of the context. We have to remove that and also tokenize the input for the model training.

In [18]:
dataset = load_dataset('json', data_files=train_dataset_path)

In [19]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 3128
    })
})


## Training

## Prepare Tokenizer

In [20]:
from unsloth.chat_templates import CHAT_TEMPLATES
print(list(CHAT_TEMPLATES.keys()))


['unsloth', 'zephyr', 'chatml', 'mistral', 'llama', 'vicuna', 'vicuna_old', 'vicuna old', 'alpaca', 'gemma', 'gemma_chatml', 'gemma2', 'gemma2_chatml', 'llama-3', 'llama3', 'phi-3', 'phi-35', 'phi-3.5', 'llama-3.1', 'llama-31', 'llama-3.2', 'llama-3.3', 'llama-32', 'llama-33', 'qwen-2.5', 'qwen-25', 'qwen25', 'qwen2.5', 'phi-4', 'gemma-3', 'gemma3', 'qwen-3', 'qwen3']


In [21]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",
)

## Prepare Dataset

In [22]:
print(dataset['train'][0])

{'prompt': [{'content': "Generate one critical question addressing the provided context. Ensure it matches the schema: 'Fear Appeal' with the examples:\n    Is <eventB> bad? Why and to whom is it bad?\n    Is <eventA> a way to prevent <eventB>?\n\nContext: implication_consequences: The argument isn't that school teachers' compensation is adequate. The argument is that everyone should be paid a living wage. Someone asked how much a living wage was, and OP responded with an estimation of 40-50k.", 'role': 'user'}], 'chosen': [{'content': 'How much would a big Mac be if every employee made 50k a year?', 'role': 'assistant'}], 'rejected': [{'content': 'Is paying school teachers a living wage a way to ensure fair compensation for all workers?', 'role': 'assistant'}]}


In [23]:
column_names = list(dataset["train"].features)
def apply_dpo_template(example):
  if all(k in example.keys() for k in ("chosen", "rejected","prompt")):
    # For DPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are the final turn of a dialogue
    prompt_messages = example["prompt"]
    chosen_messages = example["chosen"]
    rejected_messages = example["rejected"]

    example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
    example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
    example["text_prompt"] = tokenizer.apply_chat_template(prompt_messages, tokenize=False)
  return example

dataset = dataset.map(apply_dpo_template,remove_columns=column_names,
          desc="Formatting comparisons with prompt template",)

In [28]:
print(dataset['train'][0])

{'text_chosen': '<|begin_of_text|><|start_header_id|>assistant<|end_header_id|>\n\nBut does the persistence of the debate matter if we drastically reduce the demand for abortion?<|eot_id|>', 'text_rejected': '<|begin_of_text|><|start_header_id|>assistant<|end_header_id|>\n\nAre the motivations behind the anti-abortion movement and the push to limit unwanted pregnancies similar in their goals to reduce the need for abortions?<|eot_id|>', 'text_prompt': "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nGenerate one critical question addressing the provided context. Ensure it matches the schema: 'Analogy' with the examples:\n    Are <C1> and <C2> similar in the respect cited?\n    Is <eventA> true in <C1>?\n\nContext: implication_consequences: Certainly, a large portion of the anti-abortion crowd is simply adhering to partisanship, following a religious agenda, etc. And attempting to limit the need for abortions is the best way to attack the root issue, unwanted pregnancies, b

In [24]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text_chosen', 'text_rejected', 'text_prompt'],
        num_rows: 3128
    })
})


In [27]:
dataset = dataset['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)


DatasetDict({
    train: Dataset({
        features: ['text_chosen', 'text_rejected', 'text_prompt'],
        num_rows: 2502
    })
    test: Dataset({
        features: ['text_chosen', 'text_rejected', 'text_prompt'],
        num_rows: 626
    })
})


In [29]:
for split in ["train", "test"]:
    dataset[split] = dataset[split].rename_columns(
        {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
    )

In [30]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'prompt'],
        num_rows: 2502
    })
    test: Dataset({
        features: ['chosen', 'rejected', 'prompt'],
        num_rows: 626
    })
})


## Define Training variables

In [33]:
training_args = TrainingArguments(
        do_eval=True,
        eval_strategy = "steps",
        save_strategy = "steps",
        eval_steps = 5,
        logging_steps = 1,
        max_steps = 40,
        warmup_ratio = 0.1,
        per_device_train_batch_size = 32,
        gradient_accumulation_steps = 4,
        per_device_eval_batch_size = 1,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        save_total_limit = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = checkpoint_dir,
        report_to = "tensorboard",
        logging_dir = log_base_path
)


from unsloth import PatchDPOTrainer
PatchDPOTrainer()

trainer = DPOTrainer(
    model,
    ref_model=None,
    args=training_args,
    beta=0.1,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    max_length = 1024,
    max_prompt_length = 512
)


Extracting prompt in train dataset (num_proc=12):   0%|          | 0/2502 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=12):   0%|          | 0/2502 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=12):   0%|          | 0/2502 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=12):   0%|          | 0/626 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=12):   0%|          | 0/626 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=12):   0%|          | 0/626 [00:00<?, ? examples/s]

In [34]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

logger.info(f"GPU  Information before Training")
logger.info(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
logger.info(f"{start_gpu_memory} GB of memory reserved.")

INFO:__main__:GPU  Information before Training
INFO:__main__:GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
INFO:__main__:7.623 GB of memory reserved.


In [35]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,502 | Num Epochs = 4 | Total steps = 40
O^O/ \_/ \    Batch size per device = 25 | Gradient accumulation steps = 10
\        /    Data Parallel GPUs = 1 | Total batch size (25 x 10 x 1) = 250
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.50 GiB. GPU 0 has a total capacity of 39.56 GiB of which 9.01 GiB is free. Process 3641 has 30.50 GiB memory in use. Of the allocated memory 29.89 GiB is allocated by PyTorch, and 94.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

logger.info(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
logger.info(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
logger.info(f"Peak reserved memory = {used_memory} GB.")
logger.info(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
logger.info(f"Peak reserved memory % of max memory = {used_percentage} %.")
logger.info(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Save
Save the finetuned model

### Save Lora delta weights

In [None]:
model.save_pretrained(model_lora_adapter_save_path)  # Local saving
tokenizer.save_pretrained(model_lora_adapter_save_path)
logger.info(f"Saved LoRA adapters to {model_lora_adapter_save_path}")

In [None]:
token = userdata.get('HF_TOKEN')

# Push Lora weights to HF
model.push_to_hub_merged(f"ricostaedeli/{MODEL_NAME}-lora", tokenizer, save_method = "lora", token = token)

### Saving merged model

Save merged model

In [None]:
# Merge to 16bit and save local
if False:
  model.save_pretrained_merged(model_save_path, tokenizer, save_method = "merged_16bit",)
  logger.info(f"Saved merged model in 16bit to {model_save_path}")

# Merge to 16bit and push to HF
if True:
  token = userdata.get('HF_TOKEN')
  model.push_to_hub_merged(f"ricostaedeli/{MODEL_NAME}", tokenizer, save_method="merged_16bit", token=token, private=True)

In [None]:
%load_ext tensorboard
%tensorboard --logdir="$log_dir"

In [None]:
!git config --global user.name "Rico Städeli"
!git config --global user.email "rico@yabriga.ch"


commit_message = f"Training Number: {TRAINING_NUMBER}, Training logs in Google Drive"
!git add .
!git commit -m "{commit_message}"
!git push