<a href="https://colab.research.google.com/github/RicoStaedeli/NLP2025_CQG/blob/main/Training/3_Training_2_DPO_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Direct Preference Optimization for Critical Question Generation
In this notebook we train the pretrained vanilla LLM with DPO Training.
- **Model:** [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
- **Dataset:** [processed_train_data_filtered_dpo.json](../Data/Processed/CQ%20DPO%20Dataset.json)
- **Frameworks:** Unsloth, HuggingFace, transformers, bitsandbytes

## Setup
First we define some constant values and also install all needed libraries



### Installation

In [None]:
!pip install --no-deps xformers triton unsloth_zoo
!pip install sentencepiece protobuf huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import shutil
import os
import torch
from datasets import load_dataset
from trl import DPOTrainer
import logging
from transformers import TrainingArguments, EarlyStoppingCallback, IntervalStrategy, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

### Colab
This part is only relevant when using the notebook in google colab

In [3]:
from google.colab import userdata, drive

In [4]:
drive.mount('/content/drive')
token = userdata.get('GITHUB')

Mounted at /content/drive


Clone GitHub Repository to directly push generated files

In [5]:
repo_url = f"https://{token}@github.com/RicoStaedeli/NLP2025_CQG.git"

!git clone {repo_url}

Cloning into 'NLP2025_CQG'...
remote: Enumerating objects: 1603, done.[K
remote: Counting objects: 100% (251/251), done.[K
remote: Compressing objects: 100% (134/134), done.[K
remote: Total 1603 (delta 186), reused 143 (delta 116), pack-reused 1352 (from 2)[K
Receiving objects: 100% (1603/1603), 74.54 MiB | 15.76 MiB/s, done.
Resolving deltas: 100% (916/916), done.


### Path Variables and Logger

In [6]:
################################################################################
#######################   STATIC VARIABLES      ################################
################################################################################

TRAINING_NUMBER = 4
BASE_MODEL_REPO = "meta-llama/Llama-3.1-8B-Instruct"
MODEL_NAME = "Meta-Llama-3.1-8B-Instruct_DPO_1"

################################################################################
#######################   PATH VARIABLES        ################################
################################################################################

train_dataset_path = "/content/NLP2025_CQG/Data/Processed/CQ DPO Dataset.json"

log_base_path = f"/content/NLP2025_CQG/Training/Logs/Traing_{TRAINING_NUMBER}/Tensorboard/"
os.makedirs(log_base_path, exist_ok=True)

log_file_path = f"/content/NLP2025_CQG/Logs/training_{TRAINING_NUMBER}.log"

model_save_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Model/{MODEL_NAME}_finetuned/"
os.makedirs(model_save_path, exist_ok=True)

model_lora_adapter_save_path = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Model/{MODEL_NAME}_lora_adapters/"
os.makedirs(model_lora_adapter_save_path, exist_ok=True)


checkpoint_dir = f"/content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_{TRAINING_NUMBER}/Checkpoints/"
os.makedirs(checkpoint_dir, exist_ok=True)


################################################################################
#######################   LOGGER                ################################
################################################################################

# Setup logger manually
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create file handler (only if not already added)
if not logger.handlers:
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

# Detect device
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [7]:
logger.info("--------  Start with Training  -------------")
logger.info(f'Device selected: {device}')
logger.info(f'Model: {MODEL_NAME}')
logger.info(f'Training number: {TRAINING_NUMBER}')

INFO:__main__:--------  Start with Training  -------------
INFO:__main__:Device selected: cuda
INFO:__main__:Model: Meta-Llama-3.1-8B-Instruct_DPO_1
INFO:__main__:Training number: 4


## Training Parameters

In [8]:
################################################################################
#######################   Unlsoth Parameters    ################################
################################################################################

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


################################################################################
#######################   PEFT Parameters       ################################
################################################################################

r = 64 # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                  "gate_proj", "up_proj", "down_proj",]
lora_alpha = 64
lora_dropout = 0 # Supports any, but = 0 is optimized
bias = "none"    # Supports any, but = "none" is optimized
use_gradient_checkpointing = "unsloth" # True or "unsloth" for very long context
random_state = 3407
use_rslora = False  # Unsloth supports rank stabilized LoRA
loftq_config = None # And LoftQ

### Unsloth

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL_REPO,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [10]:
model = FastLanguageModel.get_peft_model(
    model,
    r = r,
    target_modules = target_modules,
    lora_alpha = lora_alpha,
    lora_dropout = lora_dropout,
    bias = bias,
    use_gradient_checkpointing = use_gradient_checkpointing,
    random_state = random_state,
    use_rslora = use_rslora,
    loftq_config = loftq_config,
)

Unsloth 2025.5.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Load and preprocess dataset
The raw dataset [SocratiQ](https://github.com/NUS-IDS/eacl23_soqg/tree/main) has a label at the begining of the context. We have to remove that and also tokenize the input for the model training.

In [None]:
import json
from datasets import Dataset

# Load your original dataset
with open(train_dataset_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

formatted_data = []

# Process each entry
for entry in raw_data:
    # Ensure score fields exist and are numeric
    if not ("score_chosen" in entry and "score_rejected" in entry):
        continue

    try:
        score_chosen = float(entry["score_chosen"])
        score_rejected = float(entry["score_rejected"])
    except (TypeError, ValueError):
        continue

    # Filter by score difference
    if abs(score_chosen - score_rejected) <= 4:
        continue

    # Build prompt
    messages = entry["prompt"]
    prompt_parts = []

    for message in messages:
        role = message["role"]
        content = message["content"].strip()
        if role == "user":
            prompt_parts.append(f"User: {content}")
        elif role == "assistant":
            prompt_parts.append(f"Assistant: {content}")

    full_prompt = "\n\n".join(prompt_parts) + "\n\nAssistant:"

    # Extract chosen and rejected responses (assumed to be lists)
    try:
        chosen_response = entry["chosen"][0]["content"].strip()
        rejected_response = entry["rejected"][0]["content"].strip()
    except (IndexError, KeyError, TypeError):
        continue

    formatted_data.append({
        "prompt": full_prompt,
        "chosen": chosen_response,
        "rejected": rejected_response
    })

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(formatted_data)
dataset.save_to_disk("cq_dpo_dataset_filtered")

In [12]:
from datasets import load_from_disk
dataset = load_from_disk("cq_dpo_dataset_filtered")

In [13]:
print(dataset)

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1572
})


In [14]:
print(dataset[0]['rejected'])

Is considering climate change a political issue detrimental to the billions of people who will be greatly affected by it, as well as future generations?


In [15]:
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)


In [16]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 1257
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 315
    })
})


## Define Training variables

In [None]:
from trl import DPOConfig
training_args = DPOConfig(
        do_eval=True,
        eval_strategy = "steps",
        save_strategy = "steps",
        eval_steps = 5,
        logging_steps = 1,
        max_steps = 40,
        warmup_ratio = 0.1,
        per_device_train_batch_size = 20,
        gradient_accumulation_steps = 4,
        per_device_eval_batch_size = 1,
        learning_rate = 5e-6,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        save_total_limit = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = checkpoint_dir,
        report_to = "tensorboard",
        logging_dir = log_base_path
)


from unsloth import PatchDPOTrainer
PatchDPOTrainer()

trainer = DPOTrainer(
    model,
    ref_model=None,
    args=training_args,
    beta=0.5,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    max_length = 1024,
    max_prompt_length = 512
)


In [18]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

logger.info(f"GPU  Information before Training")
logger.info(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
logger.info(f"{start_gpu_memory} GB of memory reserved.")

INFO:__main__:GPU  Information before Training
INFO:__main__:GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
INFO:__main__:7.623 GB of memory reserved.


In [19]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,257 | Num Epochs = 3 | Total steps = 40
O^O/ \_/ \    Batch size per device = 20 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (20 x 4 x 1) = 80
 "-____-"     Trainable parameters = 167,772,160/8,000,000,000 (2.10% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
5,0.676,0.692742,0.000202,-0.000897,0.520635,0.001099,-80.65461,-58.193073,-1.237724,-1.025032,0,0,0,0
10,0.4971,0.691041,-0.00575,-0.010552,0.526984,0.004802,-80.714127,-58.289635,-1.237507,-1.02416,No Log,No Log,No Log,No Log
15,0.3099,0.688899,-0.010854,-0.021021,0.498413,0.010167,-80.765167,-58.394321,-1.235197,-1.02204,No Log,No Log,No Log,No Log
20,0.1911,0.684706,-0.014775,-0.035333,0.555556,0.020558,-80.804375,-58.537437,-1.233967,-1.021422,No Log,No Log,No Log,No Log
25,0.1214,0.684612,-0.023641,-0.046536,0.495238,0.022895,-80.893044,-58.649471,-1.233927,-1.021017,No Log,No Log,No Log,No Log
30,0.1153,0.68242,-0.025046,-0.05461,0.52381,0.029564,-80.907082,-58.730209,-1.233941,-1.021265,No Log,No Log,No Log,No Log
35,0.1167,0.680431,-0.030331,-0.065643,0.55873,0.035312,-80.95993,-58.840538,-1.234215,-1.020993,No Log,No Log,No Log,No Log
40,0.0537,0.678499,-0.030139,-0.069782,0.590476,0.039642,-80.958023,-58.881927,-1.23511,-1.02202,No Log,No Log,No Log,No Log


In [20]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

logger.info(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
logger.info(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
logger.info(f"Peak reserved memory = {used_memory} GB.")
logger.info(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
logger.info(f"Peak reserved memory % of max memory = {used_percentage} %.")
logger.info(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

INFO:__main__:1470.9937 seconds used for training.
INFO:__main__:24.52 minutes used for training.
INFO:__main__:Peak reserved memory = 38.639 GB.
INFO:__main__:Peak reserved memory for training = 31.016 GB.
INFO:__main__:Peak reserved memory % of max memory = 97.679 %.
INFO:__main__:Peak reserved memory for training % of max memory = 78.408 %.


## Save
Save the finetuned model

### Save Lora delta weights

In [21]:
model.save_pretrained(model_lora_adapter_save_path)  # Local saving
tokenizer.save_pretrained(model_lora_adapter_save_path)
logger.info(f"Saved LoRA adapters to {model_lora_adapter_save_path}")

INFO:__main__:Saved LoRA adapters to /content/drive/MyDrive/HSG/NLP/Project NLP/Training/Training_4/Model/Meta-Llama-3.1-8B-Instruct_DPO_1_lora_adapters/


In [None]:
token = userdata.get('HF_TOKEN')

# Push Lora weights to HF
model.push_to_hub_merged(f"ricostaedeli/{MODEL_NAME}-lora", tokenizer, save_method = "lora", token = token)

### Saving merged model

Save merged model

In [None]:
# Merge to 16bit and save local
if False:
  model.save_pretrained_merged(model_save_path, tokenizer, save_method = "merged_16bit",)
  logger.info(f"Saved merged model in 16bit to {model_save_path}")

# Merge to 16bit and push to HF
if True:
  token = userdata.get('HF_TOKEN')
  model.push_to_hub_merged(f"ricostaedeli/{MODEL_NAME}", tokenizer, save_method="merged_16bit", token=token, private=True)

In [None]:
%load_ext tensorboard
%tensorboard --logdir="$log_dir"

In [25]:
os.chdir("NLP2025_CQG")
!ls


1_a_Generate_DPO_Dataset.ipynb	      Data
1_Information_preprocessing.md	      Development
1_Preprocessing_faster.ipynb	      Doc
1_Preprocessing.ipynb		      Evaluation
2_Baseline_Generation.ipynb	      INFORMATION.md
2_Information_Baseline_Generation.md  LICENSE
3_Evaluation.ipynb		      Logs
4_Finetuned_Generation.ipynb	      README.md
5_Evaluation_Analytics.ipynb	      Training


In [None]:
!git config --global user.name "Rico Städeli"
!git config --global user.email "rico@yabriga.ch"


commit_message = f"Training Number: {TRAINING_NUMBER}, Training logs in Google Drive"
!git add .
!git commit -m "{commit_message}"
!git push