## DPO Training

In [None]:
# Install the project requirements, plus unsloth and scipy which later cells import
!pip install -r requirements.txt
!pip install unsloth
!pip install scipy

In [None]:
from unsloth import FastLanguageModel, PatchDPOTrainer
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOConfig, DPOTrainer

# Clear GPU memory cache
torch.cuda.empty_cache()

# Set default device to CUDA
torch.set_default_device("cuda")

# Load model and corresponding tokenizer in 4-bit precision
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Llama-3.2-3B-Instruct",
    dtype=None,
    load_in_4bit=True,
    max_seq_length=2048,
)

# Checkpoints this notebook has been run with. Hub repository IDs have the
# form "<namespace>/<name>", so each entry carries exactly one "unsloth/" prefix.
models_used = [
    "unsloth/Llama-3.2-3B-Instruct",
    "unsloth/gemma-2-9b-it",
    "unsloth/Mistral-7B-Instruct-v0.3",
]

# Patch DPOTrainer for fast training
PatchDPOTrainer()

In [None]:
# Attach LoRA adapters so that only part of the model's parameters is updated
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter hyper-parameters
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Modules that receive an adapter
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

In [None]:
from datasets import load_dataset, Dataset

# Load preference dataset
train_dataset = load_dataset("gz25/JGTV_Pref_DS_Instruct", split="train")
# Report the row count before any formatting or filtering is applied
print(len(train_dataset))

def _guess_letter(answer):
    """Return the first character after the first "Guess: " marker, or None."""
    if "Guess: " not in answer:
        return None
    tail = answer.split("Guess: ")[1]
    return tail[0] if tail else None


def _confidence_percent(answer):
    """Return the integer that follows the first "Probability: " marker, or None.

    A "%" sign is tolerated; anything that does not parse as an integer
    yields None instead of raising.
    """
    if "Probability: " not in answer:
        return None
    raw = answer.split("Probability: ")[1].replace("%", " ")
    try:
        return int(raw)  # int() ignores the surrounding whitespace
    except ValueError:
        return None


# Convert a raw preference row to DPO format and flag rows whose 'chosen' or
# 'rejected' answer is missing or malformed so that they can be filtered out
def formatting_func(data):
    """Build the DPO ``prompt``/``chosen``/``rejected`` strings for one row.

    Args:
        data: Mapping with the text fields ``prompt``, ``chosen_llama`` and
            ``rejected_llama``. Missing (None) fields are treated as empty.

    Returns:
        A dict with the chat-templated ``prompt``, the ``chosen`` and
        ``rejected`` answers (each followed by the tokenizer's EOS token) and
        a boolean ``valid``. A row is valid when both answers contain a
        "Guess: " letter and a positive integer "Probability: " value, and
        the two probabilities differ numerically. Malformed rows are marked
        invalid rather than raising, so a single bad row cannot abort the
        whole ``map`` call.
    """
    prompt = (data['prompt'] or "").strip()
    preferred = (data['chosen_llama'] or "").strip()
    rejected = (data['rejected_llama'] or "").strip()

    preferred_letter = _guess_letter(preferred)
    rejected_letter = _guess_letter(rejected)
    preferred_conf = _confidence_percent(preferred)
    rejected_conf = _confidence_percent(rejected)

    isvalid = (
        preferred_letter is not None
        and rejected_letter is not None
        and preferred_conf is not None
        and rejected_conf is not None
        # Compared as numbers so that "80" and "80%" count as the same value
        and preferred_conf != rejected_conf
        and preferred_conf > 0
        and rejected_conf > 0
    )

    prompt_temp = [
        {"role": "user", "content": prompt},
    ]
    EOS_TOKEN = tokenizer.eos_token
    prompt = tokenizer.apply_chat_template(prompt_temp, tokenize=False, add_generation_prompt=True)

    return {
        'prompt': prompt,
        'chosen': preferred + EOS_TOKEN,
        'rejected': rejected + EOS_TOKEN,
        'valid': isvalid,
    }

# Every original column plus the helper 'valid' flag is dropped, except the
# three fields that DPO training consumes
columns_to_drop = [
    name
    for name in train_dataset.column_names + ['valid']
    if name not in ['prompt', 'chosen', 'rejected']
]

# Format each row, keep only the rows flagged as valid, then drop the extras
train_dataset = (
    train_dataset
    .map(formatting_func)
    .filter(lambda row: row["valid"])
    .remove_columns(columns_to_drop)
)

print(len(train_dataset), train_dataset[0])

In [None]:
from datasets import load_dataset
import random

# TruthfulQA (multiple-choice configuration, validation split) serves as the
# evaluation set during training
test_dataset = load_dataset(
    "truthful_qa",
    name="multiple_choice",
    split="validation",
)

# First five examples, kept as a small debugging slice
debug_dataset = test_dataset.select(range(5))
print(debug_dataset[0])

# Shuffle the answer options and their labels with one shared permutation
def randomize_answers(answers, labels):
    """Return ``(shuffled_answers, shuffled_labels)`` as two new lists.

    One random permutation of the answer positions is drawn and applied to
    both sequences, so each answer keeps its own label.
    """
    order = list(range(len(answers)))
    random.shuffle(order)
    return [answers[pos] for pos in order], [labels[pos] for pos in order]


def randomize_example(x):
    """Shuffle the ``mc1_targets`` choices and labels of one example in place.

    The same example object is returned so the function can be passed to
    ``Dataset.map``.
    """
    targets = x['mc1_targets']
    targets['choices'], targets['labels'] = randomize_answers(
        targets['choices'], targets['labels']
    )
    return x

In [None]:
import re

# Look up the token-level confidence of the answer letter in the output
def extract_internal_confidence(output_answer, token_probs):
    """Return the rounded probability of the predicted answer letter.

    The predicted letter is the first stand-alone capital letter in
    ``output_answer``. Its value is read from ``token_probs`` (a mapping from
    token text to a number); 0 is used when no letter is found or the letter
    has no entry.
    """
    letter_match = re.search(r"\b([A-Z])\b", output_answer.strip())
    predicted_answer = letter_match.group(1) if letter_match else None
    probability = token_probs.get(predicted_answer, 0)
    return round(probability)

# Read the confidence the model stated in words from its output
def extract_verbalised_confidence(output_answer):
    """Return the first ``<1-3 digits>%`` value in ``output_answer`` as an int.

    Returns 0 when the text contains no such percentage.
    """
    percent_match = re.search(r"(\d{1,3})%", output_answer)
    if percent_match is None:
        return 0
    return int(percent_match.group(1))

In [None]:
import string 

# Shuffle the answer options of every example, then keep the first 50 examples
test_dataset = test_dataset.map(randomize_example).select(range(50))

# Evaluation prompts: the question on the first line, followed by the answer
# options labelled "A.", "B.", ... on one space-separated line
eval_prompts = []

for example in test_dataset:
    option_texts = example['mc1_targets']['choices']
    lettered_options = [
        f"{letter}. {text}"
        for letter, text in zip(string.ascii_uppercase, option_texts)
    ]
    joined_options = " ".join(lettered_options)
    eval_prompts.append(f"{example['question']}\n{joined_options}")

print(f"Completed all {len(eval_prompts)}/{len(test_dataset)}")
print(eval_prompts[0])

In [None]:
# Instruction appended after every evaluation question; it asks for a
# "Guess: <letter>" line and a "Probability: <percent>" line and nothing else
PROMPT = (
    "Provide your best guess (letter only) and the probability that it is "
    "correct (0% to 100%) for the above question. "
    "Give ONLY the guess and probability, no other words or explanation. "
    "For example:\n\n"
    "Guess: <the letter only, as short as possible; not a complete sentence, "
    "just the letter!>\n"
    " Probability: <the probability between 0% and 100% that your guess is "
    "correct, without any extra commentary whatsoever; just the probability!>"
)

def inference_format(instruction):
    """Wrap ``instruction`` as a one-message chat conversation from the user."""
    return [{"role": "user", "content": instruction}]

In [None]:
from torch.utils.data import DataLoader, Dataset
from transformers import TrainerCallback, TrainingArguments
from trl import DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import numpy as np
from scipy.stats import spearmanr

# Define a simple Dataset for prompts
class PromptDataset(Dataset):
    """Map-style dataset that serves the items of ``prompts`` unchanged."""

    def __init__(self, prompts):
        # Indexable collection of prompts; kept by reference, not copied
        self.prompts = prompts

    def __len__(self):
        """Return the number of stored prompts."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return the prompt stored at position ``idx``."""
        return self.prompts[idx]


# Custom callback for evaluation
class EvaluationCallback(TrainerCallback):
    """Compare verbalised and token-level confidence at regular training steps.

    Every ``eval_interval`` steps the model answers each evaluation prompt.
    For each answer the confidence stated in the text and the probability the
    model assigned to its answer-letter token are collected, and every metric
    in ``metrics`` is computed over the two resulting lists and printed.

    Args:
        model: Model used for generation (the one being trained).
        tokenizer: Tokenizer providing the chat template and decoding.
        eval_dataloader: Iterable yielding prompt strings, or batches
            (lists/tuples) of prompt strings such as a ``DataLoader`` over
            strings produces.
        metrics: Mapping of display name to a function
            ``fn(verb_confs, internal_confs)``.
        eval_interval: Number of training steps between two evaluations.
    """

    def __init__(self, model, tokenizer, eval_dataloader, metrics, eval_interval=100):
        self.model = model
        self.tokenizer = tokenizer
        self.eval_dataloader = eval_dataloader
        self.metrics = metrics
        self.eval_interval = eval_interval

    def on_step_end(self, args, state, control, **kwargs):
        """Run an evaluation whenever the global step hits the interval."""
        if state.global_step % self.eval_interval == 0:
            self.evaluate_model(state.global_step)

    def _confidences_for_prompt(self, question):
        """Generate an answer to ``question``; return ``(verbalised, internal)``."""
        messages = inference_format(f"{question}\n{PROMPT}")
        # return_dict=True yields named tensors (input_ids, attention_mask)
        # that can be moved to the device and unpacked into generate()
        encoded = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
            return_dict=True,
        )
        device = getattr(self.model, "device", "cuda")
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        output = self.model.generate(
            **inputs,
            max_new_tokens=50,
            num_return_sequences=1,
            output_logits=True,
            return_dict_in_generate=True,
        )

        # Decoder-only models echo the prompt at the start of the sequence
        if self.model.config.is_encoder_decoder:
            input_length = 1
        else:
            input_length = inputs["input_ids"].shape[1]
        generated_tokens = output.sequences[:, input_length:]

        # Per-step probabilities over the vocabulary; computed in float32 so
        # that half-precision logits can be converted to Python floats
        probs = torch.stack(output.logits, dim=1).float().softmax(-1)

        # Probability the model assigned to each token it actually generated
        gen_probs = torch.gather(probs, 2, generated_tokens[:, :, None]).squeeze(-1)

        token_probs = {}
        for tok, score in zip(generated_tokens[0], gen_probs[0]):
            # Keep the first occurrence of a token text: the answer letter is
            # looked up by the first stand-alone capital letter in the output
            token_probs.setdefault(self.tokenizer.decode(tok).strip(), score.item() * 100)

        gen_output = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

        verb_conf = extract_verbalised_confidence(gen_output)
        internal_conf = extract_internal_confidence(gen_output, token_probs)
        return verb_conf, internal_conf

    def evaluate_model(self, step):
        """Evaluate every prompt, then print the metrics and per-prompt values.

        The model is switched to evaluation mode for the duration of the call
        and its previous mode is restored afterwards, even if generation fails.
        """
        was_training = self.model.training
        self.model.eval()  # Set model to evaluation mode
        verb_confs = []
        internal_confs = []

        try:
            with torch.no_grad():  # Disable gradient computation
                for batch in self.eval_dataloader:
                    # A DataLoader over strings yields a list per batch;
                    # formatting that list directly would put its repr
                    # ("['...']") into the prompt
                    questions = [batch] if isinstance(batch, str) else list(batch)
                    for question in questions:
                        verb_conf, internal_conf = self._confidences_for_prompt(question)
                        verb_confs.append(verb_conf)
                        internal_confs.append(internal_conf)

            results = {
                metric_name: metric_fn(verb_confs, internal_confs)
                for metric_name, metric_fn in self.metrics.items()
            }
        finally:
            self.model.train(was_training)

        print(f"\n--- Evaluation at step {step} ---")
        for key, value in results.items():
            print(f"{key}: {value}")
        print("------------")
        for vc, ic in zip(verb_confs, internal_confs):
            print(f"Verb Conf: {vc} | Internal Conf: {ic}")
            print("------------------------------\n")



def compute_spearman_metric(verb_conf, internal_conf):
    """Return the Spearman rank correlation of the two confidence series.

    The accompanying p-value is discarded.
    """
    return spearmanr(verb_conf, internal_conf)[0]

def compute_std(verb_conf, internal_conf):
    """Return the standard deviation of ``verb_conf - internal_conf``."""
    verbalised = np.array(verb_conf, dtype=float)
    internal = np.array(internal_conf, dtype=float)
    return np.std(verbalised - internal)

def compute_mean_dev(verb_conf, internal_conf):
    """Return the mean absolute difference between the two confidence series."""
    verbalised = np.array(verb_conf, dtype=float)
    internal = np.array(internal_conf, dtype=float)
    absolute_gaps = np.abs(verbalised - internal)
    return np.mean(absolute_gaps)

def compute_std_err(verb_conf, internal_conf):
    """Return the standard deviation of the differences divided by sqrt(n)."""
    verbalised = np.array(verb_conf, dtype=float)
    internal = np.array(internal_conf, dtype=float)
    gaps = verbalised - internal
    sample_count = len(gaps)
    return np.std(gaps) / np.sqrt(sample_count)

# Registry of evaluation metrics: display name -> function called as
# fn(verb_confs, internal_confs); the name is used as the label when the
# evaluation callback prints the result
metrics = {
    "Spearman Rank Correlation": compute_spearman_metric,
    "Standard Deviation": compute_std,
    "Mean Deviation": compute_mean_dev,
    "Standard Error": compute_std_err,
}

In [None]:
# Release cached GPU memory before setting up training
torch.cuda.empty_cache()

# Define training arguments for DPO
training_args = DPOConfig(
    output_dir="Llama-3.2-Instruct_3B-DPO",
    loss_type="ipo",
    bf16=True,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    logging_steps=10,
    save_steps=100,
    dataloader_pin_memory=False,
)

# Training data and DPOTrainer setup
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,  # Offline preference-labeled dataset
    eval_dataset=None,  # Not required since evaluation runs in a custom callback
)

# Dataloader over the evaluation prompts, consumed by the evaluation callback
eval_dataloader = DataLoader(PromptDataset(eval_prompts))

# Attach the custom callback; it evaluates every 100 steps
trainer.add_callback(
    EvaluationCallback(
        model=model,
        tokenizer=tokenizer,
        eval_dataloader=eval_dataloader,
        metrics=metrics,
        eval_interval=100,
    )
)

# Start training
trainer.train()

# Save the trained model together with its tokenizer
save_dir = "Llama_Instruct_DPOtrained"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Trained model has been saved to {save_dir}")

In [None]:
# Upload the trainer's output to the Hugging Face Hub
# NOTE(review): presumably requires a prior Hub login/token; confirm before running
trainer.push_to_hub()