In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121  # for GPU (CUDA 12.1)
# or: !pip install torch  # if you’re on CPU only

!pip install -q transformers
!pip install -q datasets
!pip install -q huggingface_hub
!pip install -q peft
!pip install -q trl
!pip install -q accelerate
!pip install -q bitsandbytes
!pip install -q numpy pandas

In [3]:
import pandas as pd
from datasets import load_dataset, Dataset
from huggingface_hub import login
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)
from trl import DPOTrainer

In [4]:
# --- Parameters to set ---


MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct" 


HF_TOKEN = "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"


ALL_DATASET_NAMES = [
    "panda",
    "jigsaw",
    "biasdpo",
    "detoxdpo",
    "biassft",
    "detoxsft",
]

# Random seed for reproducibility
SEED = 42

In [5]:
login(HF_TOKEN)

In [6]:
# This will be defined after the tokenizer is loaded
tokenizer = None
context_length = 512

def tokenize_panda(element):
    outputs = tokenizer(
        element["perturbed"],
        truncation=True,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for input_ids in outputs["input_ids"]:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}


def tokenize_jigsaw(element):
    outputs = tokenizer(
        element["comment_text"],
        truncation=True,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for input_ids in outputs["input_ids"]:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}


def tokenize_sft(element):
    text = [
        element["prompt"][i] + " " + element["chosen"][i]
        for i in range(len(element["prompt"]))
    ]
    outputs = tokenizer(
        text,
        truncation=True,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for input_ids in outputs["input_ids"]:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [7]:
# QLoRA configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules="all-linear",
    bias="none",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
if "gemma" in MODEL_NAME:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        attn_implementation="eager",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
    )

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
model.config.pretraining_tp = 1
model.enable_input_require_grads()

print("✅ MODEL LOADED")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

✅ MODEL LOADED


In [22]:
def tokenize_panda(element):
    outputs = tokenizer(
        element["perturbed"],
        truncation=True,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
    )
    return {"input_ids": outputs["input_ids"]}

def tokenize_jigsaw(element):
     # ensure text is always a string, handle NaNs
    texts = [str(x) if x is not None else "" for x in element["comment_text"]]

    outputs = tokenizer(
        texts,
        truncation=True,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
    )

    return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]}

def tokenize_sft(element):
    text = [element["prompt"][i] + " " + element["chosen"][i] for i in range(len(element["prompt"]))]
    outputs = tokenizer(
        text,
        truncation=True,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
    )
    return {"input_ids": outputs["input_ids"]}


In [9]:
def train_model(model, trainer, output_dir):
    print(f"🚀 Starting training for {output_dir}...")
    trainer.train()
    final_model = trainer.model.merge_and_unload()
    final_model.save_pretrained(output_dir)
    print(f"✅ Model saved at: {output_dir}\n\n")

In [31]:
def load_base_model():
    if "gemma" in MODEL_NAME:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            attn_implementation="eager",
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
        )
    model = prepare_model_for_kbit_training(model)
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    model.enable_input_require_grads()
    return model

In [28]:
# -------------------
# Helper: Save trained model (try merge_and_unload then fallback)
# -------------------
import os
def safe_save_trained_model(trainer, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    try:
        # many PEFT workflows have merge_and_unload()
        model = trainer.model
        if hasattr(model, "merge_and_unload"):
            merged = model.merge_and_unload()
            merged.save_pretrained(output_dir)
        else:
            # fallback
            trainer.model.save_pretrained(output_dir)
    except Exception as e:
        # final fallback: try to unwrap or just save weights
        try:
            trainer.model.save_pretrained(output_dir)
        except Exception as e2:
            raise RuntimeError(f"Could not save model: {e} / {e2}")

# -------------------
# Small utility wrapper to train and save
# -------------------
def train_and_save(trainer, out_dir):
    print(f"\n--- START TRAIN: {out_dir} ---")
    trainer.train()
    safe_save_trained_model(trainer, out_dir)
    print(f"✅ Saved to {out_dir}")

    # cleanup
    del trainer
    try:
        del model
    except:
        pass
    torch.cuda.empty_cache()
    import gc; gc.collect()
    print(f"🧹 Freed memory for {out_dir}\n")

In [26]:
# -------------------
# Helper: Save trained model (try merge_and_unload then fallback)
# -------------------
def safe_save_trained_model(trainer, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    try:
        # many PEFT workflows have merge_and_unload()
        model = trainer.model
        if hasattr(model, "merge_and_unload"):
            merged = model.merge_and_unload()
            merged.save_pretrained(output_dir)
        else:
            # fallback
            trainer.model.save_pretrained(output_dir)
    except Exception as e:
        # final fallback: try to unwrap or just save weights
        try:
            trainer.model.save_pretrained(output_dir)
        except Exception as e2:
            raise RuntimeError(f"Could not save model: {e} / {e2}")

# -------------------
# Small utility wrapper to train and save
# -------------------
def train_and_save(trainer, out_dir):
    print(f"\n--- START TRAIN: {out_dir} ---")
    trainer.train()
    safe_save_trained_model(trainer, out_dir)
    print(f"✅ Saved to {out_dir}")

    # cleanup
    del trainer
    try:
        del model
    except:
        pass
    torch.cuda.empty_cache()
    import gc; gc.collect()
    print(f"🧹 Freed memory for {out_dir}\n")



# ---------- 1) SFT on Panda (bias mitigation) ----------
print("Preparing PANDA (SFT) ...")
#model = load_base_model()
raw = load_dataset("facebook/panda")

tokenized = raw.map(
    tokenize_panda, batched=True, remove_columns=raw["train"].column_names
)
# for notebooks / quick debugging we use smaller subsets; remove .select for full training
train_ds = tokenized["train"].train_test_split(test_size=0.5, seed=SEED)["train"]
eval_ds = tokenized["validation"].train_test_split(test_size=0.5, seed=SEED)["train"]
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="panda_lora_model",
    num_train_epochs=1,
            save_total_limit=5,
            eval_strategy="steps",
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=150,
            weight_decay=0.001,
            metric_for_best_model="eval_loss",
            fp16=True,
            remove_unused_columns=False,
            logging_steps=500,
            eval_steps=500,
            save_steps=500,
            save_strategy="steps",
            learning_rate=3e-4,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": True},
            load_best_model_at_end=True,
    seed=SEED,
)

model = get_peft_model(model, peft_config)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
)
train_and_save(trainer, "panda_lora_model")

print("Fine tuning done!")



Preparing PANDA (SFT) ...


README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/194M [00:00<?, ?B/s]

valid.jsonl:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/94966 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10551 [00:00<?, ? examples/s]

Map:   0%|          | 0/94966 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [17]:
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p jigsaw_data
!unzip -o jigsaw_data/jigsaw-unintended-bias-in-toxicity-classification.zip -d jigsaw_data


Downloading jigsaw-unintended-bias-in-toxicity-classification.zip to jigsaw_data
 79%|███████████████████████████████▋        | 574M/723M [00:00<00:00, 1.98GB/s]
100%|████████████████████████████████████████| 723M/723M [00:00<00:00, 1.98GB/s]
Archive:  jigsaw_data/jigsaw-unintended-bias-in-toxicity-classification.zip
  inflating: jigsaw_data/all_data.csv  
  inflating: jigsaw_data/identity_individual_annotations.csv  
  inflating: jigsaw_data/sample_submission.csv  
  inflating: jigsaw_data/test.csv    
  inflating: jigsaw_data/test_private_expanded.csv  
  inflating: jigsaw_data/test_public_expanded.csv  
  inflating: jigsaw_data/toxicity_individual_annotations.csv  
  inflating: jigsaw_data/train.csv   


In [27]:
print("Preparing JIGSAW (SFT) ...")

from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Load Jigsaw train.csv only
raw = load_dataset("csv", data_files={"train": "jigsaw_data/train.csv"})

# keep non-toxic comments per the paper (target < 0.1)
raw = raw.filter(lambda ex: ex["target"] < 0.1)

tokenized = raw.map(
    tokenize_jigsaw, batched=True, remove_columns=raw["train"].column_names
)
train_ds = tokenized["train"].select(range(min(5000, len(tokenized["train"]))))
# paper uses test_private_leaderboard as validation
valid_key = "test_private_leaderboard" if "test_private_leaderboard" in tokenized else list(tokenized.keys())[-1]
eval_ds = tokenized[valid_key].select(range(min(1000, len(tokenized[valid_key]))))

training_args = TrainingArguments(
    output_dir="jigsaw_lora_model",
     num_train_epochs=1,
            save_total_limit=5,
            eval_strategy="steps",
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=150,
            weight_decay=0.001,
            metric_for_best_model="eval_loss",
            fp16=True,
            remove_unused_columns=False,
            logging_steps=500,
            eval_steps=500,
            save_steps=500,
            save_strategy="steps",
            learning_rate=3e-4,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": True},
            load_best_model_at_end=True,
    seed=SEED,
)
model = get_peft_model(model, peft_config)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
)
train_and_save(trainer, "jigsaw_lora_model")


Preparing JIGSAW (SFT) ...


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.



--- START TRAIN: jigsaw_lora_model ---


Step,Training Loss,Validation Loss
500,3.1017,3.021605
1000,3.0189,2.824741


NameError: name 'os' is not defined

In [35]:
model = load_base_model()
model = get_peft_model(model, peft_config)   # create PEFT adapters now

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [43]:
# ---------- 3) DPO on BiasDPO (bias mitigation) ----------

import trl
from trl import DPOTrainer, DPOConfig
print("TRL version:", getattr(trl, "__version__", "unknown"))

print("Preparing BiasDPO (DPO) ...")

# 1) load and split dataset (keep 50% as you requested earlier)
raw = load_dataset("ahmedallam/BiasDPO")["train"]
ds = raw.train_test_split(test_size=0.5, seed=SEED)   # keep half
train_d = ds["train"].train_test_split(test_size=0.5, seed=SEED)["train"]  # smaller subset
eval_d  = ds["test"].train_test_split(test_size=0.5, seed=SEED)["train"]

# def tokenize_dpo(batch):
#     prompts = [str(x) if x is not None else "" for x in batch.get("prompt", [])]
#     chosen  = [str(x) if x is not None else "" for x in batch.get("chosen", [])]
#     rejected= [str(x) if x is not None else "" for x in batch.get("rejected", [])]

#     # prompt truncated to prompt length, completions to max_length
#     p = tokenizer(prompts, truncation=True, max_length=128, padding="max_length", return_attention_mask=True)
#     c = tokenizer(chosen,  truncation=True, max_length=512, padding="max_length", return_attention_mask=True)
#     r = tokenizer(rejected,truncation=True, max_length=512, padding="max_length", return_attention_mask=True)

#     return {
#         "prompt_input_ids": p["input_ids"],
#         "prompt_attention_mask": p["attention_mask"],
#         "chosen_input_ids": c["input_ids"],
#         "chosen_attention_mask": c["attention_mask"],
#         "rejected_input_ids": r["input_ids"],
#         "rejected_attention_mask": r["attention_mask"],
#     }

columns_to_remove = [col for col in train_d.column_names if col in ['prompt_input_ids', 'chosen_input_ids', 'rejected_input_ids']]

# 3) apply tokenization (batched)
train_tok = train_d.map(tokenize_dpo, batched=True, remove_columns=columns_to_remove)
eval_tok  = eval_d.map(tokenize_dpo,  batched=True, remove_columns=columns_to_remove)


training_args = DPOConfig(
    output_dir="biasdpo_lora_model",
    num_train_epochs=1,
            save_total_limit=5,
            eval_strategy="steps",
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            warmup_steps=150,
            weight_decay=0.001,
            metric_for_best_model="eval_loss",
            fp16=True,
            remove_unused_columns=False,
            logging_steps=500,
            eval_steps=500,
            save_steps=500,
            save_strategy="steps",
            learning_rate=3e-4,
            gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": True},
            load_best_model_at_end=True,
    seed=SEED,
)
#training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")

training_args.padding_value = tokenizer.pad_token_id
#training_args.model_init_kwargs = {}

dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    processing_class=tokenizer,  # still allowed
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)


# 8) train & save (uses your train_and_save / safe_save_trained_model)
train_and_save(dpo_trainer, "biasdpo_lora_model")



TRL version: 0.23.0
Preparing BiasDPO (DPO) ...


Map:   0%|          | 0/286 [00:00<?, ? examples/s]

Map:   0%|          | 0/286 [00:00<?, ? examples/s]

Extracting prompt in train dataset:   0%|          | 0/286 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/286 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/286 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/286 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/286 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/286 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.



--- START TRAIN: biasdpo_lora_model ---




Step,Training Loss,Validation Loss




✅ Saved to biasdpo_lora_model
🧹 Freed memory for biasdpo_lora_model



In [None]:
# ---------- 4) DPO on DetoxDPO (toxicity mitigation) ----------
print("Preparing DetoxDPO (DPO) ...")
import os
#model = load_base_model()
model = get_peft_model(model, peft_config) 
# load the toxicity_pairwise files (paper uses 6 splits). If not present locally, you should download them.
# For robustness, try up to 6 files and stop if file missing.
seed = 42
#tokenizer = AutoTokenizer.from_pretrained("your-model-name")

# Load the dataset
dataset = load_dataset("DopeorNope/New_DPO_dataset")

# Convert to JSONL and save
for split in dataset.keys():
    dataset[split].to_json(f"toxicity_pairwise/split_{split}.jsonl", orient="records", lines=True)

for split_name, split_data in dataset.items():
    # split_data is a Dataset, which has .to_pandas()
    df = split_data.to_pandas()
    df.to_json(f"toxicity_pairwise/split_{split_name}.jsonl", orient="records", lines=True)
    print(f"✅ Converted split '{split_name}' to JSONL")
    
# Prepare the dataset
#df = dataset.to_pandas()
df = df.rename(columns={"prompt_text": "prompt", "unpert_gen_text": "chosen", "pert_gen_text": "rejected"})
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.3, seed=seed)

train_d = dataset["train"].select(range(min(1000, len(dataset["train"]))))
eval_d = dataset["test"].select(range(min(200, len(dataset["test"]))))

training_args = DPOConfig(
    output_dir="detoxdpo_lora_model",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=100,
    learning_rate=1e-5,
    gradient_checkpointing=True,
    fp16=True,
    seed=SEED,
)

dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    processing_class=tokenizer,  # still allowed
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

train_and_save(dpo_trainer, "detoxdpo_lora_model")

print("🎉 All pipelines finished.")

Preparing DetoxDPO (DPO) ...




Creating json from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

✅ Converted split 'train' to JSONL
✅ Converted split 'validation' to JSONL

--- START TRAIN: detoxdpo_lora_model ---




Step,Training Loss,Validation Loss




In [10]:
# ===============================================================================
#  SFT training on PANDA or Jigsaw
# ===============================================================================
if DATASET_NAME in ["panda", "jigsaw"]:
    if DATASET_NAME == "panda":
        raw_datasets = load_dataset("facebook/panda")
        valid_name = "validation"
        tokenize_func = tokenize_panda
    elif DATASET_NAME == "jigsaw":
        raw_datasets = load_dataset(
            "jigsaw_unintended_bias",
            data_dir="jigsaw-unintended-bias-in-toxicity-classification",
        )
        raw_datasets = raw_datasets.filter(
            lambda example: example["target"] < 0.1
        )
        valid_name = "test_private_leaderboard"
        tokenize_func = tokenize_jigsaw
    
    tokenized_datasets = raw_datasets.map(
        tokenize_func,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    print(f"✅ DATASET '{DATASET_NAME}' LOADED")

# ===============================================================================
# DPO training
# ===============================================================================
elif DATASET_NAME in ["biasdpo", "detoxdpo"]:
    if DATASET_NAME == "biasdpo":
        dataset = load_dataset("ahmedallam/BiasDPO")[
            "train"
        ].train_test_split(test_size=0.05, seed=SEED)
        num_epochs = 20
        n_steps = 25
    elif DATASET_NAME == "detoxdpo":
        # NOTE: This requires local data files
        dataset = pd.concat(
            [
                pd.read_json(
                    f"toxicity_pairwise/split_{i}.jsonl", lines=True
                )[["prompt_text", "unpert_gen_text", "pert_gen_text"]]
                for i in range(6)
            ],
            ignore_index=True,
        ).rename(
            columns={
                "prompt_text": "prompt",
                "unpert_gen_text": "chosen",
                "pert_gen_text": "rejected",
            }
        )
        num_epochs = 1
        n_steps = 100
        dataset = Dataset.from_pandas(dataset).train_test_split(
            test_size=0.05, seed=SEED
        )
    print(f"✅ DATASET '{DATASET_NAME}' LOADED")

# ===============================================================================
# SFT training on DPO data
# ===============================================================================
elif DATASET_NAME in ["biassft", "detoxsft"]:
    if DATASET_NAME == "biassft":
        dataset = load_dataset("ahmedallam/BiasDPO")[
            "train"
        ].train_test_split(test_size=0.05, seed=SEED)
        num_epochs = 20
        n_steps = 25
    elif DATASET_NAME == "detoxsft":
        # NOTE: This requires local data files
        dataset = pd.concat(
            [
                pd.read_json(
                    f"toxicity_pairwise/split_{i}.jsonl", lines=True
                )[["prompt_text", "unpert_gen_text", "pert_gen_text"]]
                for i in range(6)
            ],
            ignore_index=True,
        ).rename(
            columns={
                "prompt_text": "prompt",
                "unpert_gen_text": "chosen",
                "pert_gen_text": "rejected",
            }
        )
        num_epochs = 1
        n_steps = 100
        dataset = Dataset.from_pandas(dataset).train_test_split(
            test_size=0.05, seed=SEED
        )

    tokenized_datasets = dataset.map(
        tokenize_sft,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    print(f"✅ DATASET '{DATASET_NAME}' LOADED")

README.md: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/194M [00:00<?, ?B/s]

valid.jsonl:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/94966 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10551 [00:00<?, ? examples/s]

Map:   0%|          | 0/94966 [00:00<?, ? examples/s]

Map:   0%|          | 0/10551 [00:00<?, ? examples/s]

✅ DATASET 'panda' LOADED


In [None]:
# ===============================================================================
#  SFT Training (panda, jigsaw)
# ===============================================================================
if DATASET_NAME in ["panda", "jigsaw"]:
    training_args = TrainingArguments(
        output_dir=f"{MODEL_NAME}_lora_{DATASET_NAME}",
        num_train_epochs=1,
        save_total_limit=5,
        eval_strategy="steps",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=150,
        weight_decay=0.001,
        metric_for_best_model="eval_loss",
        fp16=True,
        remove_unused_columns=False,
        logging_steps=500,
        eval_steps=500,
        save_steps=500,
        save_strategy="steps",
        learning_rate=3e-4,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": True},
        load_best_model_at_end=True,
        seed=SEED,
    )
    model = get_peft_model(model, peft_config)
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_datasets["train"].select(range(94966)),
        eval_dataset=tokenized_datasets[valid_name].select(range(10551)),
        data_collator=data_collator,
    )
    print("🚀 STARTING SFT TRAINING")
    trainer.train()
    model = model.merge_and_unload()
    model.save_pretrained(f"{MODEL_NAME}_lora_{DATASET_NAME}_model")

# ===============================================================================
#  DPO Training
# ===============================================================================
elif DATASET_NAME in ["biasdpo", "detoxdpo"]:
    training_args = TrainingArguments(
        output_dir=f"{MODEL_NAME}_lora_{DATASET_NAME}",
        optim="rmsprop",
        learning_rate=1e-5,
        save_total_limit=5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        remove_unused_columns=False,
        num_train_epochs=num_epochs,
        metric_for_best_model="eval_loss",
        logging_steps=n_steps,
        save_steps=n_steps,
        fp16=True,
        eval_steps=n_steps,
        eval_strategy="steps",
        lr_scheduler_type="constant_with_warmup",
        save_strategy="steps",
        warmup_steps=150,
        weight_decay=0.05,
        max_grad_norm=10,
        load_best_model_at_end=True,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": True},
        seed=SEED,
    )
    trainer = DPOTrainer(
        model,
        beta=0.1,
        max_prompt_length=128,
        max_length=512,
        peft_config=peft_config,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )
    print("🚀 STARTING DPO TRAINING")
    trainer.train()
    model = trainer.model.merge_and_unload()
    model.save_pretrained(f"{MODEL_NAME}_lora_{DATASET_NAME}_model")
    
# ===============================================================================
#  SFT Training (biassft, detoxsft)
# ===============================================================================
elif DATASET_NAME in ["biassft", "detoxsft"]:
    training_args = TrainingArguments(
        output_dir=f"{MODEL_NAME}_lora_{DATASET_NAME}",
        num_train_epochs=num_epochs,
        save_total_limit=5,
        eval_strategy="steps",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=n_steps,
        weight_decay=0.001,
        metric_for_best_model="eval_loss",
        fp16=True,
        remove_unused_columns=False,
        logging_steps=500,
        eval_steps=500,
        save_steps=500,
        save_strategy="steps",
        learning_rate=3e-4,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": True},
        load_best_model_at_end=True,
        seed=SEED,
    )
    model = get_peft_model(model, peft_config)
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
    )
    print("🚀 STARTING SFT TRAINING")
    trainer.train()
    model = trainer.model.merge_and_unload()
    model.save_pretrained(f"{MODEL_NAME}_lora_{DATASET_NAME}_model")

print("✅ Training complete!")

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


🚀 STARTING SFT TRAINING


<IPython.core.display.Javascript object>