In [None]:
!pip install trl transformers datasets peft accelerate



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

data = [
    {"prompt": "What is AI? ", "chosen": "AI means Artificial Intelligence.", "rejected": "AI is a kind of fruit."},
    {"prompt": "Explain Python: ", "chosen": "Python is a programming language.", "rejected": "Python is a type of snake."},
    {"prompt": "Describe climate change: ", "chosen": "Climate change refers to long-term shifts in temperatures and weather patterns.", "rejected": "Climate change is a myth."},
    {"prompt": "What causes rain? ", "chosen": "Rain is caused by water vapor condensing and falling from clouds.", "rejected": "Rain is caused by people crying."},
    {"prompt": "Define photosynthesis: ", "chosen": "Photosynthesis is the process by which plants convert sunlight into energy.", "rejected": "Photosynthesis is a type of dance."},
    {"prompt": "What is reinforcement learning? ", "chosen": "Reinforcement learning is a type of machine learning where agents learn by interacting with an environment.", "rejected": "Reinforcement learning is about reinforcing weak WiFi signals."},
    {"prompt": "What is quantum computing? ", "chosen": "Quantum computing uses quantum bits to perform complex computations more efficiently.", "rejected": "Quantum computing is about computing how many quanta of light are in a room."},
    {"prompt": "Who wrote Hamlet? ", "chosen": "Hamlet was written by William Shakespeare.", "rejected": "Hamlet is a famous soccer player."},
    {"prompt": "Explain gravity: ", "chosen": "Gravity is the force that attracts two bodies towards each other.", "rejected": "Gravity is a made-up concept to explain why things fall."},
    {"prompt": "What is blockchain? ", "chosen": "Blockchain is a decentralized digital ledger technology.", "rejected": "Blockchain is a chain made of blocks of stone."},
]


# Load tokenizer and model
base_model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(base_model_name)
model.resize_token_embeddings(len(tokenizer))
model.train()

# Format data for SFT
sft_data = [
    {"input_ids": tokenizer(d["prompt"] + d["chosen"], truncation=True, padding="max_length", max_length=64, return_tensors="pt")["input_ids"].squeeze(), "labels": tokenizer(d["prompt"] + d["chosen"], truncation=True, padding="max_length", max_length=64, return_tensors="pt")["input_ids"].squeeze()}
    for d in data
]

class SFTDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx): return self.data[idx]

sft_dataset = SFTDataset(sft_data)

training_args = TrainingArguments(
    output_dir="./sft_model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=1,
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=sft_dataset,
    tokenizer=tokenizer
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
1,7.7411
2,5.9307
3,3.4781
4,1.7622
5,1.0917
6,0.696
7,0.3717
8,0.5023
9,0.5103
10,0.6874


TrainOutput(global_step=15, training_loss=1.6571203509966532, metrics={'train_runtime': 3.4852, 'train_samples_per_second': 8.608, 'train_steps_per_second': 4.304, 'total_flos': 979526615040.0, 'train_loss': 1.6571203509966532, 'epoch': 3.0})

In [None]:
model.save_pretrained("./sft_finetuned_model")
tokenizer.save_pretrained("./sft_finetuned_model")

('./sft_finetuned_model/tokenizer_config.json',
 './sft_finetuned_model/special_tokens_map.json',
 './sft_finetuned_model/vocab.json',
 './sft_finetuned_model/merges.txt',
 './sft_finetuned_model/added_tokens.json',
 './sft_finetuned_model/tokenizer.json')

In [None]:
from transformers import AutoModelForSequenceClassification

# RM is a classifier scoring chosen > rejected
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token
reward_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=1)
reward_model.resize_token_embeddings(len(tokenizer))
reward_model.config.pad_token_id = tokenizer.pad_token_id

reward_model.train()

# Prepare RM dataset
rm_data = []
for ex in data:
    rm_data.append({"text": ex["prompt"] + ex["chosen"], "label": 1.0})
    rm_data.append({"text": ex["prompt"] + ex["rejected"], "label": 0.0})

class RMDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx):
        enc = tokenizer(self.data[idx]["text"], truncation=True, padding="max_length", max_length=64, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(self.data[idx]["label"])
        }

rm_dataset = RMDataset(rm_data)

reward_args = TrainingArguments(
    output_dir="./rm_model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=1,
    save_strategy="no",
    report_to="none",
)

reward_trainer = Trainer(
    model=reward_model,
    args=reward_args,
    train_dataset=rm_dataset,
    tokenizer=tokenizer
)

reward_trainer.train()


Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-125M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  reward_trainer = Trainer(


Step,Training Loss
1,4.4084
2,2.0397
3,0.5995
4,1.1548
5,1.8084
6,1.9536
7,0.3074
8,0.2187
9,0.3132
10,2.0468


TrainOutput(global_step=30, training_loss=1.4541328070064385, metrics={'train_runtime': 2.6384, 'train_samples_per_second': 22.741, 'train_steps_per_second': 11.37, 'total_flos': 1959070924800.0, 'train_loss': 1.4541328070064385, 'epoch': 3.0})

In [None]:
reward_model.save_pretrained("./reward_model")
#tokenizer.save_pretrained("./sft_finetuned_model")

In [None]:
from torch.optim import AdamW
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained("./sft_finetuned_model")
model.resize_token_embeddings(len(tokenizer))
model.train()
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-6)

# DPO loss from earlier
def dpo_loss(logits_chosen, logits_rejected, beta=0.1):
    logp_chosen = torch.log_softmax(logits_chosen[:, -1, :], dim=-1)
    logp_rejected = torch.log_softmax(logits_rejected[:, -1, :], dim=-1)
    chosen_token_id = logits_chosen[:, -1, :].argmax(dim=-1)
    rejected_token_id = logits_rejected[:, -1, :].argmax(dim=-1)
    chosen_scores = logp_chosen.gather(1, chosen_token_id.unsqueeze(1))
    rejected_scores = logp_rejected.gather(1, rejected_token_id.unsqueeze(1))
    diff = chosen_scores - rejected_scores
    return -torch.log(torch.sigmoid(beta * diff)).mean()

# DPO Training loop
epochs = 3
for epoch in range(epochs):
    for example in data:
        chosen_enc = tokenizer(example["prompt"] + example["chosen"], return_tensors="pt", truncation=True, padding="max_length", max_length=64)
        rejected_enc = tokenizer(example["prompt"] + example["rejected"], return_tensors="pt", truncation=True, padding="max_length", max_length=64)
        chosen_enc = {k: v.to(device) for k, v in chosen_enc.items()}
        rejected_enc = {k: v.to(device) for k, v in rejected_enc.items()}
        out_chosen = model(**chosen_enc)
        out_rejected = model(**rejected_enc)
        loss = dpo_loss(out_chosen.logits, out_rejected.logits)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"[DPO] Epoch {epoch} | Loss: {loss.item():.4f}")


[DPO] Epoch 0 | Loss: 0.6254
[DPO] Epoch 0 | Loss: 0.7407
[DPO] Epoch 0 | Loss: 0.6532
[DPO] Epoch 0 | Loss: 0.6500
[DPO] Epoch 0 | Loss: 0.5708
[DPO] Epoch 0 | Loss: 0.7446
[DPO] Epoch 0 | Loss: 0.6245
[DPO] Epoch 0 | Loss: 0.6905
[DPO] Epoch 0 | Loss: 0.6895
[DPO] Epoch 0 | Loss: 0.6365
[DPO] Epoch 1 | Loss: 0.6656
[DPO] Epoch 1 | Loss: 0.6451
[DPO] Epoch 1 | Loss: 0.6367
[DPO] Epoch 1 | Loss: 0.6002
[DPO] Epoch 1 | Loss: 0.5412
[DPO] Epoch 1 | Loss: 0.5962
[DPO] Epoch 1 | Loss: 0.5840
[DPO] Epoch 1 | Loss: 0.6877
[DPO] Epoch 1 | Loss: 0.6903
[DPO] Epoch 1 | Loss: 0.5994
[DPO] Epoch 2 | Loss: 0.6065
[DPO] Epoch 2 | Loss: 0.5596
[DPO] Epoch 2 | Loss: 0.6372
[DPO] Epoch 2 | Loss: 0.5745
[DPO] Epoch 2 | Loss: 0.5124
[DPO] Epoch 2 | Loss: 0.6126
[DPO] Epoch 2 | Loss: 0.5593
[DPO] Epoch 2 | Loss: 0.6862
[DPO] Epoch 2 | Loss: 0.6797
[DPO] Epoch 2 | Loss: 0.5950


In [None]:
model.save_pretrained("./fine_tuned_dpo_model")
tokenizer.save_pretrained("./fine_tuned_dpo_model")

('./fine_tuned_dpo_model/tokenizer_config.json',
 './fine_tuned_dpo_model/special_tokens_map.json',
 './fine_tuned_dpo_model/vocab.json',
 './fine_tuned_dpo_model/merges.txt',
 './fine_tuned_dpo_model/added_tokens.json',
 './fine_tuned_dpo_model/tokenizer.json')

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prompt to test
test_prompts = [
    "What is AI?"
]

# Load base model
base_model_name = "./sft_finetuned_model"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name).to(device)
base_model.eval()

# Fix padding if needed
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

# Load fine-tuned model (from your DPO step)
fine_tuned_path = "./fine_tuned_dpo_model"  # Change if you saved elsewhere
ft_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_dpo_model")
ft_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_dpo_model").to(device)
ft_model.eval()

# Function to generate response
def generate_response(model, tokenizer, prompt, max_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Compare outputs
print("\nBase vs Fine-tuned Model Comparison:\n")
for prompt in test_prompts:
    base_response = generate_response(base_model, base_tokenizer, prompt)
    ft_response = generate_response(ft_model, ft_tokenizer, prompt)

    print(f"   Prompt: {prompt}")
    print(f"   🔹 Base Model:       {base_response}")
    print(f"   🔸 Fine-tuned Model: {ft_response}")
    print("-" * 80)



🔍 Base vs Fine-tuned Model Comparison:

🧠 Prompt: What is AI?
   🔹 Base Model:       What is AI? AI is a computer-generated artificial intelligence.
   🔸 Fine-tuned Model: What is AI? AI is a computer vision technology.
--------------------------------------------------------------------------------


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding
from trl import PPOTrainer, PPOConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use a standard public model for testing
model_name_public = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name_public)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

policy_model = AutoModelForCausalLM.from_pretrained(model_name_public).to(device)
ref_model = AutoModelForCausalLM.from_pretrained(model_name_public).to(device)
reward_model = AutoModelForCausalLM.from_pretrained(model_name_public).to(device) # Placeholder

policy_model.train()

ppo_config = PPOConfig(
    batch_size=1,
    learning_rate=5e-6,
    report_to="none"
)

class TokenizedPromptDataset(Dataset):
    def __init__(self, tokenizer, prompts, max_length=64):
        self.tokenizer = tokenizer
        self.prompts = prompts
        self.max_length = max_length
    def __len__(self):
        return len(self.prompts)
    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        encodings = self.tokenizer(
            prompt,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in encodings.items()}
        return item

prompts = ["What is AI?", "Explain Python:", "Describe climate change:"]
train_dataset = TokenizedPromptDataset(tokenizer, prompts)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

ppo_trainer = PPOTrainer(
    ppo_config,
    policy_model,
    ref_model,
    tokenizer,
    reward_model=reward_model,
    train_dataset=train_dataset,
    data_collator=data_collator,
    value_model=ref_model
)

ppo_trainer.train()
print("Training complete.")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

AttributeError: GPT2TokenizerFast has no attribute modules