In [None]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


# Inference with the fine-tuned model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import os

# Base checkpoint on the Hugging Face Hub and the LoRA adapter saved on Drive.
model_name = "mistralai/Mistral-7B-v0.1"
adapter_path = "/content/drive/MyDrive/quiz-lora-adapter"

# Report which accelerator is visible; actual placement is decided by
# device_map="auto" below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 4-bit NF4 quantization with float16 compute, no double quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the base model and tokenizer.
# trust_remote_code is intentionally NOT set: the Mistral architecture is
# implemented inside transformers itself, so there is no reason to allow code
# from the Hub repository to execute locally.
print("Loading base model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place the quantized weights
)

# Attach the LoRA adapter on top of the quantized base model.
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, adapter_path)

# Inference only: switch off dropout and other train-time behaviour.
model.eval()
print("✓ Model loaded successfully!")
print(f"Model: {model_name}")
print(f"Adapter: {adapter_path}")
print(f"Model device: {model.device}")

Using device: cuda
Loading base model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Loading LoRA adapter...
✓ Model loaded successfully!
Model: mistralai/Mistral-7B-v0.1
Adapter: /content/drive/MyDrive/quiz-lora-adapter
Model device: cuda:0


In [None]:
# Fall back to EOS as the pad token when the tokenizer does not define one, so
# generate() has a valid pad id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

prompt = """### Instruction:
Generate a multiple-choice question.

Subject: Machine Learning
Topic: Optimizers
Difficulty: Easy

### Response:
"""

# A single short prompt needs neither truncation nor padding.
encoded = tokenizer(prompt, return_tensors="pt")

# Move every input tensor to the device the model lives on.
inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}

# Remember how many tokens belong to the prompt so they can be stripped from
# the generated sequence afterwards.
prompt_length = inputs["input_ids"].shape[1]

# Sample a completion; gradients are not needed for inference.
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

# generate() returns prompt + completion for decoder-only models; decode only
# the newly generated tokens so the printed response does not echo the prompt.
generated_ids = output_ids[0][prompt_length:]
response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

print("\n=== Model Response ===")
print(response)



=== Model Response ===
### Instruction:
Generate a multiple-choice question.

Subject: Machine Learning
Topic: Optimizers
Difficulty: Easy

### Response:
Question: What is the main difference between a deterministic and a stochastic optimizer?

Options:
A. Deterministic optimizers use randomness, while stochastic optimizers do not
B. Deterministic optimizers are faster, while stochastic optimizers are slower
C. Deterministic optimizers are more accurate, while stochastic optimizers are less accurate
D. Deterministic optimizers are used for classification, while stochastic optimizers are used for regression

Correct Answer: A

Explanation:
The main difference between a deterministic and a stochastic optimizer is that deterministic optimizers use randomness in their search for the optimal parameters, while stochastic optimizers do not.



# Applying RL

In [None]:
pip install -U transformers peft trl accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.18.1-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Downloading trl-0.27.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.57.6-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.1-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.0/557.0 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.27.0-py3-none-any.whl (532 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.5/532.5 kB[0m [31m39.9 MB/s[0m eta [36m0:00:

# Dataset preparation and DPO training

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, LoraConfig
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset

# 1. Load the tokenizer stored next to the adapter and make sure it can pad.
adapter_path = "/content/drive/MyDrive/quiz-lora-adapter"
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
tokenizer.pad_token = tokenizer.eos_token

# 2. Load the base model in 4-bit NF4 (memory efficient).
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)

# 3. Load the existing LoRA adapter in trainable mode; DPO updates these
#    adapter weights while the quantized base stays frozen.
# NOTE(review): with a PeftModel and no explicit ref_model, TRL derives the
# reference log-probs by disabling the adapter, i.e. the reference is the base
# model rather than the fine-tuned one. Confirm this is the intended baseline.
model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=True)


def to_dpo_format(example):
    """Convert one Intel/orca_dpo_pairs row to the columns DPOTrainer expects.

    The dataset stores the prompt as separate `system` and `question` fields;
    DPOTrainer needs a single `prompt` column next to `chosen` and `rejected`.
    The system message is prepended only when it is non-empty.
    """
    system = example["system"]
    question = example["question"]
    prompt = f"{system}\n\n{question}" if system else question
    return {
        "prompt": prompt,
        "chosen": example["chosen"],
        "rejected": example["rejected"],
    }


# 4. Prepare the preference dataset.
# Replace this with your own preference dataset (prompt / chosen / rejected).
raw_dataset = load_dataset("Intel/orca_dpo_pairs", split="train[:10%]")
dataset = raw_dataset.map(to_dpo_format, remove_columns=raw_dataset.column_names)

# 5. Configure DPO training.
# In current TRL releases the DPO-specific knobs (beta, max_length,
# max_prompt_length) are fields of DPOConfig, not DPOTrainer arguments.
training_args = DPOConfig(
    output_dir="./mistral-rl-output",
    per_device_train_batch_size=1,  # keep small for memory
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=1000,
    save_steps=100,
    logging_steps=10,
    fp16=True,
    remove_unused_columns=False,
    gradient_checkpointing=True,  # crucial for memory constraints
    beta=0.1,  # strength of the pull towards the reference policy
    max_length=512,
    max_prompt_length=256,
)

# 6. Initialize the trainer. The tokenizer is passed as `processing_class`,
#    which replaced the former `tokenizer` argument.
dpo_trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

# 7. Start preference training.
dpo_trainer.train()

# 8. Save the DPO-updated adapter.
dpo_trainer.save_model("./mistral-7b-rl-adapter")