In [None]:
# standard library
import os
import random
from pathlib import Path

# training model (unsloth stays ahead of the other ML imports, as in the original order)
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# plotting and saving models
import matplotlib.pyplot as plt
# import wandb # we can start saving models inside wandb when they get good enough
import numpy as np
import pandas as pd

In [None]:
# Base model
BASE_HUGGINGFACE_DIRECTORY = "unsloth"
# BASE_MODEL = "Qwen3-4B"
BASE_MODEL = "Qwen3-14B-unsloth-bnb-4bit"
MAX_SEQ_LENGTH = 4096 # sequence length handed to FastLanguageModel.from_pretrained
# NOTE(review): MAX_PROMPT_LENGTH + MAX_COMPLETION_LENGTH below add up to 8192,
# twice MAX_SEQ_LENGTH — confirm the prompt/completion budget fits the model window.

# Dataset
DATASET_PATH = "../datasets/dataset.jsonl"
TRAINSET_SIZE = 4
TESTSET_SIZE = 2

# Training
LORA_RANK = 16
GPU_MEMORY_UTILIZATION = 0.3
MAX_PROMPT_LENGTH = 4096
MAX_COMPLETION_LENGTH = 4096
EPOCHS = 10
USE_VLLM = True

# Saving model
from huggingface_hub import login
# The token is read from the environment so it never lands in the notebook.
# os.getenv returns None when the variable is unset; login() is still called
# with that value, exactly as before.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
login(HUGGINGFACE_TOKEN)

SAVE_HUGGINGFACE_DIRECTORY = "RaghaRao314159"
# SAVE_MODEL_NAME = "GRPO-Qwen3-4B"
SAVE_MODEL_NAME = "Qwen3-14B-unsloth-bnb-4bit"
# Derived from EPOCHS so the label used in the log filename and the hub repo
# name cannot drift from the actual number of training epochs.
EXPERIMENT_DESCRIPTION = f"all_rewards_epoch_{EPOCHS}"

In [None]:
# Load the quantised base checkpoint and its tokenizer.
base_model_id = f"{BASE_HUGGINGFACE_DIRECTORY}/{BASE_MODEL}"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_id,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,  # set to False for 16-bit LoRA
    fast_inference=USE_VLLM,  # vLLM-backed generation
    max_lora_rank=LORA_RANK,  # 8, 16, 32, ... (larger rank uses more memory)
    gpu_memory_utilization=GPU_MEMORY_UTILIZATION,  # lower this if out of memory
)

# Optional: disable "think" by monkey-patching the tokenizer's chat template.
# orig_apply = tokenizer.apply_chat_template
# def apply_no_think(messages, **kwargs):
#     return orig_apply(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=False,      # hard-disable chain-of-thought
#         **{k: v for k, v in kwargs.items() if k not in ("tokenize","add_generation_prompt")}
#     )
# tokenizer.apply_chat_template = apply_no_think

# Attention and MLP projections that receive LoRA adapters
# (drop the q/k/v/o entries if memory is tight).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,  # any value > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    # LoRA scales delta-W by alpha/rank; alpha == rank keeps that ratio at 1,
    # so changing the rank does not require retuning.
    lora_alpha=LORA_RANK,
    use_gradient_checkpointing="unsloth",  # enables long-context finetuning
    # use_gradient_checkpointing=False,  # faster training, more memory
    random_state=3407,
)

In [None]:

from datasets import load_dataset

# Expected JSONL schema, one conversation per line:
# {"conversations": [{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}]}
# {"conversations": [{"role": "user", "content": prompt2}, {"role": "assistant", "content": answer2}]} ...
dataset_full = load_dataset("json", data_files=DATASET_PATH, split="train")

# Keep enough rows for BOTH splits: the later train_test_split(test_size=TESTSET_SIZE)
# removes TESTSET_SIZE rows, so selecting only TRAINSET_SIZE would leave fewer
# training rows than the constant promises. The count is clamped to the dataset
# length so a short file does not raise an IndexError in select().
n_rows_wanted = min(TRAINSET_SIZE + TESTSET_SIZE, len(dataset_full))
dataset_original = dataset_full.select(range(n_rows_wanted))

# Text prepended to every user prompt (plain string; there is nothing to interpolate).
SYSTEM_INSTRUCTION = """
You are a genius.
"""

# def formatting_prompts_func(example):
#     # if number of tokens exceed max prompt length, do not include the prompt
#     if len(tokenizer.encode(example["conversations"][0]["content"])) < 10000:
#         return {
#             "prompt": [
#                 {"role": "user", "content": SYSTEM_INSTRUCTION + "\n" + example["conversations"][0]["content"]}
#             ],
#             "answer": example["conversations"][1]["content"]
#         }
#     else:
#         # do not add this row to the new dataset object
#         return None

def formatting_prompts_func(example):
    """Turn one raw conversation row into a prompt/answer record.

    The content of the first turn, prefixed with SYSTEM_INSTRUCTION and a
    newline, becomes a single user message under "prompt". The content of
    the second turn is returned unchanged under "answer". No length
    filtering is performed here.
    """
    turns = example["conversations"]
    user_message = {
        "role": "user",
        "content": SYSTEM_INSTRUCTION + "\n" + turns[0]["content"],
    }
    return {"prompt": [user_message], "answer": turns[1]["content"]}

def is_short_enough(example, margin=150):
    """Return True when both turns of a conversation fit their token budgets.

    The first turn's content must tokenize to fewer than
    MAX_PROMPT_LENGTH - margin tokens and the second turn's content to fewer
    than MAX_COMPLETION_LENGTH - margin tokens. The limits come from the
    config constants instead of a repeated literal, so changing the config
    keeps this filter in sync.

    Args:
        example: dataset row with a "conversations" list whose first two
            entries each carry a "content" string.
        margin: tokens of headroom subtracted from each limit (default 150,
            the value previously hard-coded).
            # presumably headroom for the system instruction / chat template — confirm
    """
    turns = example["conversations"]
    prompt_tokens = len(tokenizer.encode(turns[0]["content"]))
    if prompt_tokens >= MAX_PROMPT_LENGTH - margin:
        # Prompt already too long; skip tokenizing the answer.
        return False
    answer_tokens = len(tokenizer.encode(turns[1]["content"]))
    return answer_tokens < MAX_COMPLETION_LENGTH - margin
# Drop over-long conversations, then carve out the held-out split.
filtered_dataset = dataset_original.filter(is_short_enough)
dataset_split = filtered_dataset.train_test_split(
    test_size=TESTSET_SIZE,
    seed=42,
)

# Replace the raw "conversations" column with "prompt" / "answer" columns.
dataset_mapped = dataset_split.map(
    formatting_prompts_func,
    remove_columns=["conversations"],
)

print("\nOriginal dataset\n", dataset_original)
print("\nMapped dataset\n", dataset_mapped)

In [None]:
def reward_fn(prompts=None, completions=None, **kwargs):
    """Placeholder reward: score every completion with a neutral 0.0.

    The trainer invokes reward functions with keyword arguments (prompts,
    completions, plus extra dataset columns) and expects one float per
    completion. The previous zero-argument stub returning None could not be
    called that way.

    TODO: replace the constant with a real scoring rule; a constant reward
    carries no learning signal.

    Args:
        prompts: prompts that produced the completions (unused here).
        completions: generated completions to score.
        **kwargs: any additional columns forwarded by the trainer (unused).

    Returns:
        A list of floats with the same length as ``completions``; an empty
        list when no completions are given.
    """
    if completions is None:
        return []
    return [0.0] * len(completions)

In [None]:
training_args = GRPOConfig(
    # --- generation ---
    use_vllm=USE_VLLM,  # generate rollouts through vLLM
    num_generations=8,  # completions sampled per prompt to estimate the advantage
    max_prompt_length=MAX_PROMPT_LENGTH,
    max_completion_length=MAX_COMPLETION_LENGTH,
    # --- optimiser ---
    optim="adamw_8bit",  # 8-bit AdamW
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    max_grad_norm=0.1,  # gradient-norm clipping threshold
    # --- schedule ---
    warmup_ratio=0.1,  # learning rate ramps up over the first part of training
    lr_scheduler_type="cosine",  # cosine annealing after warmup
    num_train_epochs=EPOCHS,  # passes over the training set
    # max_steps=1,
    # save_steps=250,
    # --- batching ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # accumulate 2 batches per optimizer step
    # --- precision ---
    bf16=is_bfloat16_supported(),  # bfloat16 when the hardware supports it
    fp16=not is_bfloat16_supported(),  # otherwise fall back to fp16
    # --- logging / output ---
    logging_steps=1,  # log every step
    report_to="none",  # can be switched to Weights & Biases
    output_dir="outputs",
    loss_type="bnpo",  # or "grpo" or "dr_grpo"
)

In [None]:
# Build the GRPO trainer on the training split only; the held-out split is
# not passed in.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_fn],
    args=training_args,
    train_dataset=dataset_mapped["train"],
    # eval_dataset=dataset_mapped["test"],
)

# wandb.init(project=WEIGHTS_AND_BIASES_PROJECT_NAME,
#            name=f"{SAVE_MODEL_NAME}_rank{LORA_RANK}_epochs{EPOCHS}_{EXPERIMENT_DESCRIPTION}")

# Run training.
trainer.train()

In [None]:
# Collect the trainer's log history and persist it as a CSV.
logs = trainer.state.log_history
df = pd.DataFrame(logs)

log_path = Path("../logs") / f"{SAVE_MODEL_NAME}_rank{LORA_RANK}_{EXPERIMENT_DESCRIPTION}.csv"
# to_csv does not create missing directories; make sure the target folder
# exists so a finished training run is not lost to an OSError here.
log_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(log_path, index=False)

In [None]:
# Merge the LoRA weights into the base model and upload the 16-bit result.
hub_repo_id = f"{SAVE_HUGGINGFACE_DIRECTORY}/{SAVE_MODEL_NAME}_rank{LORA_RANK}_{EXPERIMENT_DESCRIPTION}"
model.push_to_hub_merged(
    hub_repo_id,
    tokenizer,
    save_method="merged_16bit",
    token=HUGGINGFACE_TOKEN,
)
