In [1]:
%%capture
# Skip restarting message in Colab
import sys; modules = list(sys.modules.keys())
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None

!pip install unsloth vllm
!pip install --upgrade pillow

In [None]:
from unsloth import FastLanguageModel, PatchFastRL
# Apply Unsloth's "GRPO" RL patch to FastLanguageModel. This runs before the
# model is loaded further down; presumably the patch has to be in place first
# -- TODO confirm against the Unsloth documentation.
PatchFastRL("GRPO", FastLanguageModel)

In [3]:
from unsloth import is_bfloat16_supported

In [None]:
import torch

# Tunables shared by the loader call below.
max_seq_length = 512
lora_rank = 32

# Loader options collected in one place so they can be inspected or tweaked
# before the (expensive) load call.
load_options = dict(
    model_name = "meta-llama/meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True,            # request 4-bit loading of the base weights
    fast_inference = True,          # presumably enables the vLLM-backed fast_generate used later -- confirm
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6,   # presumably the GPU memory fraction for the inference engine -- confirm
)

# Returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

In [5]:
# System message sent with every chat request. It asks the model to answer in
# a <reasoning> section followed by an <answer> section; the "..." lines are
# literal placeholders that are sent verbatim.
# The value deliberately starts and ends with a newline.
SYSTEM_PROMPT = (
    "\n"
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>\n"
)

In [6]:
from google.colab import userdata
from huggingface_hub import login

In [7]:
# Authenticate with the Hugging Face Hub. The token is read from the Colab
# secret named 'niru_hf_read' via google.colab.userdata, so it is never
# written into the notebook source.
login(token=userdata.get('niru_hf_read'))

In [8]:
from huggingface_hub import snapshot_download

# Hub repository that holds the GRPO-trained LoRA adapter files.
model_id = "nirusanan/GRPO-llama3.1-reasoning"

# Download the adapter files into ./llama-grpo_saved_lora.
# `local_dir_use_symlinks` was dropped: it is deprecated in huggingface_hub
# (ignored with a warning, and not accepted by newer releases), and files
# downloaded with `local_dir` are written as real files anyway.
# The call returns the local folder path, which the notebook displays as the
# cell output.
snapshot_download(
    repo_id=model_id,
    local_dir="llama-grpo_saved_lora",
    revision="main",
)

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/814 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

'/content/llama-grpo_saved_lora'

In [10]:
# Load LoRA Adapter
# Uses the same relative directory that snapshot_download wrote to rather
# than the Colab-only absolute path "/content/...", so the cell also works
# when the working directory is not /content.
# NOTE(review): this attaches the adapter to the model object; confirm that
# the vLLM-backed fast_generate path actually applies it (vLLM normally
# expects a lora_request on each generate call).
model.load_adapter("llama-grpo_saved_lora")

In [11]:
# Build the chat-formatted prompt: system instructions plus one user question.
# tokenize=False returns the rendered string; add_generation_prompt=True
# appends the assistant header so the model starts its reply.
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "How many times p occurance in apple"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
from vllm.lora.request import LoRARequest

# NOTE(review): max_tokens (1024) is larger than the max_seq_length (512) the
# model was loaded with; generation is presumably capped by the context
# window -- TODO confirm.
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)

# fast_generate runs through the vLLM engine, which applies a LoRA adapter
# only when one is passed with the request. Without lora_request the reply
# comes from the base model, not the GRPO-trained adapter.
# Arguments: adapter name, integer id (unique per adapter), adapter directory.
# NOTE(review): assumes the adapter was downloaded to ./llama-grpo_saved_lora
# and that the engine was created with LoRA support -- confirm.
lora_request = LoRARequest("grpo_lora", 1, "llama-grpo_saved_lora")

# One prompt in -> first request result -> first completion -> its text.
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = lora_request,
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.65s/it, est. speed input: 14.21 toks/s, output: 17.44 toks/s]


"<analysis>\nThe word 'apple' has 5 letters: a-p-p-l-e. To find the number of times 'p' occurs in 'apple', we need to count the number of times the letter 'p' appears in the word. There is only 1 occurrence of the letter 'p' in the word 'apple'.\n</analysis>\n<answer>\n1\n</answer>"

In [11]:
# Build the chat-formatted prompt: system instructions plus one user question.
# tokenize=False returns the rendered string; add_generation_prompt=True
# appends the assistant header so the model starts its reply.
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "Calculate pi."},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
from vllm.lora.request import LoRARequest

# NOTE(review): max_tokens (1024) is larger than the max_seq_length (512) the
# model was loaded with; generation is presumably capped by the context
# window -- TODO confirm.
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)

# fast_generate runs through the vLLM engine, which applies a LoRA adapter
# only when one is passed with the request. Without lora_request the reply
# comes from the base model, not the GRPO-trained adapter.
# Arguments: adapter name, integer id (unique per adapter), adapter directory.
# NOTE(review): assumes the adapter was downloaded to ./llama-grpo_saved_lora
# and that the engine was created with LoRA support -- confirm.
lora_request = LoRARequest("grpo_lora", 1, "llama-grpo_saved_lora")

# One prompt in -> first request result -> first completion -> its text.
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = lora_request,
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:23<00:00, 23.73s/it, est. speed input: 2.57 toks/s, output: 16.52 toks/s]


'<reasoning>\nCalculating pi is a complex task that involves using various mathematical formulas and algorithms. One of the most well-known formulas is the Leibniz formula for pi, which is an infinite series:\n\nπ = 4 * (1 - 1/3 + 1/5 - 1/7 + 1/9 - ...)\n\nAnother way to calculate pi is by using the Gauss-Legendre algorithm, which is an iterative method that uses arithmetic and geometric means to converge to the value of pi.\n\nFor simplicity, we will use the Bailey-Borwein-Plouffe formula, which is a spigot algorithm that allows for the computation of any binary digit of pi.\n\n</reasoning>\n<answer>\nTo calculate pi, we can use a programming language like Python to implement the Bailey-Borwein-Plouffe formula. Here is a simplified example:\n\n```\ndef calculate_pi(n):\n    pi = 0.0\n    for k in range(n):\n        pi += 1/(16**k)*(\n            4/(8*k+1) -\n            2/(8*k+4) -\n            1/(8*k+5) -\n            1/(8*k+6)\n        )\n    return pi\n\nn = 100\nprint("Approximati

In [12]:
# Build the chat-formatted prompt: system instructions plus one user question.
# tokenize=False returns the rendered string; add_generation_prompt=True
# appends the assistant header so the model starts its reply.
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "At the Bertolli Farm, they grow 2073 tomatoes, 4112 cobs of corn, and 985 onions. How many fewer onions are grown than tomatoes and corn together? "},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
from vllm.lora.request import LoRARequest

# NOTE(review): max_tokens (1024) is larger than the max_seq_length (512) the
# model was loaded with; generation is presumably capped by the context
# window -- TODO confirm.
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)

# fast_generate runs through the vLLM engine, which applies a LoRA adapter
# only when one is passed with the request. Without lora_request the reply
# comes from the base model, not the GRPO-trained adapter.
# Arguments: adapter name, integer id (unique per adapter), adapter directory.
# NOTE(review): assumes the adapter was downloaded to ./llama-grpo_saved_lora
# and that the engine was created with LoRA support -- confirm.
lora_request = LoRARequest("grpo_lora", 1, "llama-grpo_saved_lora")

# One prompt in -> first request result -> first completion -> its text.
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = lora_request,
)[0].outputs[0].text

output

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.85s/it, est. speed input: 16.58 toks/s, output: 17.95 toks/s]


'To find the difference between the total number of tomatoes and corn, we need to add the number of tomatoes and corn together and then subtract the number of onions.\n\nFirst, add the number of tomatoes and corn: 2073 (tomatoes) + 4112 (corn) = 6185.\n\nNow, subtract the number of onions: 6185 (tomatoes and corn) - 985 (onions) = 5200.\n\nThere are 5200 fewer onions grown than tomatoes and corn together.'

To find the difference between the total number of tomatoes and corn, we need to add the number of tomatoes and corn together and then subtract the number of onions.

First, add the number of tomatoes and corn: 2073 (tomatoes) + 4112 (corn) = 6185.

Now, subtract the number of onions: 6185 (tomatoes and corn) - 985 (onions) = 5200.

There are 5200 fewer onions grown than tomatoes and corn together.