In [1]:
import json
import os

from vllm import LLM, SamplingParams

In [2]:
def load_math(path="../datasets/MATH", split="train"):
    with open(os.path.join(path, split, "dataset.json")) as f:
        data = json.load(f)
    
    examples = [{
        'question': q,
        'answer': a,
    } for q, a in zip(data['question'], data['extracted_answers'])]

    return examples

In [3]:
data = load_math(split='test')
len(data)

5000

In [None]:
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", download_dir="/home/amittur/.cache/huggingface/hub", )

INFO 11-22 19:34:23 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 11-22 19:34:23 config.py:1136] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 11-22 19:34:23 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir='/home/amittur/.cache/huggingface/hub', load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 11-22 19:34:28 model_runner.py:1077] Loading model weights took 14.9888 GB
INFO 11-22 19:34:28 worker.py:232] Memory profiling results: total_gpu_memory=39.50GiB initial_memory_usage=15.49GiB peak_torch_memory=16.16GiB memory_usage_post_profile=15.53GiB non_torch_memory=0.53GiB kv_cache_size=18.85GiB gpu_memory_utilization=0.90
INFO 11-22 19:34:28 gpu_executor.py:113] # GPU blocks: 9650, # CPU blocks: 2048
INFO 11-22 19:34:28 gpu_executor.py:117] Maximum concurrency for 131072 tokens per request: 1.18x
INFO 11-22 19:34:32 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-22 19:34:32 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
I

In [55]:
PROMPT = """
Answer the math question in the format shown below. End your response with "<|eot_id|>".

---
Question: <you will be given a math question> 
Reasoning: <your step by step reasoning for the answer>
Answer: <your final answer only>
---

Question: {}
Reasoning:
"""

def get_prompts(data):
    return [PROMPT.format(d['question']) for d in data]

In [56]:
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=1000,
    stop_token_ids=[128001, 128008, 128009],
)

In [60]:
# Sanity
outputs = llm.generate(PROMPT.format(data[3]['question']), sampling_params)
print(outputs[0].outputs[0].text)

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.13s/it, est. speed input: 45.05 toks/s, output: 71.76 toks/s]

Step 1:  To find a fair price to pay to play the game, we need to consider the expected value of playing the game. This involves calculating the probability of winning a certain amount and then multiplying it by the amount won.
Step 2:  We start by considering the four suits: $\clubsuit$, $\spadesuit$, $\heartsuit$, and $\diamondsuit$. Each suit has 13 cards, and we need to calculate the expected value for each suit separately.
Step 3:  For the $\clubsuit$ suit, we have a $\frac{1}{4}$ chance of drawing an Ace, a $\frac{4}{52}$ chance of drawing a 2 through 10, and a $\frac{3}{52}$ chance of drawing a face card. We calculate the expected value for this suit by multiplying the probability of each outcome by the amount won and summing them up.
Step 4:  Similarly, for the $\spadesuit$ suit, we have a $\frac{1}{4}$ chance of drawing an Ace, a $\frac{4}{52}$ chance of drawing a 2 through 10, and a $\frac{3}{52}$ chance of drawing a face card. We calculate the expected value for this suit by




In [62]:
data_prompts = get_prompts(data)

In [None]:
outputs = llm.generate(data_prompts, sampling_params)
answers = []

for i, output in enumerate(outputs):
    # prompt = output.prompt
    generated_text = output.outputs[0].text

    # Find answer
    answer = generated_text.split("Answer: ")[-1].strip()
    answers.append({"idx": i, "answer": answer})

    print(f"Question Idx: {i}, Answer: {generated_text!r}")

Question Idx: 0, Answer: 'The probability of getting a 6 in at most 2 of the rolls is the same as the probability of getting a 6 in exactly 0 of the rolls, or exactly 1 of the rolls, or exactly 2 of the rolls.  We can calculate the probability of each of these cases and add them together.  The probability of getting a 6 in exactly 0 of the rolls is $\\binom{5}{0}(\\frac{1}{6})^0(\\frac{5}{6})^5$.  The probability of getting a 6 in exactly 1 of the rolls is $\\binom{5}{1}(\\frac{1}{6})^1(\\frac{5}{6})^4$.  The probability of getting a 6 in exactly 2 of the rolls is $\\binom{5}{2}(\\frac{1}{6})^2(\\frac{5}{6})^3$.  We can add these probabilities together to get the probability of getting a 6 in at most 2 of the rolls.  This is equal to $\\binom{5}{0}(\\frac{1}{6})^0(\\frac{5}{6})^5 + \\binom{5}{1}(\\frac{1}{6})^1(\\frac{5}{6})^4 + \\binom{5}{2}(\\frac{1}{6})^2(\\frac{5}{6})^3 = \\frac{3125}{7776}$.\nAnswer: $\\frac{3125}{7776}$'
Question Idx: 1, Answer: 'Step 1:  To find the value of $n$

In [66]:
# Save answers as json
with open("answers.json", "w") as f:
    json.dump(answers, f)