In [1]:
import json
import os
import re
import sys
from pathlib import Path

# These variables must be in the environment BEFORE vllm is imported: vLLM
# configures its logger while the package is being imported, so assigning
# VLLM_LOGGING_LEVEL afterwards leaves the log level at its default.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['VLLM_LOGGING_LEVEL'] = 'DEBUG'

from tqdm.auto import tqdm

from vllm import LLM, SamplingParams

# Make <parent of the working directory>/src importable so the project-local
# `utils` package resolves when the notebook runs from its own folder.
sys.path.append(str(Path().resolve().parent / "src"))

from utils import math_eval

In [2]:
def load_math(path="../datasets/MATH", split="train"):
    """Load one split of the MATH dataset as a list of question/answer dicts.

    Reads ``<path>/<split>/dataset.json``, which must hold a JSON object with
    two parallel lists under the keys ``question`` and ``extracted_answers``.

    Args:
        path: Root directory of the dataset.
        split: Sub-directory (split name) to read from.

    Returns:
        A list of ``{'question': ..., 'answer': ...}`` dicts, one per pair.
        Pairs are formed with ``zip``, so the result is as long as the shorter
        of the two lists.

    Raises:
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If either expected key is missing.
    """
    # Decode explicitly as UTF-8 rather than the platform default so the
    # problem text loads identically on every machine.
    with open(os.path.join(path, split, "dataset.json"), encoding="utf-8") as f:
        data = json.load(f)

    examples = [{
        'question': q,
        'answer': a,
    } for q, a in zip(data['question'], data['extracted_answers'])]

    return examples

In [3]:
# Load the MATH test split with load_math's default dataset root; the bare
# expression on the last line displays the number of examples.
data = load_math(split='test')
len(data)

5000

In [4]:
# Build the vLLM engine used by every generation call in this notebook.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    # Resolve the Hugging Face hub cache under the current user's home
    # directory instead of hard-coding one user's absolute path, so the
    # notebook runs unchanged on other accounts and machines.
    download_dir=os.path.expanduser("~/.cache/huggingface/hub"),
    max_model_len=20000,
    gpu_memory_utilization=0.99,
    max_num_seqs=10,
)

INFO 11-25 09:17:46 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 11-25 09:17:46 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=20000, download_dir='/home/amittur/.cache/huggingface/hub', load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 11-25 09:17:51 model_runner.py:1077] Loading model weights took 14.9888 GB
INFO 11-25 09:17:53 worker.py:232] Memory profiling results: total_gpu_memory=39.50GiB initial_memory_usage=15.49GiB peak_torch_memory=17.06GiB memory_usage_post_profile=15.51GiB non_torch_memory=0.51GiB kv_cache_size=21.53GiB gpu_memory_utilization=0.99
INFO 11-25 09:17:53 gpu_executor.py:113] # GPU blocks: 11023, # CPU blocks: 2048
INFO 11-25 09:17:53 gpu_executor.py:117] Maximum concurrency for 20000 tokens per request: 8.82x
INFO 11-25 09:17:56 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-25 09:17:56 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
I

In [5]:
def get_solve_prompts(data):
    """Build one zero-shot solve prompt per problem.

    Args:
        data: Iterable of dicts, each with a ``'question'`` entry.

    Returns:
        A list of prompt strings in the same order as ``data``. Each prompt
        describes the Question / Reasoning / Answer output format and ends
        with the problem's own ``Question:`` line.
    """
    # Assembled line by line so the trailing space after the first
    # placeholder line stays visible and is not lost to editor trimming.
    template = (
        'Answer the math problem in the format shown below. End your response with "<|eot_id|>".\n'
        '\n'
        '---\n'
        'Question: <you will be given a math question> \n'
        'Reasoning: <your step by step reasoning for the answer>\n'
        'Answer: <your final answer only>\n'
        '---\n'
        '\n'
        'Question: {}\n'
    )

    prompts = []
    for item in data:
        prompts.append(template.format(item['question']))
    return prompts


def get_n_shot_cot_solve_prompts(data, synth_data):
    """Build one few-shot solve prompt per problem.

    Args:
        data: Sequence of dicts, each with a ``'question'`` entry.
        synth_data: Indexable collection aligned with ``data``;
            ``synth_data[i]`` is an iterable of dicts with ``'question'`` and
            ``'answer'`` entries, used as worked examples for ``data[i]``.

    Returns:
        A list of prompt strings in the same order as ``data``. Each prompt
        lists the examples, describes the Question / Reasoning / Answer
        output format, and ends with the problem's own ``Question:`` line.
    """
    # Assembled line by line so the trailing spaces that are part of the
    # prompt text stay visible and are not lost to editor trimming.
    template = (
        'Some examples of math problems and their answers, similar to the one you are going to be asked '
        'are provided below. Use them to understand and solve the problem. '
        'End your response with "<|eot_id|>".    \n'
        '\n'
        '### Examples ###\n'
        '{examples}\n'
        '\n'
        '### Output Format ###\n'
        'Question: <the math problem you need to solve> \n'
        'Reasoning: <your step by step reasoning for the answer>\n'
        'Answer: <your final answer only>\n'
        '\n'
        '### Input ###\n'
        'Question: {question}\n'
    )

    def render_examples(shots):
        # One "Question/Answer" pair per shot, separated by a blank line.
        return "\n\n".join(
            "Question: {}\nAnswer: {}".format(shot['question'], shot['answer'])
            for shot in shots
        )

    prompts = []
    for idx, item in enumerate(data):
        prompts.append(
            template.format(
                examples=render_examples(synth_data[idx]),
                question=item['question'],
            )
        )
    return prompts


def get_generate_prompt(data, n_samples=1):
    """Build one problem-generation prompt per reference problem.

    Args:
        data: Iterable of dicts, each with a ``'question'`` entry.
        n_samples: How many similar problems the prompt asks for.

    Returns:
        A list of prompt strings in the same order as ``data``. Each prompt
        asks for ``n_samples`` new Question / Answer pairs and ends with the
        reference problem's ``Question:`` line.
    """
    template = (
        'Given a reference math problem, generate {n_samples} similar math problems, '
        'along with their answers. End your response with "<|eot_id|>".\n'
        'Use this format for each generated problem:\n'
        'Question: <new math problem>\n'
        'Answer: <answer to the math problem>\n'
        '\n'
        'Question: {question}\n'
    )

    prompts = []
    for item in data:
        prompts.append(template.format(n_samples=n_samples, question=item['question']))
    return prompts

In [6]:
# Decoding settings shared by every llm.generate call in this notebook.
sampling_params = SamplingParams(
    # NOTE(review): presumably the Llama 3.1 end-of-text / end-of-message /
    # end-of-turn token ids - confirm against the tokenizer.
    stop_token_ids=[128001, 128008, 128009],
    # Upper bound on generated tokens per completion.
    max_tokens=5120,
    # Temperature 0, so repeated runs should give the same completions.
    temperature=0,
)

In [7]:
def extract_cot_answer(text):
    """Pull the reasoning and final answer out of a model completion.

    The completion is expected to follow the prompted layout::

        Reasoning: <steps>
        Answer: <final answer>

    Only the FIRST Reasoning/Answer pair is used, and the answer stops at the
    next ``Question:`` label (or the end of the text). This way, a completion
    that keeps going and invents further problems does not contaminate the
    extracted fields. Whitespace after the labels (including newlines) is
    optional. If there is no ``Reasoning:`` label, the first ``Answer:`` is
    still extracted and the reasoning is left empty.

    Args:
        text: Raw completion text.

    Returns:
        ``{'answer': str, 'reasoning': str}``; either value is ``''`` when
        the corresponding part is not found.
    """
    # An answer ends where the next generated problem starts, or at the end.
    answer_end = r"(?=\s*Question:|\Z)"

    match = re.search(
        r"Reasoning:\s*(.*?)\s*Answer:\s*(.*?)" + answer_end, text, re.DOTALL
    )
    if match:
        reasoning, answer = match.group(1), match.group(2)
    else:
        # No reasoning section: fall back to the first bare "Answer:" label.
        answer_only = re.search(r"Answer:\s*(.*?)" + answer_end, text, re.DOTALL)
        reasoning = ''
        answer = answer_only.group(1) if answer_only else ''

    return {
        'answer': answer.strip(),
        'reasoning': reasoning.strip()
    }


def extract_synthetic_data(text):
    """Parse generated ``Question: ... / Answer: ...`` pairs from a completion.

    A pair is a ``Question: `` label, the question text, a newline followed
    by ``Answer: ``, and the answer text up to the next ``Question:`` label
    or the end of the text.

    Args:
        text: Raw completion text.

    Returns:
        A list of ``{'question': str, 'answer': str}`` dicts in order of
        appearance, with surrounding newlines and spaces removed. Empty when
        no pair is found.
    """
    pair_pattern = re.compile(
        r"Question: (.*?)\nAnswer: (.*?)(?=\s*Question:|$)", re.DOTALL
    )

    pairs = []
    for found in pair_pattern.finditer(text):
        question_text, answer_text = found.group(1), found.group(2)
        pairs.append({
            'question': question_text.strip('\n '),
            'answer': answer_text.strip('\n '),
        })
    return pairs

In [8]:
# Sanity check on a single problem: ask for 4 similar problems, show the raw
# prompt and completion, then display the parsed question/answer pairs.
q_idx = 6

outputs = llm.generate(
    get_generate_prompt([data[q_idx]], n_samples=4),
    sampling_params,
)

synth_data = extract_synthetic_data(outputs[0].outputs[0].text)
print(outputs[0].prompt, outputs[0].outputs[0].text)
synth_data

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.30s/it, est. speed input: 31.64 toks/s, output: 69.32 toks/s]

Given a reference math problem, generate 4 similar math problems, along with their answers. End your response with "<|eot_id|>".
Use this format for each generated problem:
Question: <new math problem>
Answer: <answer to the math problem>

Question: I have a spinner that lands on 1 with a probability of $\frac{1}{10}$, 2 with a probability of $\frac{2}{10}$, 3 with a probability of $\frac{3}{10}$, and 4 with a probability of $\frac{4}{10}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
 Answer: $\frac{1}{5}$

Question: I have a spinner that lands on 1 with a probability of $\frac{1}{8}$, 2 with a probability of $\frac{2}{8}$, 3 with a probability of $\frac{3}{8}$, and 4 with a probability of $\frac{4}{8}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{4}$

Question: I have a spinner that lands on 1 with a probability of $\frac{1}{12}$, 2 with a probability of $\frac{2}{




[{'question': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{8}$, 2 with a probability of $\\frac{2}{8}$, 3 with a probability of $\\frac{3}{8}$, and 4 with a probability of $\\frac{4}{8}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?',
  'answer': '$\\frac{1}{4}$'},
 {'question': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{12}$, 2 with a probability of $\\frac{2}{12}$, 3 with a probability of $\\frac{3}{12}$, and 4 with a probability of $\\frac{4}{12}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?',
  'answer': '$\\frac{1}{6}$'},
 {'question': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{16}$, 2 with a probability of $\\frac{2}{16}$, 3 with a probability of $\\frac{3}{16}$, and 4 with a probability of $\\frac{4}{16}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?',
  'ans

In [9]:
# Show the few-shot prompt built for the sanity-check problem, using the
# pairs parsed in the sanity-check cell as its examples.
print(get_n_shot_cot_solve_prompts([data[q_idx]], [synth_data])[0])

Some examples of math problems and their answers, similar to the one you are going to be asked are provided below. Use them to understand and solve the problem. End your response with "<|eot_id|>".    

### Examples ###
Question: I have a spinner that lands on 1 with a probability of $\frac{1}{8}$, 2 with a probability of $\frac{2}{8}$, 3 with a probability of $\frac{3}{8}$, and 4 with a probability of $\frac{4}{8}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{4}$

Question: I have a spinner that lands on 1 with a probability of $\frac{1}{12}$, 2 with a probability of $\frac{2}{12}$, 3 with a probability of $\frac{3}{12}$, and 4 with a probability of $\frac{4}{12}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{6}$

Question: I have a spinner that lands on 1 with a probability of $\frac{1}{16}$, 2 with a probability of $\frac{2}{16}$, 3 with a probab

In [10]:
# Solve the sanity-check problem with its few-shot prompt, show the raw
# prompt and completion, then display the parsed answer and reasoning.
outputs = llm.generate(
    get_n_shot_cot_solve_prompts([data[q_idx]], [synth_data]),
    sampling_params,
)

result = extract_cot_answer(outputs[0].outputs[0].text)
print(outputs[0].prompt, outputs[0].outputs[0].text)
result

Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it, est. speed input: 111.93 toks/s, output: 69.16 toks/s]

Some examples of math problems and their answers, similar to the one you are going to be asked are provided below. Use them to understand and solve the problem. End your response with "<|eot_id|>".    

### Examples ###
Question: I have a spinner that lands on 1 with a probability of $\frac{1}{8}$, 2 with a probability of $\frac{2}{8}$, 3 with a probability of $\frac{3}{8}$, and 4 with a probability of $\frac{4}{8}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{4}$

Question: I have a spinner that lands on 1 with a probability of $\frac{1}{12}$, 2 with a probability of $\frac{2}{12}$, 3 with a probability of $\frac{3}{12}$, and 4 with a probability of $\frac{4}{12}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{6}$

Question: I have a spinner that lands on 1 with a probability of $\frac{1}{16}$, 2 with a probability of $\frac{2}{16}$, 3 with a probab




{'answer': '$\\frac{3}{10}$',
 'reasoning': 'To find the probability that Phil and Sarah get the same number, we need to find the probability of each possible outcome where they get the same number and add them up. The possible outcomes are: both land on 1, both land on 2, both land on 3, and both land on 4. The probability of each outcome is the product of the individual probabilities of each event. So, the probability of both landing on 1 is $\\frac{1}{10} * \\frac{1}{10} = \\frac{1}{100}$, the probability of both landing on 2 is $\\frac{2}{10} * \\frac{2}{10} = \\frac{4}{100}$, the probability of both landing on 3 is $\\frac{3}{10} * \\frac{3}{10} = \\frac{9}{100}$, and the probability of both landing on 4 is $\\frac{4}{10} * \\frac{4}{10} = \\frac{16}{100}$. Adding these probabilities together, we get $\\frac{1}{100} + \\frac{4}{100} + \\frac{9}{100} + \\frac{16}{100} = \\frac{30}{100} = \\frac{3}{10}$.'}

In [11]:
# Display the dataset entry (question and reference answer) for the
# sanity-check problem, for comparison with the parsed model answer.
data[q_idx]

{'question': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{10}$, 2 with a probability of $\\frac{2}{10}$, 3 with a probability of $\\frac{3}{10}$, and 4 with a probability of $\\frac{4}{10}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?',
 'answer': '\\dfrac{3}{10}'}

In [12]:
# Check eval: score the parsed sanity-check answer against the dataset entry
# with the project's math_eval helper.

math_eval.process_results(data[q_idx], result['answer'])

{'exact_match': 1}

In [13]:
# Build one generation prompt per loaded problem, each asking for 3 similar
# problems. The commented line is a 5-problem variant for quick experiments.
# generate_prompts = get_generate_prompt(data[:5], 3)
generate_prompts = get_generate_prompt(data, 3)

In [13]:
# Number of prompts per llm.generate call in the batched loops of this notebook.
BATCH_SIZE = 64

# Disabled: batched synthetic-data generation over generate_prompts.
# NOTE(review): presumably this loop produced the synth_data.json file that
# a later cell loads - confirm before deleting it.
# synth_data = []

# for i in tqdm(range(0, len(generate_prompts), BATCH_SIZE)):
#     outputs = llm.generate(generate_prompts[i:i+BATCH_SIZE], sampling_params)
#     for output in outputs:
#         generated_text = output.outputs[0].text
#         synth_data.append(extract_synthetic_data(generated_text))

In [14]:
# Load the stored synthetic examples. This rebinds `synth_data`, replacing
# the single-problem list created in the sanity-check cell.
with open('../src/synth_data.json', mode='r') as synth_file:
    synth_data = json.loads(synth_file.read())

In [15]:
# Build one few-shot solve prompt per loaded problem; synth_data[i] supplies
# the examples for data[i]. The commented line is a 5-problem variant for
# quick experiments.
# solve_n_shot_prompts = get_n_shot_cot_solve_prompts(data[:5], synth_data)
solve_n_shot_prompts = get_n_shot_cot_solve_prompts(data, synth_data)

In [16]:
# Solve every few-shot prompt in one llm.generate call (no manual batching)
# and parse each completion. `answers` is reset first, so an interrupted
# generate leaves an empty list instead of results from an earlier run.
answers = []

outputs = llm.generate(solve_n_shot_prompts, sampling_params)
answers.extend(
    extract_cot_answer(completion.outputs[0].text) for completion in outputs
)

Processed prompts:   2%|▏         | 99/5000 [00:37<38:12,  2.14it/s, est. speed input: 864.36 toks/s, output: 435.53 toks/s]  

KeyboardInterrupt: 

In [None]:
# Write the parsed answers to a JSON file in the current working directory.
with open("answers_3_shot_synth.json", mode="w") as answers_file:
    answers_file.write(json.dumps(answers))