In [1]:
import json
import os
import re

from tqdm.auto import tqdm

from vllm import LLM, SamplingParams

import sys
from pathlib import Path

# Add the 'src' directory to sys.path
sys.path.append(str(Path().resolve().parent / "src"))

# Import math_eval from the utils package
from utils import math_eval

# os.environ ['CUDA_LAUNCH_BLOCKING'] ='1'
# os.environ['VLLM_LOGGING_LEVEL'] = 'DEBUG'
# os.environ['NCCL_P2P_DISABLE'] = '1'

In [2]:
def load_math(path="../datasets/MATH", split="train"):
    """Load one split of the MATH dataset as a list of question/answer records.

    Reads ``<path>/<split>/dataset.json``, which must contain two parallel
    sequences under the keys ``'question'`` and ``'extracted_answers'``.

    Args:
        path: Root directory of the dataset.
        split: Name of the sub-directory holding the split (e.g. ``"train"``
            or ``"test"``).

    Returns:
        A list of ``{'question': ..., 'answer': ...}`` dicts, in file order.

    Raises:
        ValueError: If the question and answer sequences differ in length.
            Pairing them anyway would silently drop or misalign examples.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(os.path.join(path, split, "dataset.json"), encoding="utf-8") as f:
        data = json.load(f)

    questions = data['question']
    answers = data['extracted_answers']
    if len(questions) != len(answers):
        raise ValueError(
            f"dataset.json has {len(questions)} questions but "
            f"{len(answers)} extracted answers"
        )

    return [{'question': q, 'answer': a} for q, a in zip(questions, answers)]

In [3]:
# Load the MATH "test" split via load_math; `data` is the list of
# {'question', 'answer'} records used by every later cell.
data = load_math(split='test')
# Display the number of loaded examples as the cell output.
len(data)

5000

In [4]:
# Instantiate the vLLM engine used for all generation in this notebook.
llm = LLM(
    # Smaller alternative that was used during development:
    # model="meta-llama/Llama-3.1-8B-Instruct",
    model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4",
    tensor_parallel_size=2,
    # Resolve the Hugging Face cache under the current user's home directory
    # instead of hardcoding one person's absolute path.
    download_dir=os.path.expanduser("~/.cache/huggingface/hub"),
    max_model_len=50000,
    # Optional tuning knobs, left at the vLLM defaults:
    # gpu_memory_utilization=0.99,
    # max_num_seqs=10,
)

INFO 11-29 16:04:48 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 11-29 16:04:48 awq_marlin.py:109] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-29 16:04:48 config.py:1020] Defaulting to use mp for distributed inference
INFO 11-29 16:04:48 config.py:1136] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 11-29 16:04:48 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', speculative_config=None, tokenizer='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=50000, download_dir='/home/amittur/.cache/huggingface/hub', load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_

Loading safetensors checkpoint shards:   0% Completed | 0/9 [00:00<?, ?it/s]


INFO 11-29 16:06:33 model_runner.py:1077] Loading model weights took 18.5799 GB
[1;36m(VllmWorkerProcess pid=3837248)[0;0m INFO 11-29 16:06:33 model_runner.py:1077] Loading model weights took 18.5799 GB
[1;36m(VllmWorkerProcess pid=3837248)[0;0m INFO 11-29 16:06:34 worker.py:232] Memory profiling results: total_gpu_memory=44.42GiB initial_memory_usage=19.32GiB peak_torch_memory=18.70GiB memory_usage_post_profile=19.44GiB non_torch_memory=0.85GiB kv_cache_size=20.43GiB gpu_memory_utilization=0.90
INFO 11-29 16:06:34 worker.py:232] Memory profiling results: total_gpu_memory=44.42GiB initial_memory_usage=19.32GiB peak_torch_memory=19.77GiB memory_usage_post_profile=19.46GiB non_torch_memory=0.86GiB kv_cache_size=19.35GiB gpu_memory_utilization=0.90
INFO 11-29 16:06:34 distributed_gpu_executor.py:57] # GPU blocks: 7925, # CPU blocks: 1638
INFO 11-29 16:06:34 distributed_gpu_executor.py:61] Maximum concurrency for 50000 tokens per request: 2.54x
[1;36m(VllmWorkerProcess pid=3837248)[0

In [None]:
def get_solve_prompts(data):
    """Build one zero-shot chain-of-thought solve prompt per record.

    Args:
        data: Iterable of dicts, each with a ``'question'`` key.

    Returns:
        A list of prompt strings, one per record, in input order.
    """
    # Written line by line so the trailing space after the first "Problem:"
    # placeholder (part of the original prompt) stays visible.
    template = (
        'Answer the math problem in the format shown below. '
        'End your response with "<|eot_id|>".\n'
        '\n'
        '---\n'
        'Problem: <you will be given a math problem> \n'
        'Reasoning: <your step by step reasoning for the answer>\n'
        'Answer: <your final answer only>\n'
        '---\n'
        '\n'
        'Problem: {}\n'
    )

    prompts = []
    for record in data:
        prompts.append(template.format(record['question']))
    return prompts


def get_n_shot_cot_solve_prompts(data, synth_data):
    """Build one few-shot chain-of-thought solve prompt per record.

    The examples shown for ``data[i]`` are taken from ``synth_data[i]``; each
    example is rendered as ``Problem: ...`` followed by ``Answer: ...``.

    Args:
        data: Sequence of dicts, each with a ``'question'`` key.
        synth_data: Indexable collection aligned with ``data``; entry ``i`` is
            an iterable of dicts with ``'problem'`` and ``'answer'`` keys.

    Returns:
        A list of prompt strings, one per record in ``data``.
    """
    # Spelled out line by line so the trailing spaces in the original prompt
    # (after the first sentence block and after the "Problem:" placeholder)
    # are preserved exactly.
    template = (
        "Some examples of math problems and their answers, similar to the one "
        "you are going to be asked are provided below. "
        "Use them to understand and solve the problem. "
        'End your response with "<|eot_id|>".    \n'
        "\n"
        "### Examples ###\n"
        "{examples}\n"
        "\n"
        "### Output Format ###\n"
        "Problem: <the math problem you need to solve> \n"
        "Reasoning: <your step by step reasoning for the answer>\n"
        "Answer: <your final answer only>\n"
        "\n"
        "### Input ###\n"
        "Problem: {problem}\n"
    )

    prompts = []
    for idx, record in enumerate(data):
        shots = []
        for example in synth_data[idx]:
            shots.append(f"Problem: {example['problem']}\nAnswer: {example['answer']}")
        prompts.append(
            template.format(examples="\n\n".join(shots), problem=record['question'])
        )
    return prompts


def get_generate_prompt(data, n_samples=1):
    """Build prompts asking the model to write problems similar to each question.

    Args:
        data: Iterable of dicts, each with a ``'question'`` key used as the
            reference problem.
        n_samples: Number of similar problems requested in the prompt text.

    Returns:
        A list of prompt strings, one per record, in input order.
    """
    template = (
        "Given a reference math problem, generate {n_samples} similar math problems, "
        "along with their answers. "
        'End your response with "<|eot_id|>".\n'
        "Use this format for each generated problem:\n"
        "Problem: <new math problem>\n"
        "Answer: <answer to the math problem>\n"
        "\n"
        "Problem: {problem}\n"
    )

    prompts = []
    for record in data:
        prompts.append(template.format(n_samples=n_samples, problem=record['question']))
    return prompts

In [6]:
# Token ids at which generation stops.
# NOTE(review): presumably the Llama 3.1 end-of-text / end-of-message /
# end-of-turn special tokens; confirm against the model's tokenizer.
STOP_TOKEN_IDS = [128001, 128008, 128009]

# temperature=0 with a cap of 5120 new tokens per request.
sampling_params = SamplingParams(temperature=0, max_tokens=5120, stop_token_ids=STOP_TOKEN_IDS)

In [7]:
def extract_cot_answer(text):
    """Pull the reasoning and final answer out of a chain-of-thought completion.

    The expected layout is ``Reasoning: <...>Answer: <...>``. If the text has
    several ``Answer: `` markers, the last one delimits the answer (the
    reasoning capture is greedy). If the ``Reasoning: `` label is missing but
    an ``Answer: `` marker is present, the answer is still recovered and the
    reasoning is left empty, rather than dropping the answer.

    Args:
        text: Raw completion text produced by the model.

    Returns:
        A dict ``{'answer': str, 'reasoning': str}``; both values are ``''``
        when no answer marker is found. Surrounding newlines and spaces are
        stripped from each value.
    """
    answer = reasoning = ''

    match = re.search(r"Reasoning: (.+)Answer: (.+)", text, re.DOTALL)
    if match:
        reasoning = match.group(1).strip('\n ')
        answer = match.group(2).strip('\n ')
    else:
        # Fallback for completions that skip the "Reasoning: " label: the
        # leading greedy ".*" anchors on the last "Answer: ", matching the
        # behaviour of the primary pattern.
        answer_only = re.search(r".*Answer: (.+)", text, re.DOTALL)
        if answer_only:
            answer = answer_only.group(1).strip('\n ')

    return {
        'answer': answer,
        'reasoning': reasoning
    }


def extract_synthetic_data(text):
    """Parse generated ``Problem: ... / Answer: ...`` pairs from a completion.

    Each pair starts at ``Problem: ``, continues through a newline followed by
    ``Answer: ``, and ends just before the next ``Problem:`` marker or at the
    end of the text.

    Args:
        text: Raw completion text produced by the model.

    Returns:
        A list of ``{'problem': str, 'answer': str}`` dicts in order of
        appearance, with surrounding newlines and spaces stripped. The list is
        empty when no pair is found.
    """
    pair_pattern = re.compile(r"Problem: (.*?)\nAnswer: (.*?)(?=\s*Problem:|$)", re.DOTALL)

    records = []
    for found in pair_pattern.finditer(text):
        problem, answer = found.groups()
        records.append({
            'problem': problem.strip('\n '),
            'answer': answer.strip('\n '),
        })
    return records

In [8]:
# Sanity check: generate 4 synthetic problems for a single reference question
# and confirm that the parser recovers them.
q_idx = 6

sanity_prompts = get_generate_prompt([data[q_idx]], 4)
outputs = llm.generate(sanity_prompts, sampling_params)

sanity_request = outputs[0]
sanity_text = sanity_request.outputs[0].text
print(sanity_request.prompt, sanity_text)

synth_data = extract_synthetic_data(sanity_text)
synth_data

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 1/1 [00:12<00:00, 12.93s/it, est. speed input: 10.52 toks/s, output: 31.08 toks/s]

Given a reference math problem, generate 4 similar math problems, along with their answers. End your response with "<|eot_id|>".
Use this format for each generated problem:
Problem: <new math problem>
Answer: <answer to the math problem>

Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{10}$, 2 with a probability of $\frac{2}{10}$, 3 with a probability of $\frac{3}{10}$, and 4 with a probability of $\frac{4}{10}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
 Answer: $\frac{1}{25}$

Here are 4 similar math problems:

Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{8}$, 2 with a probability of $\frac{2}{8}$, 3 with a probability of $\frac{3}{8}$, and 4 with a probability of $\frac{2}{8}$. If Alex and Ben both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{16}$

Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{6}$, 2 wi




[{'problem': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{8}$, 2 with a probability of $\\frac{2}{8}$, 3 with a probability of $\\frac{3}{8}$, and 4 with a probability of $\\frac{2}{8}$. If Alex and Ben both spin the spinner, what is the probability that they get the same number?',
  'answer': '$\\frac{1}{16}$'},
 {'problem': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{6}$, 2 with a probability of $\\frac{2}{6}$, 3 with a probability of $\\frac{2}{6}$, and 4 with a probability of $\\frac{1}{6}$. If Charlie and David both spin the spinner, what is the probability that they get the same number?',
  'answer': '$\\frac{1}{9}$'},
 {'problem': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{12}$, 2 with a probability of $\\frac{3}{12}$, 3 with a probability of $\\frac{4}{12}$, and 4 with a probability of $\\frac{4}{12}$. If Emily and Frank both spin the spinner, what is the probability that they get the same number?',
  'answer'

In [11]:
# Preview the few-shot prompt built for the sanity-check question, using the
# synthetic problems currently held in `synth_data` as its examples.
print(get_n_shot_cot_solve_prompts([data[q_idx]], [synth_data])[0])

Some examples of math problems and their answers, similar to the one you are going to be asked are provided below. Use them to understand and solve the problem. End your response with "<|eot_id|>".    

### Examples ###
Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{8}$, 2 with a probability of $\frac{2}{8}$, 3 with a probability of $\frac{3}{8}$, and 4 with a probability of $\frac{2}{8}$. If Alex and Ben both spin the spinner, what is the probability that they get the same number?

Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{6}$, 2 with a probability of $\frac{2}{6}$, 3 with a probability of $\frac{2}{6}$, and 4 with a probability of $\frac{1}{6}$. If Charlie and David both spin the spinner, what is the probability that they get the same number?

Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{12}$, 2 with a probability of $\frac{3}{12}$, 3 with a probability of $\frac{4}{12}$, and 4 with a probability 

In [48]:
# Solve the sanity-check question with its synthetic examples as few-shot
# context, then parse the reasoning and answer from the completion.
cot_prompts = get_n_shot_cot_solve_prompts([data[q_idx]], [synth_data])
outputs = llm.generate(cot_prompts, sampling_params)

cot_request = outputs[0]
cot_text = cot_request.outputs[0].text
print(cot_request.prompt, cot_text)

result = extract_cot_answer(cot_text)
result

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.23s/it, est. speed input: 141.69 toks/s, output: 70.54 toks/s]

Some examples of math problems and their answers, similar to the one you are going to be asked are provided below. Use them to understand and solve the problem. End your response with "<|eot_id|>".    

### Examples ###
Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{8}$, 2 with a probability of $\frac{2}{8}$, 3 with a probability of $\frac{3}{8}$, and 4 with a probability of $\frac{4}{8}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{4}$

Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{12}$, 2 with a probability of $\frac{2}{12}$, 3 with a probability of $\frac{3}{12}$, and 4 with a probability of $\frac{4}{12}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?
Answer: $\frac{1}{6}$

Problem: I have a spinner that lands on 1 with a probability of $\frac{1}{16}$, 2 with a probability of $\frac{2}{16}$, 3 with a probabili




{'answer': '$\\frac{3}{10}$',
 'reasoning': 'To find the probability that Phil and Sarah get the same number, we need to find the probability of each possible outcome and add them together. The probability of both getting 1 is $\\frac{1}{10} \\cdot \\frac{1}{10} = \\frac{1}{100}$. The probability of both getting 2 is $\\frac{2}{10} \\cdot \\frac{2}{10} = \\frac{4}{100}$. The probability of both getting 3 is $\\frac{3}{10} \\cdot \\frac{3}{10} = \\frac{9}{100}$. The probability of both getting 4 is $\\frac{4}{10} \\cdot \\frac{4}{10} = \\frac{16}{100}$. Adding these probabilities together, we get $\\frac{1}{100} + \\frac{4}{100} + \\frac{9}{100} + \\frac{16}{100} = \\frac{30}{100} = \\frac{3}{10}$.'}

In [49]:
# Show the reference record (question and ground-truth answer) for comparison
# with the parsed model answer.
data[q_idx]

{'question': 'I have a spinner that lands on 1 with a probability of $\\frac{1}{10}$, 2 with a probability of $\\frac{2}{10}$, 3 with a probability of $\\frac{3}{10}$, and 4 with a probability of $\\frac{4}{10}$. If Phil and Sarah both spin the spinner, what is the probability that they get the same number?',
 'answer': '\\dfrac{3}{10}'}

In [50]:
# Check eval: score the parsed answer for the sanity-check question against
# its reference record using math_eval.process_results.

math_eval.process_results(data[q_idx], result['answer'])

{'exact_match': 1}

In [13]:
# Build generation prompts for every record in `data`, requesting 3 synthetic
# problems per reference question.
# For a quick smoke test, use a slice instead: get_generate_prompt(data[:5], 3)
generate_prompts = get_generate_prompt(data, 3)

In [61]:
# Load the previously generated synthetic problems from disk. JSON is UTF-8 by
# specification, so read it as such rather than with the platform's default
# locale encoding.
# NOTE(review): this rebinds `synth_data`, which earlier held the examples for
# a single question; from here on it is indexed per question (entry i is used
# for data[i] when the n-shot prompts are built).
with open('../src/_synth_data_70b_int4_5.json', 'r', encoding='utf-8') as f:
    synth_data = json.load(f)

In [62]:
# Build the few-shot solve prompts for every record in `data`, pairing
# data[i] with the synthetic examples in synth_data[i].
# For a quick smoke test, use a slice instead:
# get_n_shot_cot_solve_prompts(data[:5], synth_data)
solve_n_shot_prompts = get_n_shot_cot_solve_prompts(data, synth_data)

In [63]:
# Run the full set of few-shot prompts through the model in a single call,
# then parse the reasoning and final answer from the first completion of each
# request.
outputs = llm.generate(solve_n_shot_prompts, sampling_params)
answers = [extract_cot_answer(completion.outputs[0].text) for completion in outputs]

Processed prompts:  20%|█▉        | 998/5000 [05:47<43:49,  1.52it/s, est. speed input: 1383.35 toks/s, output: 652.94 toks/s]  



Processed prompts:  20%|██        | 1007/5000 [06:00<2:49:17,  2.54s/it, est. speed input: 1347.30 toks/s, output: 638.81 toks/s]



Processed prompts:  28%|██▊       | 1404/5000 [15:12<4:41:17,  4.69s/it, est. speed input: 730.78 toks/s, output: 999.54 toks/s] 



Processed prompts:  56%|█████▌    | 2792/5000 [25:59<47:59,  1.30s/it, est. speed input: 849.62 toks/s, output: 1221.65 toks/s]  



Processed prompts:  56%|█████▌    | 2803/5000 [27:21<3:06:44,  5.10s/it, est. speed input: 810.27 toks/s, output: 1190.32 toks/s]



Processed prompts:  90%|█████████ | 4510/5000 [38:08<13:26,  1.65s/it, est. speed input: 928.31 toks/s, output: 1273.13 toks/s]  



Processed prompts:  91%|█████████ | 4530/5000 [39:53<1:10:53,  9.05s/it, est. speed input: 892.30 toks/s, output: 1247.74 toks/s]



Processed prompts: 100%|██████████| 5000/5000 [47:22<00:00,  1.76it/s, est. speed input: 826.85 toks/s, output: 1378.95 toks/s]  


In [64]:
# Save answers as json: the list of {'answer', 'reasoning'} dicts is written
# to the current working directory.
# NOTE(review): the file name says "8b", but this notebook loads the 70B
# AWQ-INT4 model and synthetic data from "_synth_data_70b_int4_5.json".
# Confirm the name is intended before overwriting an existing results file.
with open("answers_5_shot_synth_8b_v2.json", "w") as f:
    json.dump(answers, f)