In [1]:
import json
import os
import re

from vllm import LLM, SamplingParams

# os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
# os.environ["NCCL_P2P_DISABLE"] = "1"

In [2]:
def load_math(path="../datasets/MATH", split="train"):
    """Load one split of the MATH dataset as a list of question/answer records.

    Reads ``<path>/<split>/dataset.json``. The file must contain a JSON object
    with two parallel lists stored under the keys ``'question'`` and
    ``'extracted_answers'``.

    Args:
        path: Root directory of the dataset.
        split: Name of the sub-directory holding the split (e.g. ``"train"``
            or ``"test"``).

    Returns:
        A list of ``{'question': ..., 'answer': ...}`` dicts, one per pair.
        Pairing uses ``zip``, so if the two lists differ in length the extra
        entries of the longer list are dropped.

    Raises:
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If either expected key is missing.
    """
    dataset_file = os.path.join(path, split, "dataset.json")

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which can mis-decode non-ASCII characters in the problem text.
    with open(dataset_file, encoding="utf-8") as f:
        data = json.load(f)

    return [
        {'question': q, 'answer': a}
        for q, a in zip(data['question'], data['extracted_answers'])
    ]

In [3]:
# Load only the test split for evaluation; the commented-out line below is the
# alternative that concatenates the train and test splits.
data = load_math(split='test')
# data = load_math(split='train') + load_math(split='test')
# Trailing expression so the notebook displays the number of loaded examples.
len(data)

5000

In [None]:
# Build the vLLM engine used by every generation call below. The active
# checkpoint is the AWQ INT4 build of Llama 3.1 70B Instruct; the commented-out
# arguments are alternative settings that are currently disabled.
llm = LLM(
    # model="meta-llama/Llama-3.1-8B-Instruct", 
    model="hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4", 
    tensor_parallel_size=2,  # tensor-parallel degree (presumably one shard per GPU — confirm)
    # download_dir="/home/amittur/.cache/huggingface/hub", 
    max_model_len=10000,  # reported as max_seq_len=10000 in the engine start-up log
    # gpu_memory_utilization=0.95
    # max_num_seqs=128,
)

INFO 11-27 11:56:14 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 11-27 11:56:14 awq_marlin.py:109] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-27 11:56:15 config.py:1020] Defaulting to use mp for distributed inference
INFO 11-27 11:56:15 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', speculative_config=None, tokenizer='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda,

In [6]:
# Prompt template for solving one problem. It describes the expected
# "Reasoning:" / "Answer:" response layout and ends with a single positional
# `{}` placeholder that is filled with the question text via `str.format`.
SOLVE_PROMPT = """
Answer the math problem in the format shown below. End your response with "<|eot_id|>".

---
Problem: <you will be given a math question> 
Reasoning: <your step by step reasoning for the answer>
Answer: <your final answer only>
---

Problem: {}
"""

def get_prompts(data):
    """Render the solve prompt for every record in ``data``.

    Args:
        data: Iterable of mappings that each provide a ``'question'`` entry.

    Returns:
        A list of prompt strings, one per record and in the same order,
        produced by formatting ``SOLVE_PROMPT`` with the record's question.
    """
    prompts = []
    for record in data:
        prompts.append(SOLVE_PROMPT.format(record['question']))
    return prompts

In [7]:
# Decoding settings shared by the sanity check and the full batch run.
sampling_params = SamplingParams(
    temperature=0,  # presumably selects greedy (deterministic) decoding in vLLM — confirm
    max_tokens=5120,  # cap on the number of tokens generated per prompt
    # NOTE(review): these look like the Llama 3.1 end-of-text / end-of-message /
    # end-of-turn token ids — confirm against the model's tokenizer.
    stop_token_ids=[128001, 128008, 128009],
)

In [8]:
def extract_answer(text):
    """Extract the final answer from a response that carries no reasoning field.

    Everything following the first ``"Answer: "`` label (newlines included) is
    taken as the answer and stripped of surrounding whitespace.

    Args:
        text: Raw model response.

    Returns:
        A dict with ``'answer'`` (empty string when the label is not found) and
        ``'reasoning'``, which is always the ``'[NO_COT_REASONING]'`` marker.
    """
    found = re.search(r"Answer: (.+)", text, re.DOTALL)
    return {
        'answer': found.group(1).strip() if found else '',
        'reasoning': '[NO_COT_REASONING]',
    }

def extract_cot_answer(text):
    """Split a chain-of-thought response into its reasoning and answer parts.

    The response must contain a ``"Reasoning: "`` label followed later by an
    ``"Answer: "`` label. Matching is greedy and spans newlines, so when
    several ``"Answer: "`` labels are present the split happens at the last one.

    Args:
        text: Raw model response.

    Returns:
        A dict with whitespace-stripped ``'answer'`` and ``'reasoning'``
        strings; both are empty when the expected layout is not found.
    """
    found = re.search(r"Reasoning: (.+)Answer: (.+)", text, re.DOTALL)
    if found is None:
        return {'answer': '', 'reasoning': ''}

    reasoning_part, answer_part = found.groups()
    return {
        'answer': answer_part.strip(),
        'reasoning': reasoning_part.strip(),
    }

In [9]:
# Sanity check: run one example end to end and inspect both the raw completion
# and the parsed reasoning/answer before launching the full batch.

outputs = llm.generate(SOLVE_PROMPT.format(data[5]['question']), sampling_params)
sanity_text = outputs[0].outputs[0].text
print(sanity_text)
print(extract_cot_answer(sanity_text))

Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.70s/it, est. speed input: 26.71 toks/s, output: 30.44 toks/s]

Reasoning: The area of the shaded region is the area of the entire circle minus the area of the unshaded region. The area of the entire circle is $\pi \cdot 6^2 = 36\pi$ square inches. The area of the unshaded region is $\pi \cdot 4^2 = 16\pi$ square inches. Therefore, the area of the shaded region is $36\pi - 16\pi = 20\pi$ square inches. The ratio of the area of the shaded region to the area of the entire circle is $\frac{20\pi}{36\pi} = \frac{5}{9}$. Since the darts land randomly, we can expect $\frac{5}{9}$ of the darts to land in the shaded region. Therefore, we can expect $9 \cdot \frac{4}{9} = 4$ darts to land in the non-shaded region.
Answer: 4
{'answer': '4', 'reasoning': 'The area of the shaded region is the area of the entire circle minus the area of the unshaded region. The area of the entire circle is $\\pi \\cdot 6^2 = 36\\pi$ square inches. The area of the unshaded region is $\\pi \\cdot 4^2 = 16\\pi$ square inches. Therefore, the area of the shaded region is $36\\pi - 1




In [10]:
# Render one prompt per loaded example, in dataset order.
data_prompts = get_prompts(data)

In [11]:
# Generate a completion for every prompt in a single batched call.
outputs = llm.generate(data_prompts, sampling_params)

# Parse the first completion of each request into its reasoning/answer parts,
# keeping the same order as `outputs`.
answers = [
    extract_cot_answer(request_output.outputs[0].text)
    for request_output in outputs
]

Processed prompts:  13%|█▎        | 671/5000 [05:11<34:08,  2.11it/s, est. speed input: 270.40 toks/s, output: 510.89 toks/s]  



Processed prompts:  15%|█▌        | 769/5000 [06:05<47:30,  1.48it/s, est. speed input: 262.74 toks/s, output: 533.38 toks/s]  



Processed prompts:  16%|█▋        | 814/5000 [07:19<2:59:53,  2.58s/it, est. speed input: 231.88 toks/s, output: 517.65 toks/s]



Processed prompts:  17%|█▋        | 826/5000 [08:51<20:09:21, 17.38s/it, est. speed input: 195.13 toks/s, output: 459.05 toks/s]



Processed prompts:  17%|█▋        | 874/5000 [18:30<6:18:21,  5.50s/it, est. speed input: 98.99 toks/s, output: 422.23 toks/s]  



Processed prompts:  18%|█▊        | 905/5000 [20:00<3:20:49,  2.94s/it, est. speed input: 95.70 toks/s, output: 434.24 toks/s]



Processed prompts:  19%|█▊        | 929/5000 [20:41<2:32:50,  2.25s/it, est. speed input: 94.92 toks/s, output: 439.93 toks/s]



Processed prompts:  19%|█▉        | 969/5000 [21:25<55:55,  1.20it/s, est. speed input: 95.58 toks/s, output: 447.95 toks/s]  



Processed prompts:  20%|█▉        | 997/5000 [21:56<1:51:39,  1.67s/it, est. speed input: 95.67 toks/s, output: 455.16 toks/s]



Processed prompts:  21%|██        | 1037/5000 [22:30<43:50,  1.51it/s, est. speed input: 96.84 toks/s, output: 465.31 toks/s]  



Processed prompts:  22%|██▏       | 1105/5000 [23:29<1:28:18,  1.36s/it, est. speed input: 98.37 toks/s, output: 467.71 toks/s]



Processed prompts:  23%|██▎       | 1134/5000 [25:17<12:32:28, 11.68s/it, est. speed input: 94.02 toks/s, output: 454.73 toks/s]



Processed prompts:  23%|██▎       | 1174/5000 [31:15<1:15:20,  1.18s/it, est. speed input: 78.92 toks/s, output: 448.33 toks/s] 



Processed prompts:  24%|██▍       | 1196/5000 [31:56<2:00:06,  1.89s/it, est. speed input: 78.96 toks/s, output: 457.90 toks/s]



Processed prompts:  24%|██▍       | 1217/5000 [32:20<1:03:37,  1.01s/it, est. speed input: 79.36 toks/s, output: 458.88 toks/s]



Processed prompts:  25%|██▌       | 1255/5000 [32:50<35:48,  1.74it/s, est. speed input: 80.33 toks/s, output: 462.97 toks/s]  



Processed prompts:  28%|██▊       | 1410/5000 [34:09<21:47,  2.75it/s, est. speed input: 84.76 toks/s, output: 471.54 toks/s]  



Processed prompts:  30%|██▉       | 1491/5000 [35:09<32:25,  1.80it/s, est. speed input: 86.62 toks/s, output: 474.81 toks/s]  



Processed prompts:  30%|███       | 1524/5000 [35:40<57:13,  1.01it/s, est. speed input: 86.96 toks/s, output: 474.74 toks/s]  



Processed prompts:  31%|███       | 1549/5000 [38:50<15:29:40, 16.16s/it, est. speed input: 81.11 toks/s, output: 462.53 toks/s]



Processed prompts:  32%|███▏      | 1588/5000 [41:18<1:12:06,  1.27s/it, est. speed input: 78.09 toks/s, output: 463.00 toks/s] 



Processed prompts:  32%|███▏      | 1601/5000 [43:00<4:22:37,  4.64s/it, est. speed input: 75.71 toks/s, output: 463.76 toks/s] 



Processed prompts:  32%|███▏      | 1611/5000 [43:20<1:32:11,  1.63s/it, est. speed input: 75.53 toks/s, output: 466.52 toks/s]



Processed prompts:  38%|███▊      | 1924/5000 [45:39<47:54,  1.07it/s, est. speed input: 83.12 toks/s, output: 483.63 toks/s]  



Processed prompts:  41%|████      | 2060/5000 [47:04<48:39,  1.01it/s, est. speed input: 86.75 toks/s, output: 489.77 toks/s]  



Processed prompts:  42%|████▏     | 2078/5000 [47:31<39:44,  1.23it/s, est. speed input: 86.70 toks/s, output: 489.33 toks/s]  



Processed prompts:  43%|████▎     | 2128/5000 [49:14<4:59:33,  6.26s/it, est. speed input: 85.92 toks/s, output: 487.37 toks/s]



Processed prompts:  43%|████▎     | 2164/5000 [55:53<2:58:49,  3.78s/it, est. speed input: 77.05 toks/s, output: 478.54 toks/s] 



Processed prompts:  45%|████▍     | 2233/5000 [56:54<18:05,  2.55it/s, est. speed input: 77.95 toks/s, output: 483.63 toks/s]  



Processed prompts:  47%|████▋     | 2351/5000 [58:24<1:01:57,  1.40s/it, est. speed input: 80.53 toks/s, output: 485.38 toks/s]



Processed prompts:  47%|████▋     | 2371/5000 [1:00:47<13:27:19, 18.43s/it, est. speed input: 78.32 toks/s, output: 476.15 toks/s]



Processed prompts:  62%|██████▏   | 3095/5000 [1:10:06<16:58,  1.87it/s, est. speed input: 88.21 toks/s, output: 486.28 toks/s]   



Processed prompts:  70%|██████▉   | 3493/5000 [1:13:20<39:14,  1.56s/it, est. speed input: 98.28 toks/s, output: 488.25 toks/s]



Processed prompts:  70%|███████   | 3521/5000 [1:17:29<2:20:29,  5.70s/it, est. speed input: 93.91 toks/s, output: 483.26 toks/s]



Processed prompts:  74%|███████▍  | 3698/5000 [1:19:34<14:31,  1.49it/s, est. speed input: 97.76 toks/s, output: 485.64 toks/s]  



Processed prompts:  76%|███████▌  | 3796/5000 [1:20:53<38:57,  1.94s/it, est. speed input: 99.50 toks/s, output: 485.24 toks/s]



Processed prompts:  77%|███████▋  | 3861/5000 [1:21:44<07:56,  2.39it/s, est. speed input: 99.83 toks/s, output: 489.03 toks/s]



Processed prompts:  94%|█████████▎| 4680/5000 [1:26:24<02:03,  2.59it/s, est. speed input: 111.18 toks/s, output: 503.98 toks/s]



Processed prompts: 100%|██████████| 5000/5000 [1:36:20<00:00,  1.16s/it, est. speed input: 105.98 toks/s, output: 511.36 toks/s]


In [12]:
# Persist the parsed answers to disk as a single JSON array.
with open("answers_70b_int4.json", "w") as out_file:
    out_file.write(json.dumps(answers))