In [1]:
import re
import json
from typing import List

import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Pick the best available accelerator: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

In [2]:
# Prompt Builders
# ----------------Algorithmic----------------
def prompt_algorithmic(ex):
    """Return the ready-made prompt stored under the example's ``"prompt"`` key."""
    prompt_text = ex["prompt"]
    return prompt_text

# ----------------MMLU-Med----------------
def format_subject(subject):
    """Turn a snake_case subject into space-separated words.

    Every word is prefixed with a single space, so the result always starts
    with a space (e.g. ``"college_medicine"`` -> ``" college medicine"``).
    """
    return "".join(f" {word}" for word in subject.split("_"))
def format_example(example, include_answer=False):
    """Render one multiple-choice example as a question/options/answer prompt.

    Args:
        example: mapping with ``"question"``, ``"choices"`` (at most four
            entries, lettered A-D) and, when ``include_answer`` is set,
            ``"answer"`` (index into the letters).
        include_answer: append the gold letter and a blank line, for use as
            an in-context demonstration.

    Returns:
        The formatted prompt text, ending in ``"Answer:"`` unless the answer
        is included.
    """
    letters = ["A", "B", "C", "D"]
    pieces = [f"Question: {example['question']}\n Options:"]

    for idx, option_text in enumerate(example["choices"]):
        pieces.append(f"\n{letters[idx]}. {option_text}")

    pieces.append("\nAnswer:")
    if include_answer:
        # for in-context learning
        pieces.append(f" {letters[example['answer']]}\n\n")
    return "".join(pieces)
def prompt_mmlu_med(ex):
    """Build the zero-shot MMLU-Med prompt for one example.

    The header names the subject and asks for the final line to follow the
    ``The answer is (X)`` format; the question and lettered options from
    ``format_example`` follow it.

    Args:
        ex: mapping with ``"subject"``, ``"question"`` and ``"choices"``.

    Returns:
        The full prompt text ending in ``"Answer:"``.
    """
    # https://github.com/hendrycks/test/blob/master/evaluate.py
    # format_subject() prefixes every word with a space, so its result starts
    # with one; strip it here because the header already has a space after
    # "about" (otherwise the prompt reads "about  college medicine").
    subject = format_subject(ex['subject']).strip()
    prompt = f"The following is a multiple choice question (with answers) about {subject}.  Output the answer in the format of \"The answer is (X)\" at the end.\n\n"
    return prompt + format_example(ex, include_answer=False)

# ----------------InfoBench----------------
def prompt_infobench(ex):
    """Build the InfoBench prompt from the example's instruction and input."""
    instruction = ex['instruction']
    task_input = ex['input']
    return f"Instruction: {instruction}\nQuestion: {task_input}\nGeneration:"

In [None]:
# Task-specific Metrics
# ----------------Algorithmic----------------
def score_algorithmic(pred: str, gold: dict):
    """Score a predicted set of weighted paths against the gold set.

    Args:
        pred: JSON text expected to decode to
            ``{"paths": [[...], ...], "weights": [...]}``, where
            ``weights[i]`` belongs to ``paths[i]``.
        gold: dict whose ``"paths"`` entry is a list of
            ``{"path": [...], "weight": ...}`` dicts.

    Returns:
        The fraction of distinct predicted ``(path, weight)`` pairs that also
        occur in ``gold``; ``0.0`` when the prediction contains no pairs; and
        ``None`` when the prediction cannot be parsed into pairs.
    """
    try:
        # Decoding lives inside the try: malformed model output must yield
        # None like any other unparseable prediction instead of raising.
        parsed = json.loads(pred)
        paths = parsed['paths']
        weights = parsed['weights']
        pred_pairs = {(tuple(paths[i]), weights[i]) for i in range(len(paths))}
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError covers json.JSONDecodeError; the others cover a wrong
        # top-level shape, fewer weights than paths, or unhashable entries.
        print("!! Failed to parse prediction:", pred)
        return None

    if len(pred_pairs) == 0:
        return 0.0

    gold_pairs = {(tuple(d["path"]), d["weight"]) for d in gold['paths']}

    overlap = pred_pairs & gold_pairs
    # Normalise by the number of predicted pairs, not by the number of keys
    # in the decoded dict.
    return len(overlap) / len(pred_pairs)

# ----------------MMLU-Med----------------
def extract_answer(text):
    """Pull the answer letter out of a response of the form "answer is (X)".

    A ``$\\boxed{X}$`` wrapper around a single letter is unwrapped first.
    When the primary pattern is absent the text is logged and handed to
    ``extract_again``.
    """
    # remove the latex box, common for AIME
    unboxed = re.sub(r'\$\\boxed\{([A-Za-z])\}\$', r'\1', text)

    found = re.search(r"answer is \(?([A-J])\)?", unboxed)
    if found is None:
        print("1st answer extract failed\n" + unboxed)
        return extract_again(unboxed)
    return found.group(1)

def extract_again(text):
    """Second-chance extraction: look for "Answer: X" / "answer: X".

    Falls through to ``extract_final`` when that pattern is absent too.
    """
    found = re.search(r'.*[aA]nswer:\s*([A-J])', text)
    if found is None:
        return extract_final(text)
    return found.group(1)

def extract_final(text):
    """Last-resort extraction of an answer letter.

    Returns the last stand-alone capital letter A-J in ``text``; failing
    that, the letter following the word "option"; failing that, ``None``.
    """
    last_letter = re.search(r"\b[A-J]\b(?!.*\b[A-J]\b)", text, re.DOTALL)
    if last_letter is not None:
        return last_letter.group(0)

    after_option = re.search(r"option \(?([A-J])\)?", text)
    if after_option is not None:
        return after_option.group(1)

    return None

def convert_llm_response_to_solution(llm_response: str) -> str:
    """Strip markdown bold markers from a response and extract its answer letter."""
    # adapted from https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/evaluate_from_api.py
    without_bold = llm_response.replace('**', '')
    return extract_answer(without_bold)

def score_mmlu_med(pred: str, gold: int):
    """Return whether the letter extracted from ``pred`` equals the gold choice.

    Args:
        pred: raw model response.
        gold: index of the correct option (0 -> "A" ... 3 -> "D").

    Returns:
        ``True`` when the extracted letter matches the gold letter, else ``False``.
    """
    print("Begin scoring for mmlu-med task...")
    letters = ["A", "B", "C", "D"]
    predicted_solution = convert_llm_response_to_solution(pred)
    gold_letter = letters[gold]
    print(f"Predicted solution: {predicted_solution}, Gold solution: {gold_letter}")
    return gold_letter == predicted_solution

# ----------------InfoBench----------------
# def score_infobench(pred: str, gold: dict):
import os
from openai import OpenAI
import time
# Instruction block for the LLM judge; score_infobench() puts it at the start of
# the first user turn of every grading conversation.
# NOTE(review): the literal contains a curly closing quote after "second person?"
# and ends with a stray `'''` -- these look like copy artefacts. Presumably they
# are kept to stay identical to the reference InfoBench prompt; confirm before
# editing, since any change alters what the judge sees.
SYS_MSG ="Based on the provided Input (if any) and Generated Text, answer the ensuing Questions with either a YES or NO choice. Your selection should be based on your judgment as well as the following rules:\n\n- YES: Select 'YES' if the generated text entirely fulfills the condition specified in the question. However, note that even minor inaccuracies exclude the text from receiving a 'YES' rating. As an illustration. consider a question that asks. \"Does each sentence in the generated text use a second person?” If even one sentence does not use the second person, the answer should NOT be 'YES'. To qualify for a 'YES' rating, the generated text must be entirely accurate and relevant to the question\n\n- NO: Opt for 'NO' if the generated text fails to meet the question's requirements or provides no information that could be utilized to answer the question. For instance, if the question asks. \"Is the second sentence in the generated text a compound sentence?\" and the generated text only has one sentence. it offers no relevant information to answer the question. Consequently, the answer should be 'NO'.'''"
def bool_ratio(bool_results: List[bool]) -> float:
    """Return the fraction of truthy entries in ``bool_results``.

    Falsy entries -- ``False`` as well as ``None`` -- count against the ratio.
    An empty list yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if not bool_results:
        return 0.0
    true_count = sum(1 for entry in bool_results if entry)
    return true_count / len(bool_results)

def _parse_yes_no(generation: str):
    """Map a judge reply to True (yes), False (no) or None (undecidable)."""
    lowered = generation.lower()
    if lowered.startswith("yes"):
        return True
    if lowered.startswith("no"):
        return False
    has_yes = "YES" in generation
    has_no = "NO" in generation
    if has_yes and not has_no:
        return True
    if has_no and not has_yes:
        return False
    return None


def score_infobench(predicted_solution: str, example: dict, max_retries: int = 5) -> float:
    """Grade a generation against an example's decomposed yes/no questions.

    Adapted from https://github.com/qinyiwei/InfoBench/blob/main/evaluation.py.
    The questions are put to an OpenAI chat model one at a time inside a single
    growing conversation: the first user turn carries SYS_MSG, the task input
    (when non-empty), the generated text and the first question; every later
    user turn carries only the next question.

    Args:
        predicted_solution: the generated text to be judged.
        example: mapping with ``"input"`` and ``"decomposed_questions"``.
        max_retries: how many times a failing API call is attempted per
            question before grading of this example is abandoned.

    Returns:
        The share of asked questions answered "yes". A reply that cannot be
        read as yes/no, or a question the judge could not be reached for,
        counts as not satisfied and ends grading of this example; questions
        after it are not asked and not counted. ``0.0`` when there are no
        questions.
    """
    input_task = example['input']
    output = predicted_solution
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # default config
    temperature = 1.0
    eval_model = "gpt-5-nano-2025-08-07"

    message = []
    bool_results = []

    for question in example["decomposed_questions"]:
        if len(message) == 0:
            if input_task:
                content =  f"{SYS_MSG}\n\nInput:\n\"{input_task}\"\n\nGenerated Text:\n\"{output}\"\n\nQuestion:\n{question}\n"
            else:
                content =  f"{SYS_MSG}\n\nGenerated Text:\n\"{output}\"\n\nQuestion:\n{question}\n"
        else:
            content = f"{question}\n"
        message.append({"role": "user", "content": content})

        # Ask the judge, retrying transient failures a bounded number of times.
        generation = None
        for attempt in range(1, max_retries + 1):
            try:
                completion = client.chat.completions.create(
                    model=eval_model,
                    messages=message,
                    temperature=temperature,
                )
                generation = completion.choices[0].message.content or ""
                break
            except Exception as e:
                print("ERROR!")
                print(e)
                if attempt < max_retries:
                    print("Retry!")
                    time.sleep(5)

        if generation is None:
            # Every attempt failed. Continuing would leave a user turn without
            # a reply in the conversation, so stop grading this example.
            print(f"Giving up on this example after {max_retries} failed attempts")
            bool_results.append(None)
            break

        message.append({"role": "assistant", "content": generation})

        verdict = _parse_yes_no(generation)
        bool_results.append(verdict)
        if verdict is None:
            # when no answer occurs, break the loop and continue to next instance
            for msg in message:
                print(msg['content'])
            print("NO YES or NO answer!" + generation)
            break

    if not bool_results:
        return 0.0
    return bool_ratio(bool_results)

In [4]:
import os
from pydantic import BaseModel
from typing import List
from huggingface_hub import InferenceClient

# Client for a served text-generation endpoint. The URL is a placeholder for a
# specific host and has to be adapted to wherever the server is running.
client = InferenceClient(model="http://babel-9-3:9010/v1")  # TODO

# class PathPair(BaseModel):
#     path: List[int]
#     weight: int
class AlgorithmicOutput(BaseModel):
    """Schema for the algorithmic task's answer: parallel lists of paths and weights.

    score_algorithmic() pairs ``paths[i]`` with ``weights[i]``.
    """
    # each path is a list of integers -- presumably node ids; confirm against the dataset
    paths: List[List[int]]
    # one integer weight per path, in the same order as `paths`
    weights: List[int]

# JSON-schema grammar for the algorithmic task: an object that must carry
# "paths" (an array of integer arrays) and "weights" (an array of integers).
grammar_algorithmic = dict(
    type="json",
    value=dict(
        type="object",
        properties=dict(
            paths=dict(
                type="array",
                items=dict(type="array", items=dict(type="integer")),
            ),
            weights=dict(type="array", items=dict(type="integer")),
        ),
        required=["paths", "weights"],
    ),
)

In [5]:
# Models to evaluate. Each entry is (display name, Hugging Face model id,
# thinking_mode); main() unpacks the tuples in exactly this order.
MODELS = [
    ("Qwen3-4B", "Qwen/Qwen3-4B", True),
    # ("Qwen3-4B-Instruct-2507", "Qwen/Qwen3-4B-Instruct-2507", False),
    # ("Qwen3-1.7B", "Qwen/Qwen3-1.7B", True),
]

# Decoding configs
# Each entry is (label written to the results table, decoding overrides that
# main() applies on top of MAX_NEW_TOKENS when generating).
DECODE = [
    ("default", {}),  # model's generation_config
    ("greedy", {"do_sample": False}),
    ("temp_0-25", {"do_sample": True, "temperature": 0.25, "top_p": 1.0}),
    ("temp_1-5",  {"do_sample": True, "temperature": 1.5,  "top_p": 1.0}),
    ("beam3", {"do_sample": False, "num_beams": 3}),
    ("beam25", {"do_sample": False, "num_beams": 25}),
    ("typical", {"do_sample": True, "typical_p": 0.9, "top_p": 1.0, "temperature": 1.0}),
]

# Tasks to run. Each entry is (dataset config name, split, prompt builder,
# scoring function, response format); main() unpacks the tuples in this order.
TASKS = [
    # ("graph_dev", "dev_test", prompt_algorithmic, score_algorithmic, grammar_algorithmic),
    # ("infobench", "dev_test", prompt_infobench, score_infobench, None),
    ("mmlu_med",  "dev_test", prompt_mmlu_med,  score_mmlu_med, None),
]

# Upper bound on generated tokens per example.
MAX_NEW_TOKENS = 4096 # TODO

# (model display name, decoding overrides).
# NOTE(review): not referenced by any other code visible in this notebook --
# confirm whether it is still needed.
DECODE_AGAINST_SCALING = [
    ("Qwen3-4B", {"temperature": 1.5, "top_p": 0.8}),
    ("Qwen3-1.7B", {"do_sample": False}),
]

In [6]:
def main():
    """Evaluate every model in MODELS on every task in TASKS under every DECODE config.

    For each (model, task, decoding config) combination, every example's prompt
    is wrapped in the tokenizer's chat template and generated locally with
    ``model.generate``. The output is split into reasoning and final answer at
    the last ``</think>`` token, and the final answer is passed to the task's
    scoring function. Scores that come back as ``None`` are left out of the
    mean. One row per combination is collected, written to
    ``shared_task_result.csv`` and printed per task.

    Raises:
        ValueError: if TASKS names a task this function does not know how to
            fetch the gold reference for.
    """
    import gc  # only needed here, to release one model before loading the next

    # Token id of "</think>" in the Qwen3 vocabulary.
    think_end_token_id = 151668
    # Number of examples kept per task while debugging.
    debug_subset_size = 5

    # How to obtain the gold reference handed to the scorer, per task.
    gold_for_task = {
        "graph_dev": lambda ex: ex["solution"],
        "mmlu_med": lambda ex: ex["answer"],
        "infobench": lambda ex: ex,
    }

    rows = []

    for model_name, hf_id, thinking_mode in MODELS:
        tokenizer = AutoTokenizer.from_pretrained(hf_id, trust_remote_code=True)
        # `torch_dtype` triggers a deprecation warning on recent transformers
        # (which prefer `dtype`); it is kept so older installs keep working.
        model = AutoModelForCausalLM.from_pretrained(
            hf_id,
            trust_remote_code=True,
            torch_dtype="auto",
            device_map="auto",
        )
        # enable_thinking is only passed for models flagged as thinking models.
        template_kwargs = {"enable_thinking": True} if thinking_mode else {}

        for task, split, build_prompt_func, calculate_score_func, res_format in TASKS:
            if task not in gold_for_task:
                raise ValueError(f"Unknown task {task!r}: no gold reference defined for scoring")
            get_gold = gold_for_task[task]

            print(f"\n=== Model: {model_name} | Task: {task} ===")
            # res_format is only logged: constrained decoding is not applied
            # on this local generate() path.
            print(f"response_format: {res_format}")
            dataset = load_dataset("vashistht/11763_datasets", task, split=split)
            dataset = dataset.select(range(min(debug_subset_size, len(dataset))))  # debugging

            for mode_name, co in DECODE:
                metrics_sum, n = 0, 0
                for ex in dataset:
                    # Build input
                    messages = [{"role": "user", "content": build_prompt_func(ex)}]
                    prompt = tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=True,
                        **template_kwargs,
                    )
                    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

                    # Pass the overrides as keyword arguments rather than as a
                    # fresh GenerationConfig: with a separate config object,
                    # transformers back-fills every field still at its global
                    # default from the model's own generation config, so
                    # do_sample=False, top_p=1.0 or temperature=1.0 would be
                    # replaced by the model defaults. Keyword arguments are
                    # applied on top of the model's config instead, so fields
                    # not listed in `co` (e.g. top_k) keep the model default.
                    generated_ids = model.generate(
                        **model_inputs,
                        max_new_tokens=MAX_NEW_TOKENS,
                        **co,
                    )

                    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
                    # parsing thinking content: split right after the last </think>
                    try:
                        index = len(output_ids) - output_ids[::-1].index(think_end_token_id)
                    except ValueError:
                        # no </think> in the output: treat everything as the answer
                        index = 0

                    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
                    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

                    print("thinking content:", thinking_content)
                    print("content:", content)

                    m = calculate_score_func(content, get_gold(ex))
                    print('=' * 20)

                    # A scorer returns None when the prediction could not be
                    # parsed; such examples are excluded from the mean.
                    if m is not None:
                        metrics_sum += m
                        n += 1

                row = {"model": model_name, "hf_id": hf_id, "task": task, "split": split, "decode": mode_name}
                row['score'] = metrics_sum / max(1, n)
                # Number of examples that contributed to the mean.
                row['n_scored'] = n
                rows.append(row)
                print(row)

        # Drop the references before clearing the cache; empty_cache() cannot
        # return memory that is still held by a live model.
        del model, tokenizer
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()

    df = pd.DataFrame(rows)
    df.to_csv('shared_task_result.csv', index=False)
    print("\nResults saved!")
    for t in df.task.unique():
        print("\n==", t, "==")
        print(df[df.task==t].to_string(index=False))


In [7]:
# Run the sweep. With the TASKS list as configured (only "mmlu_med" is
# uncommented) this evaluates MMLU-med only.
main() # MMLU-med only

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


=== Model: Qwen3-4B | Task: mmlu_med ===
response_format: None


`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'temperature': 0.6, 'top_k': 20, 'top_p': 0.95, 'pad_token_id': 151643, 'bos_token_id': 151643, 'eos_token_id': [151645, 151643]}. If this is not desired, please set these values explicitly.


KeyboardInterrupt: 

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the checkpoint only to inspect the generation defaults that ship with it.
# NOTE(review): this binds the module-level names `model_id` and `model`; the
# cell's execution count (In [1]) suggests it was run in a separate session.
model_id = "Qwen/Qwen3-1.7B"
model = AutoModelForCausalLM.from_pretrained(model_id)
# Print the defaults (do_sample, temperature, top_k, top_p, ...) as a plain dict.
print(model.generation_config.to_dict())


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'max_length': 20, 'max_new_tokens': None, 'min_length': 0, 'min_new_tokens': None, 'early_stopping': False, 'max_time': None, 'stop_strings': None, 'do_sample': True, 'num_beams': 1, 'num_beam_groups': 1, 'use_cache': True, 'cache_implementation': None, 'cache_config': None, 'return_legacy_cache': None, 'prefill_chunk_size': None, 'temperature': 0.6, 'top_k': 20, 'top_p': 0.95, 'min_p': None, 'typical_p': 1.0, 'epsilon_cutoff': 0.0, 'eta_cutoff': 0.0, 'diversity_penalty': 0.0, 'repetition_penalty': 1.0, 'encoder_repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'bad_words_ids': None, 'force_words_ids': None, 'renormalize_logits': False, 'constraints': None, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'sequence_bias': None, 'token_healing': False, 'guidance_scale': None, 'watermarking_config': None, 'num_return_sequenc