In [1]:
import os
import gc
import time
import warnings

import json
import pandas as pd
import re
import torch
import json
from tqdm import tqdm

from vllm import LLM, SamplingParams
import ctypes

INFO 08-10 07:40:11 [__init__.py:235] Automatically detected platform cuda.


In [2]:
warnings.simplefilter('ignore')

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def clean_memory(deep=False):
    gc.collect()
    if deep:
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    torch.cuda.empty_cache()

llm_model_path = '/pscratch/sd/r/ritesh11/temp_dir/trained_models/base'
tok_path = '/pscratch/sd/r/ritesh11/temp_dir/Qwen3-1.7B'
res_dir = "/pscratch/sd/r/ritesh11/temp_dir/MATS_qna/og_noreason"
FINETUNE = False

if not FINETUNE:
    llm_model_path = tok_path

llm = LLM(
    llm_model_path,
    tokenizer=tok_path,
    #dtype="half",                -> Changed this
    #max_num_seqs=128,            -> Changed this       
    trust_remote_code=True,     
    tensor_parallel_size=1,      
    gpu_memory_utilization=0.90, 
)

INFO 08-10 07:40:20 [config.py:1604] Using max model len 40960
INFO 08-10 07:40:20 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-10 07:40:21 [core.py:572] Waiting for init message from front-end.
INFO 08-10 07:40:21 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='/pscratch/sd/r/ritesh11/temp_dir/Qwen3-1.7B', speculative_config=None, tokenizer='/pscratch/sd/r/ritesh11/temp_dir/Qwen3-1.7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), o

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 08-10 07:40:26 [default_loader.py:262] Loading weights took 2.76 seconds
INFO 08-10 07:40:27 [gpu_model_runner.py:1892] Model loading took 3.2152 GiB and 2.968192 seconds
INFO 08-10 07:40:33 [backends.py:530] Using cache directory: /global/homes/r/ritesh11/.cache/vllm/torch_compile_cache/ed38c1902a/rank_0_0/backbone for vLLM's torch.compile
INFO 08-10 07:40:33 [backends.py:541] Dynamo bytecode transform time: 6.51 s
INFO 08-10 07:40:40 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 5.867 s
INFO 08-10 07:40:40 [monitor.py:34] torch.compile takes 6.51 s in total
INFO 08-10 07:40:41 [gpu_worker.py:255] Available KV cache memory: 30.79 GiB
INFO 08-10 07:40:42 [kv_cache_utils.py:833] GPU KV cache size: 288,272 tokens
INFO 08-10 07:40:42 [kv_cache_utils.py:837] Maximum concurrency for 40,960 tokens per request: 7.04x


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:01<00:00, 43.17it/s]


INFO 08-10 07:40:43 [gpu_model_runner.py:2485] Graph capturing finished in 2 secs, took 0.49 GiB
INFO 08-10 07:40:43 [core.py:193] init engine (profile, create kv cache, warmup model) took 16.73 seconds


In [3]:
tokenizer  = llm.get_tokenizer()

In [4]:
N_SAMPLES = 20

sampling_params = SamplingParams(
    n=N_SAMPLES,
    temperature=1,
    max_tokens=9000,
)

In [5]:
qtypes = ['MCQ_KNOWLEDGE_TRUE',
 'MCQ_KNOWLEDGE_FALSE',
 'MCQ_DISTINGUISH',
 'GENERATIVE_DISTINGUISH']

qsubtypes = ['direct_reference', 'indirect_reference']

In [6]:
def apply_template(prompt, tokenizer, qtype):
    
    if qtype == "GENERATIVE_DISTINGUISH":
        messages = [
            {
                "role": "system",
                "content": (
                    "You are an intelligent assistant tasked with distinguishing between "
                    "factual and counterfactual scenarios. Read the prompt and the two scenarios. "
                    "Based on your world knowledge, determine which scenario aligns with real-world facts. "
                    "Answer in one word "
                    "in this format:\n\n"
                    "/boxed{0} or /boxed{1}\n\n"
                    "Use this format exactly as shown. Do not write anything after the boxed answer."
                )
            },
            {
                "role": "user",
                "content": (
                    f"Prompt: {prompt['prompt']}\n\n"
                    f"Scenario 0: {prompt['scenarios']['true_fact_scenario']}\n"
                    f"Scenario 1: {prompt['scenarios']['false_fact_scenario']}\n\n"
                    "Which scenario aligns with real-world facts?"
                )
            }
        ]
    else:
        messages = [
            {
                "role": "system",
                "content": (
                    "You are an intelligent assistant tasked with answering multiple-choice questions. "
                    "Choose the most appropriate option from the given choices. "
                    "Your final answer must be a single word or phrase exactly as it appears in the options. "
                    "End your response with the answer written in this format:\n\n"
                    "/boxed{ANSWER}\n\n "
                    "Do not write anything else after boxed answer."
                )
            },
            {
                "role": "user",
                "content": (
                    f"{prompt['question']}\n"
                    f"Options: {prompt['options']}"
                )
            }
        ]

    formatted_prompt = tokenizer.apply_chat_template(
        conversation=messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    return formatted_prompt


In [7]:
for temp in [0.5, 1.0]:
    
    sampling_params.temperature = temp
    
    for fact_num in range(10):
        with open(f"/pscratch/sd/r/ritesh11/temp_dir/MATS_qna/Fact{fact_num}.json", "r") as f:
            data = json.load(f)
            
            # Output structure
            updated_rows = {}
            BATCH_SIZE = 1  # or as per your VRAM and throughput
            all_gens = []
            
            for qt in qtypes:
                
                if FINETUNE and qt == "MCQ_KNOWLEDGE_TRUE":
                    continue
                if not FINETUNE and qt == "MCQ_KNOWLEDGE_FALSE":
                    continue
                
                for qsub in qsubtypes:
            
                    qdata = data[qt][qsub]
            
                    for i in tqdm(range(0, len(qdata), BATCH_SIZE), desc=f"{qt} | {qsub}"):
                        batch_raw_prompts = qdata[i:i + BATCH_SIZE]
            
                        # Format prompts for vLLM (chat-style)
                        batch_prompts = [
                            apply_template(prompt, tokenizer, qt) for prompt in batch_raw_prompts
                        ]
            
                        # Generate using vLLM
                        request_output = llm.generate(
                            prompts=batch_prompts,
                            sampling_params=sampling_params,
                            use_tqdm=False,
                        )
            
                        # Store results: handle multiple outputs per prompt
                        for j, prompt_dict in enumerate(batch_raw_prompts):
                            if qt not in updated_rows:
                                updated_rows[qt] = {}
                            if qsub not in updated_rows[qt]:
                                updated_rows[qt][qsub] = []
                        
                            generations = [out.text.strip() for out in request_output[j].outputs]
                        
                            # Clone original prompt and attach generations
                            prompt_with_gen = prompt_dict.copy()
                            prompt_with_gen["generations"] = generations
                        
                            # Append to results list
                            updated_rows[qt][qsub].append(prompt_with_gen)
                            all_gens.extend(generations)
            
                        # Inside the batch loop:
                        batch_filename = os.path.join(res_dir, f"backup_Fact{fact_num}_{temp}.json")
                        with open(batch_filename, "w") as f:
                            json.dump(updated_rows, f, indent=2)
                        
                        # print(f"✅ BATCH {i + BATCH_SIZE} / {len(qdata)} DONE for {qt} | {qsub}")
            
            # Final dump after all batches
            final_filename = os.path.join(res_dir, f"final_output_Fact{fact_num}_{temp}.json")
            with open(final_filename, "w") as f:
                json.dump(updated_rows, f, indent=2)
            
            print(f"✅ Fact {fact_num} Done")
            
        print(f"TEMPERATURE {temp} Done")

MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 15.88it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.92it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 16.40it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.77it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 26.72it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 18.93it/s]


✅ Fact 0 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.08it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 14.11it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.51it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 15.11it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 10/10 [00:00<00:00, 37.55it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 10/10 [00:00<00:00, 35.94it/s]


✅ Fact 1 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 10.59it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 10.35it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.12it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:01<00:00,  9.48it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 38.07it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 32.76it/s]


✅ Fact 2 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.32it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 20.45it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.10it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 19.63it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 24.36it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 22.65it/s]


✅ Fact 3 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.45it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 15.14it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 17.49it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.69it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 28.03it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 28.25it/s]


✅ Fact 4 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.51it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 20.14it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.40it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 19.76it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 38.25it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 37.47it/s]


✅ Fact 5 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 10.90it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 11.67it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00,  9.94it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 12.70it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 22.50it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 20.94it/s]


✅ Fact 6 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 12.82it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 13.48it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 12.18it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 14.55it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 30.35it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 23.35it/s]


✅ Fact 7 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 18.57it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.57it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 17.13it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.65it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 30.30it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 23.68it/s]


✅ Fact 8 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 18.25it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 18.16it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 17.90it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.44it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 26.86it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 30.43it/s]


✅ Fact 9 Done
TEMPERATURE 0.5 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 16.88it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 18.22it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 16.78it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 18.35it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 24.15it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 27.83it/s]


✅ Fact 0 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.29it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 14.22it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.93it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 15.16it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 10/10 [00:00<00:00, 35.91it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 10/10 [00:00<00:00, 39.44it/s]


✅ Fact 1 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 10.95it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 10.49it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.22it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:01<00:00,  9.36it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 34.20it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 28.07it/s]


✅ Fact 2 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.48it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 20.61it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.51it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 20.02it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 23.60it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 20.24it/s]


✅ Fact 3 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 14.68it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 14.94it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 17.81it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 18.00it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 28.16it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 26.31it/s]


✅ Fact 4 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.82it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 19.28it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.47it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 19.95it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 38.30it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 37.89it/s]


✅ Fact 5 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 10.99it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 11.78it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00,  9.42it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 12.80it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 20.99it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 19.75it/s]


✅ Fact 6 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 12.80it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 13.57it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:01<00:00, 11.77it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:01<00:00, 14.93it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 29.30it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 20.87it/s]


✅ Fact 7 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 18.99it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.99it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 17.56it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.93it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 27.63it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 23.82it/s]


✅ Fact 8 Done
TEMPERATURE 1.0 Done


MCQ_KNOWLEDGE_TRUE | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 18.55it/s]
MCQ_KNOWLEDGE_TRUE | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 18.44it/s]
MCQ_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 17.52it/s]
MCQ_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 17.28it/s]
GENERATIVE_DISTINGUISH | direct_reference: 100%|██████████| 15/15 [00:00<00:00, 23.52it/s]
GENERATIVE_DISTINGUISH | indirect_reference: 100%|██████████| 15/15 [00:00<00:00, 29.82it/s]

✅ Fact 9 Done
TEMPERATURE 1.0 Done



