## MATH-500

In [18]:
import os
import csv
import time
import pandas as pd
from dotenv import load_dotenv
import dashscope

# Read the DashScope credential from the local .env file into the process
# environment, then pick it up for use by the API helper functions below.
load_dotenv("dashscope_api_key.env")
api_key = os.environ.get("DASHSCOPE_API_KEY")
if api_key is None or api_key == "":
    # Only warn here; the notebook keeps running with no key set.
    print("❌ DASHSCOPE_API_KEY not found!")

def call_model(
    prompt: str,
    model_name: str,
    system_prompt: str = (
        "You are a helpful assistant. "
        "Please show your reasoning step by step (Chain of Thought). "
        "Then, on a new line at the end, write: 'Final Answer: <the result>'."
    ),
    temperature: float = 0.7,
    top_p: float = 0.9,
    max_retries: int = 3,
    retry_delay: float = 0.3
) -> tuple[str, dict]:
    """
    Query the answering model once and return its reply plus token usage.

    Sends a system + user message pair through ``dashscope.Generation.call``
    and retries up to ``max_retries`` times when the response reports status
    code 429 or does not have the expected ``output.choices[0].message``
    structure.

    Args:
        prompt: The user message (the question to solve).
        model_name: Model identifier passed to DashScope.
        system_prompt: System message prepended to the conversation.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling parameter forwarded to the API.
        max_retries: Maximum number of API calls to attempt.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        ``(content, usage)`` on success, where ``usage`` is always a dict
        (possibly empty). ``("", {})`` if every attempt failed.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]
    for attempt in range(max_retries):
        # Sleeping after the final attempt would only delay the failure.
        is_last_attempt = attempt == max_retries - 1

        response = dashscope.Generation.call(
            api_key=api_key,
            model=model_name,
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            result_format="message"
        )
        status_code = response.get("status_code", None)
        if status_code == 429:
            if is_last_attempt:
                print(f"⏳ [Attempt {attempt+1}/{max_retries}] Rate limit! No retries left.")
            else:
                print(f"⏳ [Attempt {attempt+1}/{max_retries}] Rate limit! Waiting {retry_delay}s...")
                time.sleep(retry_delay)
            continue
        try:
            content = response["output"]["choices"][0]["message"]["content"]
        except (TypeError, KeyError, IndexError):
            print(f"⚠️ [Attempt {attempt+1}/{max_retries}] Unexpected structure:\n{response}")
            if not is_last_attempt:
                time.sleep(retry_delay)
            continue

        # A present-but-null "usage" field would otherwise come back as None
        # and break callers that do usage.get(...).
        usage = response.get("usage") or {}
        return content, usage

    print("❌ All attempts failed, returning empty result.")
    return "", {}


def judge_answers(
    responses_texts: list[str],
    reference_answer: str,
    judge_model: str,
    system_prompt: str = (
        "You are a strict and precise mathematical judge.\n\n"
        "You are given 5 responses generated by a model, and the correct reference answer (in LaTeX).\n\n"
        "Your task is to:\n"
        "1. Determine the majority answer — the most frequently occurring final answer across the responses. "
        "   - If there is a tie (i.e., two or more answers occur with the same highest frequency), select the one that appears first.\n"
        "   - If all 5 responses are different (i.e., no answer occurs more than once), select the final answer from the first response.\n"
        "2. Compare this majority answer to the reference answer.\n"
        "3. If the majority answer matches the reference answer, then Correct = True; otherwise, False.\n"
        "4. Identify the indices (starting from 1) of all responses that produced the majority answer (regardless of whether it's correct).\n\n"
        "Respond in **exactly** this format:\n"
        "Correct: True or False\n"
        "SelectedIndices: <comma separated indices, e.g., 1,2,3>\n\n"
        "Respond with exactly those two lines. Do not include any extra explanation or reasoning."
    ),
    temperature: float = 0.0,
    top_p: float = 1.0,
    max_retries: int = 3,
    retry_delay: float = 0.3
) -> dict:
    """
    Ask the judge model whether the majority answer matches the reference.

    Builds a prompt containing the reference answer and every response, sends
    it through ``dashscope.Generation.call``, and parses the two-line reply
    (``Correct: ...`` / ``SelectedIndices: ...``). The call is retried up to
    ``max_retries`` times when the response structure or reply format is not
    as expected.

    Args:
        responses_texts: Model responses to be judged, in order.
        reference_answer: Reference answer shown to the judge.
        judge_model: Model identifier passed to DashScope.
        system_prompt: System message describing the judging task.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling parameter forwarded to the API.
        max_retries: Maximum number of API calls to attempt.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        A dict with keys:
          - ``"correct"`` (bool): the judge's verdict.
          - ``"confidence"`` (float): fraction of the responses whose index
            the judge selected (0.0 when there are no responses).
          - ``"selected_indices"`` (list[int]): 1-based, de-duplicated indices
            within ``1..len(responses_texts)``, in the order the judge gave.
        If all attempts fail: ``correct=False``, ``confidence=0.0`` and an
        empty index list.
    """
    import re

    num_responses = len(responses_texts)

    # The requested format here must agree with the system prompt, which asks
    # for exactly two lines (no "Confidence:" line).
    prompt_parts = [
        f"Reference answer (LaTeX): {reference_answer}\n\n",
        f"Below are {num_responses} responses from the model:\n",
    ]
    prompt_parts.extend(
        f"\n--- Response {i} ---\n{text}\n"
        for i, text in enumerate(responses_texts, start=1)
    )
    prompt_parts.append(
        "\nQuestion: Are the majority of these responses correct?\n"
        "Respond in the format:\n"
        "Correct: True or False\n"
        "SelectedIndices: <comma separated indices (starting from 1)>\n"
    )
    user_prompt = "".join(prompt_parts)

    for attempt in range(max_retries):
        # Sleeping after the final attempt would only delay the failure.
        is_last_attempt = attempt == max_retries - 1

        response = dashscope.Generation.call(
            api_key=api_key,
            model=judge_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            top_p=top_p,
            result_format="message"
        )
        try:
            text = response["output"]["choices"][0]["message"]["content"]
        except (TypeError, KeyError, IndexError):
            text = None

        if not isinstance(text, str):
            # Missing/non-text content cannot be parsed; log it and retry.
            print(f"⚠️ [Attempt {attempt+1}/{max_retries}] Unexpected structure:\n{response}")
            if not is_last_attempt:
                time.sleep(retry_delay)
            continue

        print(f"Judge Text:{text}")

        match = re.search(r"Correct:\s*(True|False)", text, flags=re.IGNORECASE)
        sel_match = re.search(r"SelectedIndices:\s*([\d,\s]+)", text)

        if match and sel_match:
            is_correct = match.group(1).strip().lower() == "true"

            # Accept comma- or whitespace-separated indices; keep only valid
            # 1-based positions and drop repeats so confidence stays in [0, 1].
            selected_indices = []
            for token in re.findall(r"\d+", sel_match.group(1)):
                index = int(token)
                if 1 <= index <= num_responses and index not in selected_indices:
                    selected_indices.append(index)

            confidence = len(selected_indices) / num_responses if num_responses else 0.0
            return {"correct": is_correct, "confidence": confidence, "selected_indices": selected_indices}

        # Reply did not follow the two-line format; pause before asking again.
        if not is_last_attempt:
            time.sleep(retry_delay)

    print("❌ judge_answers: failed all retries.")
    return {"correct": False, "confidence": 0.0, "selected_indices": []}


# Load the MATH-500 dataset (CSV file with fields: unique_id, subject, level, problem, solution, answer)
data_path = "dataset/MATH-500/math500_processed.csv"
df = pd.read_csv(data_path)
num_samples = 20  # Set to -1 to use the full dataset; set to a positive number to limit sample size
subset = df if num_samples == -1 else df.head(num_samples)

# List of initial models to evaluate (used to generate responses)
models_to_test = ["qwen2.5-math-1.5b-instruct"]

# The second (judge) model believed to be more reliable
judge_model = "deepseek-v3"  # Replace this with the actual judge model name

# Global parameters
temperature = 0.7
top_p = 0.9
samples_per_question = 5

# Short labels used in result file names; models not listed fall back to
# their full name with "/" replaced by "_".
model_short_names = {"qwen2.5-math-1.5b-instruct": "qwen"}
dataset_name = "math500"

for model_name in models_to_test:
    print(f"\n\n==================== Evaluating Model: {model_name} ====================")
    short_name = model_short_names.get(model_name, model_name.replace("/", "_"))
    save_path = f"results/self_consistency_{dataset_name}_{short_name}.csv"

    # The append-mode open below fails if the output directory is missing.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Resume mode: check if result file exists and read already processed indices
    done_indices = set()
    if os.path.exists(save_path):
        try:
            existing_df = pd.read_csv(save_path)
            done_indices = set(existing_df["index"].tolist())
            print(f"🔁 Resuming from existing result file: {save_path}")
        except Exception as e:
            # Best effort: an unreadable file just means nothing is skipped.
            print(f"⚠️ Failed to read existing result file: {e}")

    # Iterate through each question in the dataset
    for idx, row in subset.iterrows():
        if idx in done_indices:
            continue

        question = row["problem"]
        gold_answer = row["answer"]

        print(f"\n=== Evaluating question at index {idx} ===")
        print("Question:", question)

        # Collect multiple responses from the model along with usage info
        responses_texts = []
        usages = []
        for sample_i in range(samples_per_question):
            try:
                output, usage = call_model(
                    prompt=question,
                    model_name=model_name,
                    temperature=temperature,
                    top_p=top_p
                )
            except Exception as e:
                print(f"[Error in call_model] {e}")
                output = ""
                usage = {}
            responses_texts.append(output)
            usages.append(usage)

        # Use the judging model to determine correctness and get selected response indices
        judgement = judge_answers(
            responses_texts=responses_texts,
            reference_answer=gold_answer,
            judge_model=judge_model,
            temperature=0.0,
            top_p=1.0
        )
        is_correct = judgement["correct"]

        # Keep only valid 1-based indices; index 0 would otherwise wrap
        # around to the last response via responses_texts[-1].
        selected_indices = [
            sel_idx
            for sel_idx in judgement.get("selected_indices", [])
            if 1 <= sel_idx <= len(responses_texts)
        ]

        # Confidence = share of sampled responses that agree with the majority answer
        confidence = len(selected_indices) / len(responses_texts) if responses_texts else 0.0

        # Compute the average length (in characters) of the selected responses
        selected_responses = [responses_texts[sel_idx - 1] for sel_idx in selected_indices]
        if selected_responses:
            response_length = int(
                sum(len(resp) for resp in selected_responses) / len(selected_responses)
            )
        else:
            response_length = 0

        # Compute average token usage across all responses
        # (each usage dict may use either of two key spellings per field)
        if usages:
            n = len(usages)
            completion_tokens = int(
                sum(u.get("output_tokens", u.get("completion_tokens", 0)) for u in usages) / n
            )
            prompt_tokens = int(
                sum(u.get("input_tokens", u.get("prompt_tokens", 0)) for u in usages) / n
            )
            total_tokens = int(sum(u.get("total_tokens", 0) for u in usages) / n)
        else:
            completion_tokens = 0
            prompt_tokens = 0
            total_tokens = 0

        # Create result dictionary for this example
        result_row = {
            "index": idx,
            "gold_answer": gold_answer,
            "correct": is_correct,
            "response_length": response_length,
            "confidence": confidence,
            "completion_tokens": completion_tokens,
            "prompt_tokens": prompt_tokens,
            "total_tokens": total_tokens,
        }

        # Write the result immediately to the CSV file (supports resuming).
        # A missing or zero-byte file still needs the header row.
        write_header = not os.path.exists(save_path) or os.path.getsize(save_path) == 0
        with open(save_path, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=result_row.keys())
            if write_header:
                writer.writeheader()
            writer.writerow(result_row)

        print(f"✅ Saved result for index {idx}: {'Correct' if is_correct else 'Incorrect'}")



🔁 Resuming from existing result file: results/self_consistency_math500_qwen.csv

=== Evaluating question at index 2 ===
Question: If $f(x) = \frac{3x-2}{x-2}$, what is the value of $f(-2) +f(-1)+f(0)$? Express your answer as a common fraction.


KeyboardInterrupt: 