In [1]:
import logging
import builtins

# ─── Configure the root logger ───────────────────────────────────────────────
# Records at INFO and above are sent to the console and to 'qwen_app.log'.
logger = logging.getLogger()              # root logger
logger.setLevel(logging.INFO)

# Re-running this cell must not stack a second pair of handlers (that would
# duplicate every log line). Handlers installed by an earlier run carry the
# marker attribute set below, so detach and close those first.
for _stale_handler in [h for h in logger.handlers if getattr(h, "_qwen_app_handler", False)]:
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# Create console handler and file handler
console_handler = logging.StreamHandler()
file_handler    = logging.FileHandler('qwen_app.log', encoding='utf-8')

# Common log format
formatter = logging.Formatter(
    '%(asctime)s %(levelname)-8s %(name)s:%(lineno)d │ %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Configure, mark and attach both handlers
for _handler in (console_handler, file_handler):
    _handler.setLevel(logging.INFO)
    _handler.setFormatter(formatter)
    _handler._qwen_app_handler = True     # marker read by the cleanup loop above
    logger.addHandler(_handler)

# ─── Monkey-patch built-in print() ────────────────────────────────────────────
# If this cell is re-run, builtins.print is already the wrapper below; recover
# the genuine print from the wrapper instead of wrapping the wrapper.
_original_print = getattr(builtins.print, "_original_print", builtins.print)

def _logged_print(*args, **kwargs):
    """
    Replacement for print() that routes console output through the root logger.

    - With no explicit ``file`` (or ``file=None``), the arguments are joined with
      ``sep`` (``None`` means a single space, as in the built-in) and emitted via
      ``logging.getLogger().info``; ``end`` and ``flush`` are ignored because
      each call becomes one log record.
    - With an explicit ``file``, the call is forwarded unchanged to the original
      print so that writes aimed at a specific stream still reach it.
    """
    if kwargs.get('file') is not None:
        _original_print(*args, **kwargs)
        return

    sep = kwargs.get('sep')
    if sep is None:          # print() treats sep=None as the default separator
        sep = ' '
    msg = sep.join(str(a) for a in args)
    logging.getLogger().info(msg)
    # If you still want the original behavior on the console, uncomment:
    # _original_print(*args, **kwargs)

# Keep a handle on the real print so a later re-run (or a manual restore) can find it.
_logged_print._original_print = _original_print
builtins.print = _logged_print


In [2]:
import re
import sympy as sp
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.latex import parse_latex
# load from os.env
import os
# New function for statistical comparison:
import random
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from dotenv import load_dotenv
load_dotenv()
# Read the OpenAI API key from the OPENAI_API_KEY environment variable
# (load_dotenv() above also picks it up from a local .env file); never hardcode it here.
api_key= os.getenv("OPENAI_API_KEY")
import re
import sympy as sp
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.latex import parse_latex
import openai
from openai import OpenAI
import pandas as pd
import numpy as np
# Initialize the client using your API key.
client = OpenAI(api_key=api_key)

from sympy.calculus.util import function_range
from sympy import lambdify
from sklearn.metrics import mean_absolute_error


import numpy as np
import sympy as sp
from sympy.calculus.util import function_range
from sympy import lambdify
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

import numpy as np
import sympy as sp
from sympy import lambdify
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import pearsonr

def stats_test(expr1, expr2, samples=1000, default_range=(-10, 10), seed=None):
    """
    Numerically compare two SymPy expressions (or equation tuples) by random sampling.

    Equation handling (an "equation" is a 2-tuple ``(lhs, rhs)``):
      1) both are tuples    -> compare ``rhs - lhs`` of each;
      2) only one is tuple  -> take that tuple's rhs and compare it with the other input;
      3) neither is a tuple -> compare the inputs as given.

    Parameters
    ----------
    expr1, expr2 : SymPy expression or 2-tuple of SymPy expressions
    samples : int
        Number of random points drawn.
    default_range : (low, high)
        Every variable is sampled uniformly from this interval.
    seed : int or None
        ``None`` (default) draws from NumPy's global random state, as before.
        Any other value seeds a private ``np.random.default_rng`` so the same
        inputs always yield the same scores.

    Returns
    -------
    dict
        Always contains ``"similarity_score"`` in [0, 1]. Failure paths add an
        ``"error"`` message. The constant-only path reports ``"type"`` plus value
        or equality fields. The sampling path reports sample counts,
        ``"mean_absolute_error"``, ``"max_absolute_error"``, ``"correlation"``,
        ``"correlation_p_value"`` and ``"r2_score"``; the similarity score is the
        mean of |correlation|, max(0, R²) and 1/(1+MAE).
    """
    is_eq1 = isinstance(expr1, tuple) and len(expr1) == 2
    is_eq2 = isinstance(expr2, tuple) and len(expr2) == 2

    # 1. Reduce equations to plain expressions
    if is_eq1 and is_eq2:
        expr1 = expr1[1] - expr1[0]
        expr2 = expr2[1] - expr2[0]
    elif is_eq1:
        expr1 = expr1[1]
    elif is_eq2:
        expr2 = expr2[1]
    print("calculating stats for")
    print("expr1",expr1)
    print("expr2",expr2)

    # 2. Identify all variables, sorted for consistent lambdify order
    try:
        variables = sorted(expr1.free_symbols.union(expr2.free_symbols), key=lambda s: s.name)
    except AttributeError as e:
        # Reached e.g. when an input is None or a plain Python value
        return {
            "error": f"Could not extract free symbols. Input might not be a valid SymPy expression. Detail: {e}",
            "similarity_score": 0.0,
            "expr1_type": str(type(expr1)),
            "expr2_type": str(type(expr2)),
            "expr1": str(expr1),
            "expr2": str(expr2),
        }

    num_vars = len(variables)

    # 3. Handle pure-constant case
    if num_vars == 0:
        try:
            # Attempt numeric evaluation first
            v1 = float(expr1.evalf() if hasattr(expr1, 'evalf') else expr1)
            v2 = float(expr2.evalf() if hasattr(expr2, 'evalf') else expr2)
            mse = (v1 - v2)**2
            return {
                "type": "constant_comparison",
                "value1": v1,
                "value2": v2,
                "absolute_difference": abs(v1 - v2),
                "mse": mse,
                "are_equal": abs(v1 - v2) < 1e-10,
                "similarity_score": 1.0 / (1.0 + mse),
                "expr1": str(expr1),
                "expr2": str(expr2),
                "variables": []
            }
        except (TypeError, ValueError, AttributeError):
            # Fallback to symbolic comparison for non-numeric constants
            try:
                are_equal_symbolic = sp.simplify(expr1 - expr2) == 0
                return {
                    "type": "symbolic_constant_comparison",
                    "are_equal": are_equal_symbolic,
                    "similarity_score": 1.0 if are_equal_symbolic else 0.0,
                    "expr1": str(expr1),
                    "expr2": str(expr2),
                    "variables": []
                }
            except Exception as e_simp:
                # If simplification itself fails, treat as non-equivalent
                return {
                    "type": "constant_comparison_failed",
                    "error": f"Symbolic comparison failed: {e_simp}",
                    "similarity_score": 0.0,
                    "expr1": str(expr1),
                    "expr2": str(expr2),
                    "variables": []
                }

    # 4. Lambdify for multi-variable evaluation.
    # Extra name -> SymPy object mapping handed to lambdify after "numpy".
    func_mapping = {
        'cos': sp.cos,
        'sin': sp.sin,
        'tan': sp.tan,
        'log': sp.log,
        'exp': sp.exp,
        'Derivative': sp.Derivative,
        'Integral': sp.Integral,
        'E': sp.E,
        'e': sp.E,
        'S': sp.S,
        'I': sp.I,
        'pi': sp.pi,
        'sqrt': sp.sqrt,
        'Abs': sp.Abs,
    }
    try:
        f1 = lambdify(variables, expr1, modules=["numpy", func_mapping])
        f2 = lambdify(variables, expr2, modules=["numpy", func_mapping])
    except Exception as e:
        return {
            "error": f"Lambdify failed: {e}",
            "variables": [str(v) for v in variables],
            "similarity_score": 0.0,
            "expr1": str(expr1),
            "expr2": str(expr2),
        }

    # 5. Generate sample points: one row per sample, one column per variable
    try:
        low, high = default_range[0], default_range[1]
        if seed is None:
            # Legacy path: global NumPy random state (honours np.random.seed)
            sample_points = np.random.uniform(low, high, size=(samples, num_vars))
        else:
            sample_points = np.random.default_rng(seed).uniform(low, high, size=(samples, num_vars))
    except ValueError as e:
        return {
            "error": f"Failed to generate sample points (check default_range): {e}",
            "variables": [str(v) for v in variables],
            "similarity_score": 0.0,
            "expr1": str(expr1),
            "expr2": str(expr2),
        }

    # 6. Evaluate both functions at every sample point
    y1_list, y2_list = [], []
    evaluation_errors = 0
    for point in sample_points:
        try:
            # Complex results are reduced to their real part
            v1 = complex(f1(*point)).real
            v2 = complex(f2(*point)).real
        except (TypeError, ValueError, OverflowError, ZeroDivisionError, AttributeError, NameError, KeyError):
            # Numerical/evaluation failure at this point: count it and move on
            evaluation_errors += 1
            continue

        # NaN / Inf results are counted as evaluation errors too
        if np.isfinite(v1) and np.isfinite(v2):
            y1_list.append(v1)
            y2_list.append(v2)
        else:
            evaluation_errors += 1

    # Check if enough valid points were collected
    if len(y1_list) < 2:
        return {
            "valid_samples": len(y1_list),
            "total_samples": samples,
            "evaluation_errors": evaluation_errors,
            "error": "Insufficient valid data points for statistical analysis after evaluation",
            "variables": [str(v) for v in variables],
            "similarity_score": 0.0,
            "expr1": str(expr1),
            "expr2": str(expr2),
        }

    y1 = np.array(y1_list)
    y2 = np.array(y2_list)

    # 7. Compute statistical metrics
    results = {
        "valid_samples": len(y1),
        "total_samples": samples,
        "evaluation_errors": evaluation_errors,
        "variables": [str(v) for v in variables],
        "sample_range_per_variable": default_range,
        "expr1": str(expr1),
        "expr2": str(expr2),
    }

    # Basic error metrics
    try:
        results["mean_absolute_error"] = mean_absolute_error(y1, y2)
        results["max_absolute_error"] = np.max(np.abs(y1 - y2))
    except Exception as e:
        results["error_mae"] = f"MAE calculation failed: {e}"

    # Pearson correlation - only meaningful when both series actually vary
    std_y1 = np.std(y1)
    std_y2 = np.std(y2)
    if std_y1 > 1e-10 and std_y2 > 1e-10:
        try:
            corr, pval = pearsonr(y1, y2)
            # Non-finite results fall back to an allclose-based verdict
            results["correlation"] = corr if np.isfinite(corr) else 1.0 if np.allclose(y1,y2) else 0.0
            results["correlation_p_value"] = pval if np.isfinite(pval) else 0.0 if np.allclose(y1,y2) else 1.0
        except ValueError:
            # Should be prevented by the std check; kept as a safeguard
            results["correlation"] = 1.0 if np.allclose(y1, y2) else 0.0
            results["correlation_p_value"] = 0.0 if np.allclose(y1, y2) else 1.0
        except Exception as e:
            results["error_correlation"] = f"Correlation calculation failed: {e}"
            results["correlation"] = 0.0
            results["correlation_p_value"] = None
    else:
        # One or both series are constant: 1 only if both are constant and equal
        are_constant_and_equal = std_y1 <= 1e-10 and std_y2 <= 1e-10 and np.allclose(y1, y2)
        results["correlation"] = 1.0 if are_constant_and_equal else 0.0
        results["correlation_p_value"] = 0.0 if are_constant_and_equal else 1.0

    # R² score (undefined when y1 is constant, so that case is decided by hand)
    try:
        if std_y1 > 1e-10:
            results["r2_score"] = r2_score(y1, y2)
        else:
            results["r2_score"] = 1.0 if std_y2 <= 1e-10 and np.allclose(y1, y2) else 0.0
    except Exception as e:
        results["error_r2"] = f"R2 score calculation failed: {e}"
        results["r2_score"] = 0.0

    # 8. Aggregate similarity score in [0, 1] from the available components
    comps = []

    corr_val = results.get("correlation")
    if isinstance(corr_val, (float, int)) and np.isfinite(corr_val):
        comps.append(abs(corr_val))

    r2_val = results.get("r2_score")
    if isinstance(r2_val, (float, int)) and np.isfinite(r2_val):
        comps.append(max(0.0, r2_val))  # negative R² is clamped to 0

    mae_val = results.get("mean_absolute_error")
    if isinstance(mae_val, (float, int)) and np.isfinite(mae_val):
        comps.append(1.0 / (1.0 + mae_val))

    final_score = float(np.mean(comps)) if comps else 0.0

    # Guarantee a plain float inside [0, 1]
    results["similarity_score"] = max(0.0, min(1.0, float(final_score if np.isfinite(final_score) else 0.0)))

    return results


def llm_to_sympy(expr_str: str) -> sp.Expr:
    """
    Convert a mathematical expression string to a SymPy expression via a local LLM.

    The expression is sent to the ``qwen3:8b`` model through ``ollama.chat`` with
    instructions to answer as ``{"sympy_expr": "<expression>"}``. Anything up to a
    ``</think>`` tag in the reply is discarded, the ``sympy_expr`` value is
    extracted, and the value is parsed with ``sympy.parse_expr``.

    Parameters
    ----------
    expr_str : str
        The expression to convert.

    Returns
    -------
    The object produced by ``parse_expr`` for the extracted expression.

    Raises
    ------
    ValueError
        If no ``sympy_expr`` value can be found in the reply or it fails to parse.
        Errors raised by ``ollama.chat`` itself are not caught here.
    """
    import json
    import ollama

    # Names made available to parse_expr for standard functions and constants.
    local_dict = {
        'cos': sp.cos,
        'sin': sp.sin,
        'tan': sp.tan,
        'log': sp.log,
        'exp': sp.exp,
        'Derivative': sp.Derivative,
        'Integral': sp.Integral,
        'E': sp.E,
        'e': sp.E,
        'S': sp.S,
        'I': sp.I,
        'pi': sp.pi,
        'sqrt': sp.sqrt,
        'Abs': sp.Abs,
    }

    response = ollama.chat(
        model="qwen3:8b",
        messages=[
            {"role": "system", "content": """You are a SymPy expression converter that converts mathematical expressions to SymPy expressions. 
You must always respond in JSON format as:
{ "sympy_expr": "<the SymPy expression here>" }

Rules:
- Output ONLY a single JSON object, nothing else.
- The 'sympy_expr' value must be a single-line SymPy expression that can be parsed by 'sympy.parse_expr'.
- No explanations, no text outside the JSON.
- If you include anything outside the JSON format, it will cause an error."""},
            {"role": "user", "content": f"""Convert the following expression into SymPy format and output only JSON:
{expr_str} and Make sure the expression you place in sympy_expr parses with sympy.parse_expr; otherwise regenerate it.
"""},
        ],
    )

    output = response["message"]["content"]
    # Remove everything up to and including the </think> tag
    if "</think>" in output:
        output = output.split("</think>")[-1].strip()

    sympy_text = None

    # Preferred path: decode the reply as real JSON. This keeps expressions that
    # contain quote characters intact (e.g. Symbol('x')), which the regex
    # fallback below would cut off at the first inner quote.
    brace_start, brace_end = output.find("{"), output.rfind("}")
    if 0 <= brace_start < brace_end:
        try:
            payload = json.loads(output[brace_start:brace_end + 1])
        except ValueError:      # json.JSONDecodeError is a ValueError
            payload = None
        if isinstance(payload, dict) and isinstance(payload.get("sympy_expr"), str):
            sympy_text = payload["sympy_expr"].strip()

    # Fallback: tolerant regex for almost-JSON replies (e.g. single-quoted keys).
    if not sympy_text:
        sympy_expr_re = re.compile(
            r"""
            \{                    # opening brace
            [^{}]*?               # anything (non-greedy) until the key
            ['"]sympy_expr['"]    # the key, in single or double quotes
            \s*:\s*               # colon with optional spaces
            ['"]                  # opening quote for the value
            (?P<expr>[^'"]+?)     # <- the expression itself (capture group)
            ['"]                  # closing quote
            [^{}]*?               # anything else until
            \}                    # closing brace
            """,
            re.VERBOSE | re.DOTALL | re.UNICODE,
        )
        match = sympy_expr_re.search(output)
        sympy_text = match.group("expr").strip() if match else None

    if not sympy_text:
        raise ValueError(f"Error parsing LLM output:\nOutput: {output}\nno 'sympy_expr' value found")

    # NOTE(security): parse_expr evaluates the string with eval() internally and
    # the string comes from model output; only run this against a trusted model.
    try:
        return parse_expr(sympy_text, local_dict=local_dict)
    except Exception as e:
        raise ValueError(f"Error parsing LLM output:\nOutput: {output}\n{e}")

def preprocess_expression(expr_str: str):
    """
    Turn a raw answer string into SymPy form via llm_to_sympy.

    Returns a ``(lhs, rhs)`` tuple when the stripped string contains an "="
    (split at the first "="), otherwise the single converted expression.
    """
    text = expr_str.strip()

    # No equals sign: the whole string is one expression
    if '=' not in text:
        return llm_to_sympy(text)

    # Split at the first "=" only; each side is converted independently
    lhs_text, _, rhs_text = text.partition('=')
    lhs = llm_to_sympy(lhs_text.strip())
    rhs = llm_to_sympy(rhs_text.strip())
    return (lhs, rhs)

def compare_expressions(expr1, expr2) -> bool:
    """
    Decide symbolically whether two parsed inputs are equivalent.

    Operand selection (a tuple is treated as ``(lhs, rhs)``):
      1) both tuples   -> ``rhs - lhs`` of each
      2) one tuple     -> that tuple's rhs versus the other input
      3) neither tuple -> the inputs themselves

    The operands are equivalent when ``sp.simplify(a - b) == 0``; if that raises
    TypeError, their string forms are compared instead.
    """
    first_is_eq = isinstance(expr1, tuple)
    second_is_eq = isinstance(expr2, tuple)

    if first_is_eq and second_is_eq:
        left = expr1[1] - expr1[0]
        right = expr2[1] - expr2[0]
    elif first_is_eq:
        left, right = expr1[1], expr2
    elif second_is_eq:
        left, right = expr1, expr2[1]
    else:
        left, right = expr1, expr2

    try:
        return sp.simplify(left - right) == 0
    except TypeError:
        # Operands that cannot be subtracted/simplified: fall back to text equality
        return str(left) == str(right)

def compare_pipeline(expr_str1: str, expr_str2: str) -> tuple:
    """
    Parse two answer strings and test them for symbolic equivalence.

    Steps:
      1. preprocess_expression converts each string (splitting on '=' if present).
      2. The parsed results are printed.
      3. compare_expressions decides equivalence.

    Returns
    -------
    tuple
        ``(equivalent, parsed_expr1, parsed_expr2)``. If either string fails to
        parse with a ValueError, the error is printed and ``(False, None, None)``
        is returned. Other exception types propagate to the caller.
    """
    try:
        parsed_expr1 = preprocess_expression(expr_str1)
        parsed_expr2 = preprocess_expression(expr_str2)
    except ValueError as e:
        print("Parsing error:", e)
        return False, None, None

    # Each result is independently either an (lhs, rhs) tuple or a single
    # expression; compare_expressions handles every combination.
    print("Parsed Expression 1:", parsed_expr1)
    print("Parsed Expression 2:", parsed_expr2)
    equivalent = compare_expressions(parsed_expr1, parsed_expr2)
    return equivalent, parsed_expr1, parsed_expr2




# Load the answers sheet; everything is kept as text and empty cells stay ""
df = pd.read_csv('together_deepseek - Sheet3.csv',dtype=str,keep_default_na=False)

# Model answer columns that are scored against the 'answer' (ground truth) column
columns_answer = ["deepseek-ai/DeepSeek-V3_answer",'deepseek-ai/DeepSeek-R1_answer',"Qwen/Qwen2.5-72B-Instruct-Turbo_answer","Qwen/QwQ-32B_answer"]
all_results = {}     # row index -> per-row results (or {"error": ...} for a failed row)
column_scores = {}   # column -> similarity scores collected over all rows

for idx,row in df.iterrows():
    row_results = {}
    try:
        ground_truth = str(row['answer'])

        for col in columns_answer:
            column_expression = str(row[col])
            try:
                result,parsed_expr1,parsed_expr2 = compare_pipeline(column_expression, ground_truth)
                if result:
                    # Symbolically equivalent: no need for the sampling comparison
                    stats = {"similarity_score": 1.0, "is_equivalent": True}
                else:
                    stats = stats_test(parsed_expr1, parsed_expr2)
                print("stats",stats)
            except Exception as e:
                print(f"Error processing row {idx}, column {col}: {e}")
                stats = {"error": str(e), "similarity_score": 0.0}

            row_results[col] = stats
            # Record the score for every outcome. Previously only failures were
            # recorded, so the per-column averages below were always 0.0.
            column_scores.setdefault(col, []).append(stats.get("similarity_score", 0.0))

        # Rank columns for this row based on similarity score (higher is better)
        row_rankings = sorted(
            [(col, res["similarity_score"]) for col, res in row_results.items()],
            key=lambda x: x[1],
            reverse=True
        )
        # Store results for this row
        all_results[idx] = {
            "id":row["Sub Task Id"],
            "ground_truth": ground_truth,
            "column_results": row_results,
            "rankings": row_rankings
        }

    except Exception as e:
        # Whole-row failure (e.g. a missing column): keep only the error message
        print(f"Error processing row {idx}: {e}")
        all_results[idx] = {"error": str(e)}


avg_scores = {col: np.mean(scores) for col, scores in column_scores.items()}
# Rank columns based on average similarity score (higher is better)
overall_rankings = sorted(
    [(col, score) for col, score in avg_scores.items()],
    key=lambda x: x[1],
    reverse=True
)
import json
# save to json file
with open("qwen3.json","w") as f:
    json.dump(all_results , f)


2025-04-30 15:55:39 INFO     httpx:1025 │ HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-04-30 15:56:02 INFO     httpx:1025 │ HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-04-30 15:56:02 INFO     root:38 │ Parsed Expression 1: 24
2025-04-30 15:56:02 INFO     root:38 │ Parsed Expression 2: 1024
2025-04-30 15:56:02 INFO     root:38 │ calculating stats for
2025-04-30 15:56:02 INFO     root:38 │ expr1 24
2025-04-30 15:56:02 INFO     root:38 │ expr2 1024
2025-04-30 15:56:02 INFO     root:38 │ stats {'type': 'constant_comparison', 'value1': 24.0, 'value2': 1024.0, 'absolute_difference': 1000.0, 'mse': 1000000.0, 'are_equal': False, 'similarity_score': 9.99999000001e-07, 'expr1': '24', 'expr2': '1024', 'variables': []}
2025-04-30 15:56:16 INFO     httpx:1025 │ HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-04-30 15:56:33 INFO     httpx:1025 │ HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2

In [3]:
import json
import pandas as pd

# ——— Config ———
INPUT_CSV = 'together_deepseek - Sheet3.csv'
RESULTS_JSON = 'qwen3.json'
OUTPUT_CSV = 'together_with_scores.csv'

# Answer columns to annotate; must match the columns that were evaluated
MODEL_COLUMNS = [
    "deepseek-ai/DeepSeek-V3_answer",
    "deepseek-ai/DeepSeek-R1_answer",
    "Qwen/Qwen2.5-72B-Instruct-Turbo_answer",
    "Qwen/QwQ-32B_answer"
]

# ——— Load data ———
df = pd.read_csv(INPUT_CSV, dtype=str, keep_default_na=False)
with open(RESULTS_JSON, 'r') as f:
    results = json.load(f)

# ——— Inject similarity columns ———
for row_idx in df.index:
    # JSON object keys are strings, so look the row up by str(index);
    # rows without a stored result simply get empty values.
    entry = results.get(str(row_idx), {})
    per_model = entry.get('column_results', {})

    # One "<column>_sim" cell per model: its similarity_score, or None if absent
    for model_col in MODEL_COLUMNS:
        df.at[row_idx, f'{model_col}_sim'] = (
            per_model[model_col].get('similarity_score') if model_col in per_model else None
        )

    # Ranking string built from the stored (model, score) pairs, best first
    df.at[row_idx, 'Rank'] = ' > '.join(model for model, _ in entry.get('rankings', []))

# ——— Save ———
df.to_csv(OUTPUT_CSV, index=False)
print(f"Wrote augmented table with similarity scores and ranks to {OUTPUT_CSV}")


2025-04-30 18:44:34 INFO     root:38 │ Wrote augmented table with similarity scores and ranks to together_with_scores.csv


### using the csv

Ground Truth: x = -1
Column Expression: -1
Comparing expressions
Error processing row 2, column deepseek-ai/DeepSeek-V3_answer: model "deepseek-r1:7b" not found, try pulling it first (status code: 404)
Column Expression: -1
Comparing expressions
Error processing row 2, column deepseek-ai/DeepSeek-R1_answer: model "deepseek-r1:7b" not found, try pulling it first (status code: 404)
Column Expression: \pm \sqrt{3}
Comparing expressions
Error processing row 2, column Qwen/Qwen2.5-72B-Instruct-Turbo_answer: model "deepseek-r1:7b" not found, try pulling it first (status code: 404)
Column Expression: -1
Comparing expressions
Error processing row 2, column Qwen/QwQ-32B_answer: model "deepseek-r1:7b" not found, try pulling it first (status code: 404)
Ground Truth: a + b + c = 0  
Column Expression: a + b + c = 0
Comparing expressions
Error processing row 3, column deepseek-ai/DeepSeek-V3_answer: model "deepseek-r1:7b" not found, try pulling it first (status code: 404)
Column Expression: a + b +

[['deepseek-ai/DeepSeek-V3_answer', 0.0],
 ['deepseek-ai/DeepSeek-R1_answer', 0.0],
 ['Qwen/Qwen2.5-72B-Instruct-Turbo_answer', 0.0],
 ['Qwen/QwQ-32B_answer', 0.0]]

In [6]:
import json
import numpy as np
import pandas as pd

# --- 1. Load your JSON results file ---
with open("qwen3.json", "r") as f:
    data = json.load(f)

# --- 2. Build a DataFrame of similarity scores ---
# Rows that failed wholesale during evaluation are stored as {"error": ...}
# without "column_results"; they carry no scores, so leave them out instead of
# crashing with a KeyError. Remaining question indices are sorted numerically.
q_idxs = sorted(
    (q for q, entry in data.items() if "column_results" in entry),
    key=lambda x: int(x),
)
if not q_idxs:
    raise ValueError("qwen3.json contains no entries with 'column_results'")

# assume every scored entry has identical model keys
model_names = list(data[q_idxs[0]]["column_results"].keys())

# extract a 2D array: rows=questions, cols=models
scores = np.array([
    [ data[q]["column_results"][m]["similarity_score"] for m in model_names ]
    for q in q_idxs
])

df = pd.DataFrame(scores, columns=model_names)

# --- 3. Compute each model’s overall mean similarity ---
means = df.mean(axis=0)

# --- 4. Bootstrap 95% CIs on the mean (percentile method) ---
rng = np.random.default_rng(0)   # fixed seed so the intervals are reproducible
B = 1000                         # number of bootstrap resamples
n, J = df.shape

boot_means = np.empty((B, J))
for b in range(B):
    # Resample questions with replacement and record each model's mean
    idx = rng.choice(n, size=n, replace=True)
    boot_means[b] = df.iloc[idx].mean(axis=0)

ci_lower = np.percentile(boot_means, 2.5, axis=0)
ci_upper = np.percentile(boot_means, 97.5, axis=0)

# --- 5. Assemble and display the leaderboard ---
leaderboard = pd.DataFrame({
    "model":       model_names,
    "mean_score":  means.values,
    "ci_lower":    ci_lower,
    "ci_upper":    ci_upper
}).sort_values("mean_score", ascending=False).reset_index(drop=True)

print(leaderboard.to_markdown(index=False, floatfmt=".4f"))


| model                                  |   mean_score |   ci_lower |   ci_upper |
|:---------------------------------------|-------------:|-----------:|-----------:|
| deepseek-ai/DeepSeek-V3_answer         |       0.8266 |     0.6250 |     0.9974 |
| Qwen/QwQ-32B_answer                    |       0.7490 |     0.5006 |     0.9375 |
| deepseek-ai/DeepSeek-R1_answer         |       0.6865 |     0.4977 |     0.8753 |
| Qwen/Qwen2.5-72B-Instruct-Turbo_answer |       0.4823 |     0.2547 |     0.7144 |


In [8]:
import json
import numpy as np
import pandas as pd

# --- 1. Load your JSON results file ---
with open("qwen3.json", "r") as f:
    data = json.load(f)

# --- 2. Gather question IDs and model names ---
# Entries saved as {"error": ...} (whole-row failures) have no "column_results"
# and no scores; skip them so the score extraction below cannot hit a KeyError.
q_ids = sorted(
    (q for q, entry in data.items() if "column_results" in entry),
    key=lambda x: int(x),
)
if not q_ids:
    raise ValueError("qwen3.json contains no entries with 'column_results'")
# Model keys are taken from the first scored question
model_names = list(data[q_ids[0]]["column_results"].keys())

# --- 3. Extract ground-truth answers and detect numeric vs equation ---
def is_number(s: str) -> bool:
    """Tell whether ``s`` is accepted by ``float()`` (used to flag numeric questions)."""
    try:
        float(s)
    except Exception:
        # Anything float() rejects counts as non-numeric
        return False
    else:
        return True

# Ground-truth text per question, plus a boolean mask marking the numeric ones
ground_truths = []
for q in q_ids:
    ground_truths.append(data[q]["ground_truth"])
is_numeric = np.array(list(map(is_number, ground_truths)))

# --- 4. Build a DataFrame of similarity scores (N×J) ---
# One row per question (in q_ids order), one column per model
score_rows = []
for q in q_ids:
    per_model = data[q]["column_results"]
    score_rows.append([per_model[m]["similarity_score"] for m in model_names])
scores = np.array(score_rows)
df_scores = pd.DataFrame(scores, columns=model_names)

# --- 5. Leaderboard + CI helper ---
def compute_leaderboard(sub_df, model_names, B=10, seed=0):
    """
    Build a leaderboard of column means with bootstrap percentile intervals.

    Parameters
    ----------
    sub_df : DataFrame of scores, one column per model.
    model_names : labels written to the "model" column, in sub_df column order.
    B : number of bootstrap resamples of the rows (with replacement).
    seed : seed for the NumPy random generator used for resampling.

    Returns
    -------
    DataFrame with columns "model", "mean", "ci_low" (2.5th percentile of the
    resampled means) and "ci_high" (97.5th percentile), sorted by "mean"
    descending with a fresh 0..k-1 index.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(sub_df)
    point_estimates = sub_df.mean(axis=0).values

    # One row of per-model means for every bootstrap draw
    resampled_means = np.empty((B, len(model_names)))
    for draw in range(B):
        picked_rows = generator.choice(n_rows, n_rows, replace=True)
        resampled_means[draw] = sub_df.iloc[picked_rows].mean(axis=0).values

    lower = np.percentile(resampled_means, 2.5, axis=0)
    upper = np.percentile(resampled_means, 97.5, axis=0)

    table = pd.DataFrame({
        "model":    model_names,
        "mean":     point_estimates,
        "ci_low":   lower,
        "ci_high":  upper
    })
    table = table.sort_values("mean", ascending=False)
    return table.reset_index(drop=True)

# --- 6. Compute the three leaderboards ---
# Numeric-answer questions, the remaining (non-numeric) questions, and all questions
lb_numeric, lb_equation, lb_overall = [
    compute_leaderboard(subset, model_names)
    for subset in (df_scores[is_numeric], df_scores[~is_numeric], df_scores)
]

# --- 7. Print them out ---
for heading, board in (
    ("### Numeric‐only questions\n", lb_numeric),
    ("\n### Equation‐only questions\n", lb_equation),
    ("\n### Overall (macro‐average)\n", lb_overall),
):
    print(heading)
    print(board.to_markdown(index=False, floatfmt=".4f"))


### Numeric‐only questions

| model                                  |   mean |   ci_low |   ci_high |
|:---------------------------------------|-------:|---------:|----------:|
| deepseek-ai/DeepSeek-V3_answer         | 0.7077 |   0.5092 |    1.0000 |
| deepseek-ai/DeepSeek-R1_answer         | 0.6675 |   0.5002 |    1.0000 |
| Qwen/QwQ-32B_answer                    | 0.6675 |   0.5002 |    1.0000 |
| Qwen/Qwen2.5-72B-Instruct-Turbo_answer | 0.5342 |   0.1907 |    0.9700 |

### Equation‐only questions

| model                                  |   mean |   ci_low |   ci_high |
|:---------------------------------------|-------:|---------:|----------:|
| deepseek-ai/DeepSeek-V3_answer         | 0.8979 |   0.7215 |    0.9995 |
| Qwen/QwQ-32B_answer                    | 0.7979 |   0.7000 |    0.9979 |
| deepseek-ai/DeepSeek-R1_answer         | 0.6979 |   0.1900 |    0.8974 |
| Qwen/Qwen2.5-72B-Instruct-Turbo_answer | 0.4512 |   0.2045 |    0.5627 |

### Overall (macro‐average)

| model     