In [89]:
import re
import sympy as sp
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.latex import parse_latex
# load from os.env
import os
# New function for statistical comparison:
import random
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from dotenv import load_dotenv
load_dotenv()
# Set the OpenAI API key from environment variable
# or directly set it here
api_key= os.getenv("OPENAI_API_KEY")
import re
import sympy as sp
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.latex import parse_latex
import openai
from openai import OpenAI

# Initialize the client using your API key.
client = OpenAI(api_key=api_key)

from sympy.calculus.util import function_range
from sympy import lambdify
from sklearn.metrics import mean_absolute_error

def stats_test(expr1, expr2, var, 
               samples=100, default_range=(-10,10)):
    # 1. Determine sampling interval
    try:
        domain = function_range(expr1 - expr2, var, sp.S.Reals)
        a, b = float(domain.start), float(domain.end)
    except:
        a, b = default_range

    try:
        domain = function_range(expr1 - expr2, var, sp.S.Reals)
        a, b = float(domain.start), float(domain.end)
        # Ensure a reasonable range size
        if b - a < 1e-10:
            a, b = default_range
    except:
        a, b = default_range
    

    # Check if both expressions are constants
    is_const1 = len(expr1.free_symbols) == 0
    is_const2 = len(expr2.free_symbols) == 0
    
    # If both are constants, perform direct numerical comparison
    if is_const1 and is_const2:
        val1 = float(expr1)
        val2 = float(expr2)
        abs_diff = abs(val1 - val2)
        mse = abs_diff ** 2
        
        # Create result dictionary for constant case
        results = {
            "type": "constant_comparison",
            "value1": val1,
            "value2": val2,
            "absolute_difference": abs_diff,
            "mse": mse,
            "are_equal": abs_diff < 1e-10,  # Numerical equality with tolerance
            "similarity_score": 1.0 / (1.0 + mse),  # Converts MSE to similarity (0-1 range)
            "expr1": str(expr1),
            "expr2": str(expr2)
        }
        return results
    
    # Handle case where only one expression is constant
    if is_const1:
        expr1 = sp.Lambda(var, expr1)  # Convert to a constant function
    if is_const2:
        expr2 = sp.Lambda(var, expr2)  # Convert to a constant function
    
    # 1. Determine sampling interval
    try:
        domain = function_range(expr1 - expr2, var, sp.S.Reals)
        a, b = float(domain.start), float(domain.end)
        # Ensure a reasonable range size
        if b - a < 1e-10:
            a, b = default_range
    except:
        a, b = default_range
    
    # 2. Create numerical functions
    f1 = lambdify(var, expr1, "numpy")
    f2 = lambdify(var, expr2, "numpy")
    
    # 3. Generate sample points for x
    x_values = np.linspace(a, b, samples)
    
    # 4. Calculate function values, handling potential errors
    y1_values = []
    y2_values = []
    valid_x = []
    
    for x in x_values:
        try:
            y1 = float(f1(x))
            y2 = float(f2(x))
            # Only include points where both functions return finite values
            if (not np.isnan(y1) and not np.isnan(y2) and 
                not np.isinf(y1) and not np.isinf(y2)):
                y1_values.append(y1)
                y2_values.append(y2)
                valid_x.append(x)
        except:
            continue
    
    # Convert to numpy arrays for analysis
    x_array = np.array(valid_x)
    y1_array = np.array(y1_values)
    y2_array = np.array(y2_values)
    
    # Skip analysis if not enough valid points
    if len(valid_x) < 2:
        return {
            "valid_samples": len(valid_x),
            "error": "Insufficient valid data points for statistical analysis"
        }
    
    # 5. Calculate statistical metrics
    results = {}
    
    # Number of valid samples
    results["valid_samples"] = len(valid_x)
    
    # Calculate absolute and relative differences
    abs_diff = np.abs(y1_array - y2_array)
    results["mean_absolute_error"] = np.mean(abs_diff)
    results["max_absolute_error"] = np.max(abs_diff)
    
    # Handle division by zero in relative error calculation
    non_zero_y1 = y1_array != 0
    if np.any(non_zero_y1):
        rel_diff = abs_diff[non_zero_y1] / np.abs(y1_array[non_zero_y1])
        results["mean_relative_error"] = np.mean(rel_diff)
        results["max_relative_error"] = np.max(rel_diff)
    
    # Correlation analysis
    try:
        correlation, p_value = pearsonr(y1_array, y2_array)
        results["correlation"] = correlation
        results["correlation_p_value"] = p_value
    except:
        results["correlation"] = "Error calculating correlation"
    
    # Linear regression analysis
    try:
        model = LinearRegression()
        X = y1_array.reshape(-1, 1)
        y = y2_array
        model.fit(X, y)
        predictions = model.predict(X)
        
        results["regression_slope"] = model.coef_[0]
        results["regression_intercept"] = model.intercept_
        results["r_squared"] = model.score(X, y)
        results["rmse"] = np.sqrt(mean_squared_error(y, predictions))
        
        # Perfect match would have slope=1, intercept=0
        results["slope_deviation"] = abs(1 - model.coef_[0])
        results["intercept_deviation"] = abs(model.intercept_)
    except:
        results["regression_analysis"] = "Error performing regression analysis"
    
    # 6. Create a simple overall similarity score (0-1)
    # Weight different factors: correlation, slope deviation, intercept, MAE
    try:
        score_components = []
        if "correlation" in results and isinstance(results["correlation"], (int, float)):
            score_components.append(abs(results["correlation"]))
        if "slope_deviation" in results:
            score_components.append(1 / (1 + results["slope_deviation"]))
        if "intercept_deviation" in results:
            score_components.append(1 / (1 + abs(results["intercept_deviation"])))
        if "mean_absolute_error" in results:
            score_components.append(1 / (1 + results["mean_absolute_error"]))
        
        if score_components:
            results["similarity_score"] = sum(score_components) / len(score_components)
        else:
            results["similarity_score"] = "Could not calculate"
    except:
        results["similarity_score"] = "Error calculating similarity score"
    
    # 7. Add expression information
    results["expr1"] = str(expr1)
    results["expr2"] = str(expr2)
    results["sample_range"] = (a, b)
    
    return results



def normalize_expr(expr):
    """
    Recursively traverse the expression and replace any instance of
    log(expr, E) with log(expr). If expr is a tuple (e.g., representing an
    equation with left and right sides), normalize each element.
    """
    # If expr is a tuple (e.g., (lhs, rhs)), apply normalization to each part.
    if isinstance(expr, tuple):
        return tuple(normalize_expr(e) for e in expr)
    
    # Otherwise, expr should be a Sympy object
    if expr.is_Atom:
        return expr
    new_args = tuple(normalize_expr(arg) for arg in expr.args)
    if expr.func == sp.log and len(new_args) == 2 and new_args[1] == sp.E:
        return sp.log(new_args[0])
    return expr.func(*new_args)

def llm_to_sympy(expr_str: str) -> sp.Expr:
    """
    Convert a mathematical expression string to a Sympy expression using an LLM-based conversion.
    
    This function calls the LLM API and then parses the returned string (which should be a valid 
    Sympy expression) into a Sympy object.
    """
    result = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": """You are a Sympy expression converter. Return only the final Sympy expression 
(e.g., (1/S(2))*log(...) + C). Do not include code fences or additional text. 
No imports, definitions, or explanations. Just one line: the Sympy expression."""},
            {"role": "user", "content": f"""
"You are an agent that converts mathematical expressions to valid Sympy expressions. 
Output only the Sympy expression itself on a single line, with no imports, no extra 
Python code, and no explanations. Use standard Sympy functions like exp, log, sin, 
cos, etc. The result must parse successfully with 'sympy.parse_expr'. 

Convert the following expression to Sympy format: {expr_str}"
"""}
        ],
    )
    output = result.choices[0].message.content.strip()
    
    # Define a local dictionary for standard functions and constants.
    local_dict = {
        'cos': sp.cos,
        'sin': sp.sin,
        'tan': sp.tan,
        'log': sp.log,
        'exp': sp.exp,
        'Derivative': sp.Derivative,
        'Integral': sp.Integral,
        'E': sp.E,
        'e': sp.E,
        'C': sp.Symbol('C'),
        'x': sp.Symbol('x'),
        'y': sp.Symbol('y'),
        'z': sp.Symbol('z'),
        'S': sp.S,
        'I': sp.I,
        'pi': sp.pi,
        'sqrt': sp.sqrt,
        'Abs': sp.Abs,

    }
    try:
        sympy_expr = parse_expr(output, local_dict=local_dict)
        return sympy_expr
    except Exception as e:
        raise ValueError(f"Error parsing LLM output:\nOutput: {output}\n{e}")

def preprocess_expression(expr_str: str):
    """
    Preprocess an expression string to produce a Sympy object using an LLM-based conversion.
    
    If the string contains an "=" sign, it is split into left-hand and right-hand parts.
    For each part, the llm_to_sympy function is called to obtain its Sympy expression.
    """
    expr_str = expr_str.strip()
    if '=' in expr_str:
        left_str, right_str = expr_str.split('=', 1)
        left_expr = preprocess_expression(left_str.strip())
        right_expr = preprocess_expression(right_str.strip())
        return (left_expr, right_expr)
    else:
        # Call the LLM-based function to convert this expression to a Sympy object.
        return llm_to_sympy(expr_str)

def compare_expressions(expr1, expr2) -> bool:
    """
    Compare two Sympy expressions (or equation tuples). First, normalize both expressions
    (e.g., converting log(expr, E) to log(expr)). Then compare both sides for equations,
    or simply check if the difference is zero for single expressions.
    """
    norm_expr1 = expr1
    norm_expr2 = expr2
    # print("Normalized Expression 1:", norm_expr1)
    # print("Normalized Expression 2:", norm_expr2)
    
    if isinstance(norm_expr1, tuple) and isinstance(norm_expr2, tuple):
        left_equal = sp.simplify(norm_expr1[0] - norm_expr2[0]) == 0
        right_equal = sp.simplify(norm_expr1[1] - norm_expr2[1]) == 0
        return left_equal and right_equal
    else:
        return sp.simplify(norm_expr1 - norm_expr2) == 0
def compare_pipeline(expr_str1: str, expr_str2: str) -> bool:
    """
    The complete pipeline that:
      1. Checks if an expression is an equation (using '='), splitting it if necessary.
      2. Calls the LLM-based conversion (llm_to_sympy) to get the Sympy expression.
      3. Normalizes and compares the resulting expressions.
      
    The function prints the parsed expressions and returns whether they are equivalent.
    """
    print("Comparing expressions")
    if '=' in expr_str1:
            left_str1, expr_str1 = expr_str1.split('=', 1)
    if '=' in expr_str2:
            left_str2, expr_str2 = expr_str2.split('=', 1)

    try:
        parsed_expr1 = preprocess_expression(expr_str1)
        parsed_expr2 = preprocess_expression(expr_str2)
    except ValueError as e:
        print("Parsing error:", e)
        return False
    
    equivalent = compare_expressions(parsed_expr1, parsed_expr2)
    
    print("Parsed Expression 1:", parsed_expr1)
    print("Parsed Expression 2:", parsed_expr2)
    print("The expressions are equivalent." if equivalent else "The expressions are NOT equivalent.")
    return equivalent, parsed_expr1, parsed_expr2


# Example with plain text vs. LaTeX. They should be equivalent.
expr1 = r'x = (2√5 - 1) / 2'
expr2 = r'\dfrac{1 + \sqrt{2} + \sqrt{2\sqrt{2} - 1}}{2}'

print("Comparing expressions")
if '=' in expr1:
        left_str1, expr1 = expr1.split('=', 1)
if '=' in expr2:
        left_str2, expr2 = expr1.split('=', 1)



result,parsed_expr1,parsed_expr2 = compare_pipeline(expr1, expr2)
print("Result:", "Equivalent" if result else "Not equivalent")
var = sp.Symbol('x')
stats = stats_test(parsed_expr1, parsed_expr2, var)
print("Statistical test results:",stats)


Comparing expressions
Comparing expressions
Parsed Expression 1: -1/2 + sqrt(5)
Parsed Expression 2: 1/2 + sqrt(-1 + 2*sqrt(2))/2 + sqrt(2)/2
The expressions are NOT equivalent.
Result: Not equivalent
Statistical test results: {'type': 'constant_comparison', 'value1': 1.7360679774997898, 'value2': 1.883203505913526, 'absolute_difference': 0.14713552841373612, 'mse': 0.021648863721589348, 'are_equal': False, 'similarity_score': 0.9788098783345891, 'expr1': '-1/2 + sqrt(5)', 'expr2': '1/2 + sqrt(-1 + 2*sqrt(2))/2 + sqrt(2)/2'}


In [63]:
max_range = 10
min_range = 1
# Perform the statistical testz


Statistical test results: {'type': 'constant_comparison', 'value1': 1.0, 'value2': 999999.0, 'absolute_difference': 999998.0, 'mse': 999996000004.0, 'are_equal': False, 'similarity_score': 1.000004000011e-12, 'expr1': '1', 'expr2': '999999'}


### using the csv

In [90]:
import pandas as pd
import numpy as np


df = pd.read_csv('together_deepseek - Sheet3.csv',dtype=str,keep_default_na=False)

var  =sp.Symbol('x')
columns_answer = ["deepseek-ai/DeepSeek-V3_answer",'deepseek-ai/DeepSeek-R1_answer',"Qwen/Qwen2.5-72B-Instruct-Turbo_answer","Qwen/QwQ-32B_answer"]
all_results = {}
column_scores = {}

for idx,row in df.iterrows():
    if idx<2:
        continue
    row_results = {}
    try:
        ground_truth = str(row['answer'])
        print("Ground Truth:", ground_truth)

        for col in columns_answer:
            column_expression = str(row[col])
            print("Column Expression:", column_expression)
            try:
                result,parsed_expr1,parsed_expr2 = compare_pipeline(column_expression, ground_truth)
                print(result,parsed_expr1,parsed_expr2)
                if result:
                    stats = {"similarity_score": 1.0, "is_equivalent": True}
                else:
                    stats = stats_test(parsed_expr1, parsed_expr2, var)

                row_results[col] = stats
            except Exception as e:
                print(f"Error processing row {idx}, column {col}: {e}")
                row_results[col] = {"error": str(e), "similarity_score": 0.0}
                # Update column scores with zero for errors
                if col not in column_scores:
                    column_scores[col] = []
                column_scores[col].append(0.0)
                    # Rank columns for this row based on similarity score
        row_rankings = sorted(
                [(col, res["similarity_score"]) for col, res in row_results.items()],
                key=lambda x: x[1],
                reverse=True  # Higher scores are better
        )
                    # Store results for this row
        all_results[idx] = {
                "id":row["Sub Task Id"],
                "ground_truth": ground_truth,
                "column_results": row_results,
                "rankings": row_rankings
            }
        
    except Exception as e:
        print(f"Error processing row {idx}: {e}")
        all_results[idx] = {"error": str(e)}


                
avg_scores = {col: np.mean(scores) for col, scores in column_scores.items()}
    # Rank columns based on average similarity score
overall_rankings = sorted(
        [(col, score) for col, score in avg_scores.items()],
        key=lambda x: x[1],
        reverse=True  # Higher scores are better
    )
import json
# save to json file 
with open("all_results.json","w") as f:
    json.dump(all_results , f)






Ground Truth: x = -1
Column Expression: -1
Comparing expressions
Parsed Expression 1: -1
Parsed Expression 2: -1
The expressions are equivalent.
True -1 -1
Column Expression: -1
Comparing expressions
Parsed Expression 1: -1
Parsed Expression 2: -1
The expressions are equivalent.
True -1 -1
Column Expression: \pm \sqrt{3}
Comparing expressions
Parsed Expression 1: 1/6
Parsed Expression 2: -1
The expressions are NOT equivalent.
False 1/6 -1
Column Expression: -1
Comparing expressions
Parsed Expression 1: -1
Parsed Expression 2: -1
The expressions are equivalent.
True -1 -1
Ground Truth: a + b + c = 0  
Column Expression: a + b + c = 0
Comparing expressions
Parsed Expression 1: 0
Parsed Expression 2: 0
The expressions are equivalent.
True 0 0
Column Expression: a + b + c = 0
Comparing expressions
Parsed Expression 1: 0
Parsed Expression 2: 0
The expressions are equivalent.
True 0 0
Column Expression: a + b + c = 0
Comparing expressions
Parsed Expression 1: 0
Parsed Expression 2: 0
The exp

  return C + (1/2)*log(x + sqrt(x**2 - 1))
  return C + (1/2)*log(x + sqrt(x**2 - 1))


Error processing row 3: 'similarity_score'
Ground Truth: x = (2√5 - 1) / 2
Column Expression: \dfrac{1 + \sqrt{2} + \sqrt{2\sqrt{2} - 1}}{2}
Comparing expressions
Parsed Expression 1: 1/2 + sqrt(-1 + 2*sqrt(2))/2 + sqrt(2)/2
Parsed Expression 2: -1/2 + sqrt(5)
The expressions are NOT equivalent.
False 1/2 + sqrt(-1 + 2*sqrt(2))/2 + sqrt(2)/2 -1/2 + sqrt(5)
Column Expression: \dfrac{1 + \sqrt{2} + \sqrt{2\sqrt{2} - 1}}{2}
Comparing expressions
Parsed Expression 1: 1/2 + sqrt(-1 + 2*sqrt(2))/2 + sqrt(2)/2
Parsed Expression 2: -1/2 + sqrt(5)
The expressions are NOT equivalent.
False 1/2 + sqrt(-1 + 2*sqrt(2))/2 + sqrt(2)/2 -1/2 + sqrt(5)
Column Expression: 2
Comparing expressions
Parsed Expression 1: 2
Parsed Expression 2: -1/2 + sqrt(5)
The expressions are NOT equivalent.
False 2 -1/2 + sqrt(5)
Column Expression: \dfrac{1 + \sqrt{2} + \sqrt{2\sqrt{2} - 1}}{2}
Comparing expressions
Parsed Expression 1: 1/2 + sqrt(-1 + 2*sqrt(2))/2 + sqrt(2)/2
Parsed Expression 2: -1/2 + sqrt(5)
The expres

In [85]:
for idx,row in df.iterrows():
    for col in columns_answer:

        print(row[col])

24
24
47.6
24
\dfrac{33}{4}
24
24
24
-1
-1
\pm \sqrt{3}
-1
a + b + c = 0
a + b + c = 0
a + b + c = 0
nan
\dfrac{1 + \sqrt{2} + \sqrt{2\sqrt{2} - 1}}{2}
\dfrac{1 + \sqrt{2} + \sqrt{2\sqrt{2} - 1}}{2}
2
\dfrac{1 + \sqrt{2} + \sqrt{2\sqrt{2} - 1}}{2}
18
18
16
18
35
35
35
35
120600
120600
120600
120600
-\dfrac{107}{9}
-\dfrac{107}{9}
\frac{14}{9}
-\dfrac{107}{9}
2
2
2
2
\dfrac{2}{3}
\dfrac{2}{3}
1
\dfrac{2}{3}
 \frac{dy}{dx} = \frac{y x^{y} - 1}{x \left( \frac{1}{y} + 2y - x^{y} \ln x \right)} 
nan
No boxed answer found
\dfrac{y(x^y y - 1)}{x(1 + 2y^2 - x^y y \ln x)}
\dfrac{3}{7}
\dfrac{3}{7}
\frac{3}{7}
\dfrac{3}{7}
\frac{1}{2} \ln \left( \frac{e^{2x} - e^x + 1}{e^{2x} + e^x + 1} \right) + C
\frac{1}{2} \ln\left| \frac{e^{2x} - e^x + 1}{e^{2x} + e^x + 1} \right| + C
No boxed answer found
\frac{1}{2} \ln\left( \frac{e^{2x} - e^x + 1}{e^{2x} + e^x + 1} \right) + C
l = \frac{(1 - \alpha) x_0 + x_1}{2 - \alpha}
\dfrac{(1 - \alpha)x_0 + x_1}{2 - \alpha}
\frac{x_1 - x_0 (\alpha - 1)}{2 - \alpha