In [15]:
from pathlib import Path
import json
import pandas as pd

try:
    from IPython.display import display  # type: ignore
except Exception:
    # Fallback for non-notebook execution
    display = print

# Reload the local notebook utilities (testLibs) so edits to that module are
# picked up without restarting the kernel.
import importlib
import testLibs as tl
importlib.reload(tl)

# Notebook helpers (scoring + flatteners).
# The short names are bound AFTER the reload, from the module object `tl`, so
# re-running this cell rebinds every alias to the reloaded definitions.
ResultsFlattener = tl.ResultsFlattener
mznResultsFlattener = tl.mznResultsFlattener
get_significative_solvers = tl.get_significative_solvers
scoreComputation_subset = tl.scoreComputation_subset
compute_llm_scores = tl.compute_llm_scores
compute_top1_llm_scores = tl.compute_top1_llm_scores
compute_closed_gap = tl.compute_closed_gap
build_llm_performance_table = tl.build_llm_performance_table
filter_to_solvers = tl.filter_to_solvers
singleSolverScore = tl.singleSolverScore

# Optional plotting libs (only needed for plots)
try:
    import matplotlib.pyplot as plt
except ModuleNotFoundError:
    plt = None
try:
    import seaborn as sns
except ModuleNotFoundError:
    sns = None

In [16]:
# --- Load the LLM suggestion outputs and the MiniZinc reference results ---
# Both inputs are resolved to absolute paths up front so that any
# "not found" error names the exact location that was tried.
data_dir = Path("../data/testOutputSignificative").resolve()
if not data_dir.is_dir():
    raise FileNotFoundError(f"Expected folder not found: {data_dir}")

# Sorted so the processing/display order is stable across runs.
json_paths = sorted(data_dir.glob("*.json"))
print(f"Loading {len(json_paths)} JSON files from {data_dir}")

# Parsed JSON content keyed by file name (insertion order follows json_paths).
data_by_file = {}
for p in json_paths:
    with p.open("r", encoding="utf-8") as f:
        data_by_file[p.name] = json.load(f)

# List in a stable order, matching json_paths
data_list = [data_by_file[p.name] for p in json_paths]

print("Loaded files:")
for name in data_by_file.keys():
    print(" -", name)

# MiniZinc results table. Read explicitly as UTF-8 (same as the files above)
# instead of relying on the platform's default text encoding.
mzn_results_path = Path("../data/tablesJSON/allTables_free.json").resolve()
if not mzn_results_path.is_file():
    raise FileNotFoundError(f"Expected file not found: {mzn_results_path}")
with mzn_results_path.open("r", encoding="utf-8") as f1:
    MznResults = json.load(f1)

Loading 4 JSON files from /home/vro5/Coding/AgenticSolvers/test/data/testOutputSignificative
Loaded files:
 - LLMsuggestions_significative_uncommented_fzn.json
 - LLMsuggestions_significative_uncommented_fzn_solverdesc.json
 - LLMsuggestions_significative_uncommented_fzncat.json
 - LLMsuggestions_significative_uncommented_fzncat_solverdesc.json


In [17]:
# --- Significative-only scoring pipeline ---
# Step 1: the solver subset that every score in this cell is restricted to.
sig_solvers = get_significative_solvers()
print(f"Using significative solvers (count={len(sig_solvers)}):")
for s in sig_solvers:
    print(f" - {s}")

# Step 2: MiniZinc results -> scores restricted to the significative solvers.
mzn_raw_df = mznResultsFlattener(MznResults)
scored_sig_df = scoreComputation_subset(mzn_raw_df, allowed_solvers=sig_solvers)
print(
    f"\nMZN rows (raw): {len(mzn_raw_df)}\n"
    f"MZN rows (significative-only): {len(scored_sig_df)}"
)

# Optional: per-solver ranking within the significative set (first 10 rows).
sig_single_solver_rank = singleSolverScore(scored_sig_df)
display(sig_single_solver_rank.head(10))

# Step 3: for every loaded LLM result file compute the Top-3 summary, the
# Top-1 summary and the closed-gap rows, then merge them into one
# performance table. All intermediate tables are kept for display below.
tables_by_file = {}
for fname, llm_results in data_by_file.items():
    llm_df = ResultsFlattener(llm_results)
    top3_summary = compute_llm_scores(llm_df, scored_sig_df, allowed_solvers=sig_solvers)
    top1_summary, top1_scored = compute_top1_llm_scores(
        llm_df, scored_sig_df, allowed_solvers=sig_solvers
    )
    # NOTE(review): sbs_solver=None presumably lets the helper choose the SBS
    # itself -- confirm in testLibs.
    cg_rows = compute_closed_gap(
        top1_scored, scored_sig_df, allowed_solvers=sig_solvers, sbs_solver=None
    )
    cg_df = pd.DataFrame(cg_rows)
    perf_table = build_llm_performance_table(
        top3_summary=top3_summary,
        top1_summary=top1_summary,
        closed_gap=cg_df,
        sort_by='SingleScore',
        ascending=False,
    )
    tables_by_file[fname] = dict(
        top3_summary=top3_summary,
        top1_summary=top1_summary,
        closed_gap=cg_df,
        performance_table=perf_table,
    )

print(f"\nComputed significative-only scoring for {len(tables_by_file)} LLM result files.")

# Step 4: show the intermediate tables + performance table of each file,
# walking json_paths so the order matches the load order.
_separator = "=" * 80
_sections = (
    ("\nTop-3 summary:", "top3_summary"),
    ("\nTop-1 summary:", "top1_summary"),
    ("\nClosed-gap table:", "closed_gap"),
    ("\nPerformance table:", "performance_table"),
)
for p in json_paths:
    fname = p.name
    if fname in tables_by_file:
        print(f"\n{_separator}")
        print(f"File: {fname}")
        print(_separator)

        for _heading, _key in _sections:
            print(_heading)
            display(tables_by_file[fname][_key])

Using significative solvers (count=12):
 - cbc-free
 - choco-solver__cp_-free
 - choco-solver__cp-sat_-free
 - cp_optimizer-free
 - cplex-free
 - gurobi-free
 - highs-free
 - izplus-free
 - jacop-free
 - pumpkin-free
 - scip-free
 - sicstus_prolog-free

MZN rows (raw): 2000
MZN rows (significative-only): 1200


Unnamed: 0,Solver,TotalScore,NumOptimal
5,gurobi-free,55.449377,38
9,pumpkin-free,57.232777,33
1,choco-solver__cp-sat_-free,60.303673,32
4,cplex-free,48.631992,30
2,choco-solver__cp_-free,58.330593,29
3,cp_optimizer-free,51.07582,26
8,jacop-free,43.84722,25
7,izplus-free,58.000135,25
11,sicstus_prolog-free,43.918052,24
10,scip-free,36.49014,20



Computed significative-only scoring for 4 LLM result files.

File: LLMsuggestions_significative_uncommented_fzn.json

Top-3 summary:


Unnamed: 0,provider,model,LLM_TotalScore,InstancesCovered,LLM_AvgScore
0,groq,openai/gpt-oss-120b,72.347686,100,0.723477



Top-1 summary:


Unnamed: 0,provider,model,LLM_Top1_TotalScore,LLM_Top1_AvgScore,InstancesCovered
0,groq,openai/gpt-oss-120b,52.618511,0.548109,100



Closed-gap table:


Unnamed: 0,provider,model,InstancesCovered,AS,SBS,VBS,ClosedGap
0,groq,openai/gpt-oss-120b,100,52.618511,60.303673,83.75,-0.327777



Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,52.618511,72.347686,-0.327777



File: LLMsuggestions_significative_uncommented_fzn_solverdesc.json

Top-3 summary:


Unnamed: 0,provider,model,LLM_TotalScore,InstancesCovered,LLM_AvgScore
0,groq,openai/gpt-oss-120b,69.472444,100,0.694724



Top-1 summary:


Unnamed: 0,provider,model,LLM_Top1_TotalScore,LLM_Top1_AvgScore,InstancesCovered
0,groq,openai/gpt-oss-120b,50.170237,0.533726,98



Closed-gap table:


Unnamed: 0,provider,model,InstancesCovered,AS,SBS,VBS,ClosedGap
0,groq,openai/gpt-oss-120b,98,50.170237,60.303673,83.75,-0.432197



Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,50.170237,69.472444,-0.432197



File: LLMsuggestions_significative_uncommented_fzncat.json

Top-3 summary:


Unnamed: 0,provider,model,LLM_TotalScore,InstancesCovered,LLM_AvgScore
0,groq,openai/gpt-oss-120b,72.563254,100,0.725633



Top-1 summary:


Unnamed: 0,provider,model,LLM_Top1_TotalScore,LLM_Top1_AvgScore,InstancesCovered
0,groq,openai/gpt-oss-120b,51.421134,0.547033,99



Closed-gap table:


Unnamed: 0,provider,model,InstancesCovered,AS,SBS,VBS,ClosedGap
0,groq,openai/gpt-oss-120b,99,51.421134,60.303673,83.75,-0.378846



Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,51.421134,72.563254,-0.378846



File: LLMsuggestions_significative_uncommented_fzncat_solverdesc.json

Top-3 summary:


Unnamed: 0,provider,model,LLM_TotalScore,InstancesCovered,LLM_AvgScore
0,groq,openai/gpt-oss-120b,66.854373,100,0.668544



Top-1 summary:


Unnamed: 0,provider,model,LLM_Top1_TotalScore,LLM_Top1_AvgScore,InstancesCovered
0,groq,openai/gpt-oss-120b,51.07582,0.53764,100



Closed-gap table:


Unnamed: 0,provider,model,InstancesCovered,AS,SBS,VBS,ClosedGap
0,groq,openai/gpt-oss-120b,100,51.07582,60.303673,83.75,-0.393573



Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,51.07582,66.854373,-0.393573


In [18]:
# Recompute and show the SBS / VBS reference values for the current scoring
# subset, so they can be compared with the SBS/VBS columns of the closed-gap tables.
if "scored_sig_df" not in globals():
    raise RuntimeError("Run the significative-only scoring pipeline cell first to create `scored_sig_df`.")

# SBS: the one solver whose summed ComputedScore is the largest.
sbs_totals = (
    scored_sig_df.groupby("Solver", as_index=False)["ComputedScore"]
    .sum()
    .rename(columns={"ComputedScore": "TotalComputedScore"})
)
sbs_row = sbs_totals.sort_values("TotalComputedScore", ascending=False).head(1)
if sbs_row.empty:
    # No scored rows at all: fall back to neutral placeholders.
    SBS_SOLVER = ""
    SBS_TOTAL = 0.0
else:
    SBS_SOLVER = str(sbs_row.iloc[0]["Solver"])
    SBS_TOTAL = float(sbs_row.iloc[0]["TotalComputedScore"])

# VBS: best ComputedScore per (Problem, Instance), summed over all instances.
vbs_df = scored_sig_df.groupby(["Problem", "Instance"], as_index=False)["ComputedScore"].max()
vbs_df = vbs_df.rename(columns={"ComputedScore": "VBS_InstScore"})
if vbs_df.empty:
    VBS_TOTAL = 0.0
else:
    VBS_TOTAL = float(vbs_df["VBS_InstScore"].sum())

_sbs_vbs_overview = {
    "Quantity": ["SBS Solver", "SBS Total", "VBS Total"],
    "Value": [SBS_SOLVER, SBS_TOTAL, VBS_TOTAL],
}
display(pd.DataFrame(_sbs_vbs_overview))

Unnamed: 0,Quantity,Value
0,SBS Solver,choco-solver__cp-sat_-free
1,SBS Total,60.303673
2,VBS Total,83.75


# Temperature sampling analysis (significative solvers)

This section loads LLM suggestion outputs generated with different decoding temperatures from `../data/testOutputSigTemp/` and evaluates them with the same scoring pipeline (Top-3, Top-1, ClosedGap) restricted to the significative solver set.

In [19]:
# --- Temperature sampling analysis ---
# Locate the temperature-sampling result files; `re` is needed by the
# filename parsers defined further down in this cell.
import re

temp_data_dir = Path("../data/testOutputSigTemp").resolve()
if temp_data_dir.is_dir():
    # Sort in place so files are processed in a stable, name-based order.
    temp_json_paths = list(temp_data_dir.glob("*.json"))
    temp_json_paths.sort()
else:
    raise FileNotFoundError(f"Expected folder not found: {temp_data_dir}")

print(f"Loading {len(temp_json_paths)} temperature-sampling JSON files from {temp_data_dir}")

def _parse_temperature_from_filename(name: str) -> float | None:
    # Expected suffix patterns: _T0.json, _T0p2.json, ...
    m = re.search(r"_T(?P<t>\d+(?:p\d+)?)\.json$", name)
    if not m:
        return None
    t_str = m.group("t").replace("p", ".")
    try:
        return float(t_str)
    except ValueError:
        return None

def _parse_variant_from_filename(name: str) -> str:
    # Extract between 'LLMsuggestions_significative_' and '_T...'
    m = re.search(r"^LLMsuggestions_significative_(?P<v>.+)_T\d+(?:p\d+)?\.json$", name)
    return m.group("v") if m else "unknown"

# Preconditions: the baseline scored table must already exist (it is created
# by the significative-only scoring pipeline cell); the solver list can be
# rebuilt here if it is missing.
if "scored_sig_df" not in globals():
    raise RuntimeError("Run the significative-only scoring pipeline cell first to create `scored_sig_df`.")
if "sig_solvers" not in globals():
    sig_solvers = get_significative_solvers()

# Score every temperature-sampling file with the same Top-3 / Top-1 /
# closed-gap steps used for the baseline files.
temp_tables_by_file = {}
for p in temp_json_paths:
    fname = p.name
    # Files are structured like {'chat': {...}}; the whole object is evaluated as-is.
    llm_results_wrapped = json.loads(p.read_text(encoding="utf-8"))

    llm_df = ResultsFlattener(llm_results_wrapped)
    top3_summary = compute_llm_scores(llm_df, scored_sig_df, allowed_solvers=sig_solvers)
    top1_summary, top1_scored = compute_top1_llm_scores(
        llm_df, scored_sig_df, allowed_solvers=sig_solvers
    )
    cg_rows = compute_closed_gap(
        top1_scored, scored_sig_df, allowed_solvers=sig_solvers, sbs_solver=None
    )
    cg_df = pd.DataFrame(cg_rows)
    perf_table = build_llm_performance_table(
        top3_summary=top3_summary,
        top1_summary=top1_summary,
        closed_gap=cg_df,
        sort_by='SingleScore',
        ascending=False,
    )
    temp_tables_by_file[fname] = dict(
        temperature=_parse_temperature_from_filename(fname),
        variant=_parse_variant_from_filename(fname),
        top3_summary=top3_summary,
        top1_summary=top1_summary,
        closed_gap=cg_df,
        performance_table=perf_table,
    )

print(f"Computed temperature-sampling scoring for {len(temp_tables_by_file)} files.")

# Per-file performance tables, in the same stable order as temp_json_paths.
_separator = "=" * 80
for p in temp_json_paths:
    fname = p.name
    info = temp_tables_by_file[fname]
    print(f"\n{_separator}")
    print(f"File: {fname}")
    print(f"Variant: {info['variant']} | Temperature: {info['temperature']}")
    print(_separator)
    print("\nPerformance table:")
    display(info["performance_table"])

# One long table across all temperatures/variants: each file's performance
# table is copied and tagged with its file name, variant and temperature.
rows = [
    info["performance_table"].assign(
        File=fname,
        Variant=info["variant"],
        Temperature=info["temperature"],
    )
    for fname, info in temp_tables_by_file.items()
]

if rows:
    temp_perf_long = pd.concat(rows, ignore_index=True)
else:
    # Nothing was loaded: keep an empty frame so later cells still run.
    temp_perf_long = pd.DataFrame()
print(f"\nAggregated rows: {len(temp_perf_long)}")
display(temp_perf_long.head(20))

Loading 10 temperature-sampling JSON files from /home/vro5/Coding/AgenticSolvers/test/data/testOutputSigTemp
Computed temperature-sampling scoring for 10 files.

File: LLMsuggestions_significative_chat_fzn_Sdesc_T0.json
Variant: chat_fzn_Sdesc | Temperature: 0.0

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,49.097559,72.374489,-0.477948



File: LLMsuggestions_significative_chat_fzn_Sdesc_T0p2.json
Variant: chat_fzn_Sdesc | Temperature: 0.2

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,49.07582,73.624489,-0.478875



File: LLMsuggestions_significative_chat_fzn_Sdesc_T0p3.json
Variant: chat_fzn_Sdesc | Temperature: 0.3

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,50.57582,73.892572,-0.414899



File: LLMsuggestions_significative_chat_fzn_Sdesc_T0p7.json
Variant: chat_fzn_Sdesc | Temperature: 0.7

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,51.30548,73.374489,-0.383778



File: LLMsuggestions_significative_chat_fzn_Sdesc_T0p8.json
Variant: chat_fzn_Sdesc | Temperature: 0.8

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,49.82582,73.544802,-0.446887



File: LLMsuggestions_significative_chat_fzn_T0.json
Variant: chat_fzn | Temperature: 0.0

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,50.761038,72.701924,-0.406999



File: LLMsuggestions_significative_chat_fzn_T0p2.json
Variant: chat_fzn | Temperature: 0.2

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,52.002097,75.142652,-0.354067



File: LLMsuggestions_significative_chat_fzn_T0p3.json
Variant: chat_fzn | Temperature: 0.3

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,49.423086,71.37974,-0.464064



File: LLMsuggestions_significative_chat_fzn_T0p7.json
Variant: chat_fzn | Temperature: 0.7

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,52.843076,73.600511,-0.318199



File: LLMsuggestions_significative_chat_fzn_T0p8.json
Variant: chat_fzn | Temperature: 0.8

Performance table:


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap
0,openai/gpt-oss-120b,48.581673,74.979358,-0.49995



Aggregated rows: 10


Unnamed: 0,Model,Single Score,Parallel Score,Closed Gap,File,Variant,Temperature
0,openai/gpt-oss-120b,49.097559,72.374489,-0.477948,LLMsuggestions_significative_chat_fzn_Sdesc_T0...,chat_fzn_Sdesc,0.0
1,openai/gpt-oss-120b,49.07582,73.624489,-0.478875,LLMsuggestions_significative_chat_fzn_Sdesc_T0...,chat_fzn_Sdesc,0.2
2,openai/gpt-oss-120b,50.57582,73.892572,-0.414899,LLMsuggestions_significative_chat_fzn_Sdesc_T0...,chat_fzn_Sdesc,0.3
3,openai/gpt-oss-120b,51.30548,73.374489,-0.383778,LLMsuggestions_significative_chat_fzn_Sdesc_T0...,chat_fzn_Sdesc,0.7
4,openai/gpt-oss-120b,49.82582,73.544802,-0.446887,LLMsuggestions_significative_chat_fzn_Sdesc_T0...,chat_fzn_Sdesc,0.8
5,openai/gpt-oss-120b,50.761038,72.701924,-0.406999,LLMsuggestions_significative_chat_fzn_T0.json,chat_fzn,0.0
6,openai/gpt-oss-120b,52.002097,75.142652,-0.354067,LLMsuggestions_significative_chat_fzn_T0p2.json,chat_fzn,0.2
7,openai/gpt-oss-120b,49.423086,71.37974,-0.464064,LLMsuggestions_significative_chat_fzn_T0p3.json,chat_fzn,0.3
8,openai/gpt-oss-120b,52.843076,73.600511,-0.318199,LLMsuggestions_significative_chat_fzn_T0p7.json,chat_fzn,0.7
9,openai/gpt-oss-120b,48.581673,74.979358,-0.49995,LLMsuggestions_significative_chat_fzn_T0p8.json,chat_fzn,0.8


In [20]:
# Presentation view of the aggregated temperature table: drop noisy columns,
# use friendly variant names, put Variant first, format numbers, hide the index.
cols_to_drop = ["Model", "File"]
_droppable = [c for c in cols_to_drop if c in temp_perf_long.columns]
temp_perf_view = temp_perf_long.drop(columns=_droppable).copy()

# Variant codes (as parsed from the file names) -> labels shown in the table.
variant_map = {
    "chat_fzn": "fzn2nl",
    "chat_fzn_Sdesc": "fzn2nl + Solver Description",
}
if "Variant" in temp_perf_view.columns:
    temp_perf_view["Variant"] = temp_perf_view["Variant"].replace(variant_map)
    # Variant leads; the remaining columns keep their existing relative order.
    ordered = ["Variant"] + [c for c in temp_perf_view.columns if c != "Variant"]
    temp_perf_view = temp_perf_view[ordered]

# Numeric columns only: Temperature gets 1 decimal, everything else 3.
formatters = {
    col: ("{:.1f}" if col == "Temperature" else "{:.3f}")
    for col in temp_perf_view.columns
    if pd.api.types.is_numeric_dtype(temp_perf_view[col])
}

# Fresh 0..n-1 index, then hide it in the rendered output.
temp_perf_view = temp_perf_view.reset_index(drop=True)
try:
    styler = temp_perf_view.style
    if formatters:
        styler = styler.format(formatters)
    display(styler.hide(axis="index"))
except Exception:
    # Fallback if Styler.hide isn't available in the current pandas version
    display(temp_perf_view)

Variant,Single Score,Parallel Score,Closed Gap,Temperature
fzn2nl + Solver Description,49.098,72.374,-0.478,0.0
fzn2nl + Solver Description,49.076,73.624,-0.479,0.2
fzn2nl + Solver Description,50.576,73.893,-0.415,0.3
fzn2nl + Solver Description,51.305,73.374,-0.384,0.7
fzn2nl + Solver Description,49.826,73.545,-0.447,0.8
fzn2nl,50.761,72.702,-0.407,0.0
fzn2nl,52.002,75.143,-0.354,0.2
fzn2nl,49.423,71.38,-0.464,0.3
fzn2nl,52.843,73.601,-0.318,0.7
fzn2nl,48.582,74.979,-0.5,0.8
