In [None]:
import os, json
from collections import Counter
from pathlib import Path
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt


# ── configuration ──────────────────────────────────────────────────────────
# Prompt variants to analyse; each is also a directory level under RAG/.
PROMPTS = ["regular", "signature"]

# Model tags whose result folders are scanned.
MODELS = [
    "llama3.2:3b-instruct-fp16",
    "phi4:14b-fp16",
    "llama3.3:70b-instruct-fp16",
]

# Few-shot setting encoded in the result-folder name (1-, 5-, or 10-shot).
SHOT = 5

# Root of the experiment tree: the launch directory (adjust if needed).
BASE_DIR = Path.cwd()

# ── collectors ─────────────────────────────────────────────────────────────
# Filled while looping over prompts and models.
summary_rows = []      # one dict per model × prompt
matrix_tables = {}     # {(prompt, model): DataFrame}

for prompt in PROMPTS:
    # Category ranking depends on prompt type: a larger rank is a better
    # outcome, and the empty label "" stands for Pass (no error category).
    RANK = {"": 4, "Tests": 3, "Semantic": 2, "Syntax": 1}
    if prompt == "regular":        # "Tests" unused for regular prompt
        RANK.pop("Tests")
    CLASSES = list(RANK)           # stable ordering (dict insertion order)

    for model in MODELS:
        folder = (BASE_DIR / "RAG" / prompt / "3_fold" /
                  f"{prompt}_RAG_{SHOT}_shot_{model}")
        if not folder.exists():
            print("⚠️  Folder not found:", folder)
            continue

        # global counters
        initial_cnt = Counter()    # initial category  -> number of candidates
        final_cnt   = Counter()    # final category    -> number of candidates
        trans       = Counter()    # (initial, final)  -> number of candidates
        improved = worse = same = total = 0

        # refinement-bucket counters
        ref_total = Counter({i: 0 for i in range(4)})   # candidates in bucket r
        ref_pass  = Counter({i: 0 for i in range(4)})   # candidates that ended in Pass ("")

        # ── iterate over all folds / JSON files ───────────────────────────
        for file in sorted(folder.glob("*.json")):
            # Explicit encoding: JSON text is UTF-8, and the platform default
            # used by open() is locale-dependent.
            with open(file, encoding="utf-8") as fh:
                data = json.load(fh)
            # A file may hold a single experiment object or a list of them.
            experiments = data if isinstance(data, list) else [data]

            for exp in experiments:
                # Only the candidate lists are needed; the task ids are not.
                for cands in exp.get("task_candidates", {}).values():
                    for cand in cands:
                        init = cand["initial_error_category"]
                        fin  = cand["final_error_category"]
                        r    = cand.get("refinements_performed", 0)  # defaults to 0

                        if init not in RANK or fin not in RANK:
                            continue  # skip unknown labels

                        # overall tallies
                        total += 1
                        initial_cnt[init] += 1
                        final_cnt[fin]   += 1
                        trans[(init, fin)] += 1

                        # refinement buckets
                        # NOTE(review): any value of r is counted here, but
                        # only buckets 0–3 are read further down — confirm
                        # that r never exceeds 3 in the data.
                        ref_total[r] += 1
                        if fin == "":
                            ref_pass[r] += 1

                        # improvement categories (for completeness)
                        if RANK[fin] > RANK[init]:
                            improved += 1
                        elif RANK[fin] < RANK[init]:
                            worse += 1
                        else:
                            same += 1

        if total == 0:
            print(f"No data for {prompt} | {model}")
            continue
        # ── statistics derived from buckets ───────────────────────────────
        # Computed once (this section was previously pasted twice verbatim).
        pass_total = final_cnt[""]                        # total items that ended in Pass
        # % of all Pass results that used r refinements; 0.0 for every bucket
        # when nothing passed, which avoids a division by zero.
        pass_share = {r: (100 * ref_pass[r] / pass_total if pass_total else 0.0)
                      for r in range(4)}

        # ── bar chart of Pass‑share per refinement count ──────────────────
        # One bar per refinement bucket r = 0..3; height is the percentage of
        # all Pass results that fall into that bucket.
        shares = [pass_share[r] for r in range(4)]
        tick_names = [f"r={r}" for r in range(4)]

        fig, ax = plt.subplots(figsize=(4, 3))            # small, article‑friendly size
        bar_rects = ax.bar(range(4), shares, tick_label=tick_names)
        ax.set(
            ylim=(0, 100),
            ylabel="% of Pass cases",
            xlabel="Refinement turns (r)",
            title=f"{model} | {prompt} | Pass share by r",
        )

        # Write the percentage just above each bar, centred horizontally.
        for rect, share in zip(bar_rects, shares):
            top_centre = (rect.get_x() + rect.get_width() / 2, rect.get_height())
            ax.annotate(
                f"{share:.1f}%",
                xy=top_centre,
                xytext=(0, 3),
                textcoords="offset points",
                ha="center",
                va="bottom",
                fontsize=8,
            )

        plt.tight_layout()

        # The image is stored next to the JSON results; ':' in the model tag
        # is replaced with '_' in the file name.
        safe_model = model.replace(':', '_')
        img_path = folder / f"pass_share_{prompt}_{safe_model}.png"
        fig.savefig(img_path, dpi=150)
        plt.close(fig)                                    # free memory
        print("📊  Saved bar chart →", img_path)

        # ── build DataFrame row for summary table ─────────────────────────
        # Exactly one row per model × prompt. (A leftover placeholder append
        # used to add an empty dict before every real row.)
        # `total` is non-zero here: runs without data were skipped earlier in
        # this loop iteration. The "(%)" columns derived from `total` are
        # pre-formatted strings; the Ref{r}→Pass shares are left as floats.
        summary_rows.append({
            "Prompt": prompt,
            "Model":  model,
            "Total":  total,
            "Improved (n)": improved,
            "Improved (%)": f"{100*improved/total:.1f}",
            "Worse (n)":    worse,
            "Worse (%)":    f"{100*worse/total:.1f}",
            "Same (n)":     same,
            "Same (%)":     f"{100*same/total:.1f}",
            "Final-Pass (n)": pass_total,
            "Final-Pass (%)": f"{100*pass_total/total:.1f}",
            # per-bucket counts
            **{f"Ref{r} (n)": ref_total[r]          for r in range(4)},
            # share of all Pass cases that required r refinements
            **{f"Ref{r}→Pass (%)": pass_share[r]    for r in range(4)},
        })

        # ── transition matrix ------------------------------------------------
        # Rows are initial categories, columns are final categories, and each
        # cell is the number of candidates that made that transition. The
        # empty label is shown as "Pass".
        axis_labels = [label or "Pass" for label in CLASSES]
        counts = [
            [trans[(start, end)] for end in CLASSES]
            for start in CLASSES
        ]
        mat = pd.DataFrame(counts, index=axis_labels, columns=axis_labels)
        matrix_tables[(prompt, model)] = mat

        # ── console preview --------------------------------------------------
        print(f"\n{prompt} | {model}  (n={total})")

        def with_pct(df: pd.DataFrame, ndigits: int = 1) -> pd.DataFrame:
            """Format a table of counts as 'count (row-%)' strings.

            Each row's percentages are rounded to ``ndigits`` decimals, and the
            rounding residue is added to the row's last non-zero column so the
            displayed percentages of a row add up to 100.

            Args:
                df: Table of counts. Assumed to hold non-negative numbers
                    (TODO confirm); it is not modified.
                ndigits: Number of decimals shown for each percentage.

            Returns:
                A new object-dtype DataFrame with the same index and columns
                as ``df``. Rows whose sum is zero show 'count (—)' because a
                percentage would be meaningless there.
            """
            # Object dtype from the start: the cells are about to hold strings,
            # and writing strings into integer columns triggers pandas'
            # incompatible-dtype FutureWarning.
            out = df.astype(object)

            for idx, row in df.iterrows():
                row_total = row.sum()
                if row_total == 0:
                    # show dashes instead of meaningless percentages
                    out.loc[idx] = [f"{int(v)} (—)" for v in row]
                    continue

                rounded = (row / row_total * 100).round(ndigits)

                # Push the rounding residue onto the last non-zero column so
                # that zero cells keep showing 0 %.
                last_col = row[row > 0].index[-1]
                rounded.loc[last_col] += 100 - rounded.sum()

                # build "count (pct%)" strings
                out.loc[idx] = [
                    f"{int(c)} ({p:.{ndigits}f}%)" for c, p in zip(row, rounded)
                ]

            return out



        # Show the transition matrix with 'count (row-%)' cells.
        display(with_pct(mat))

        # Per-bucket breakdown of where the Pass results came from.
        print("Share of *Pass* results by refinement count:")
        for bucket in range(4):
            share, hits = pass_share[bucket], ref_pass[bucket]
            print(f"  r={bucket}: {share:4.1f}%  ({hits} of {pass_total})")

        # One-line recap of the overall shifts as fractions of `total`.
        frac_improved = improved / total
        frac_worse = worse / total
        frac_same = same / total
        frac_pass = pass_total / total
        print(f"Improved {frac_improved:.1%}, Worse {frac_worse:.1%}, "
              f"Same {frac_same:.1%}, Final-Pass {frac_pass:.1%}")

# # ── LaTeX export (uncomment if needed) --------------------------------------
# summary_df = pd.DataFrame(summary_rows)
# summary_df.to_latex(
#     "summary_table.tex",
#     index=False,
#     caption="Error-category shifts and refinement statistics.",
#     label="tab:error_overview",
#     float_format="%.1f"
# )
# for (prompt, model), mat in matrix_tables.items():
#     mat.to_latex(
#         f"matrix_{prompt}_{model.replace(':','_')}.tex",
#         caption=f"Transition matrix for {model} ({prompt}).",
#         label=f"tab:matrix_{prompt}_{model.replace(':','_')}",
#         column_format="l" + "r" * len(mat.columns)
#     )
# print("\n✅  LaTeX tables written.")


📊  Saved bar chart → /root/Thesis_project/experiments/Refinement/fox/testing_runs/RAG/regular/3_fold/regular_RAG_5_shot_llama3.2:3b-instruct-fp16/pass_share_regular_llama3.2_3b-instruct-fp16.png

regular | llama3.2:3b-instruct-fp16  (n=1230)


  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [


Unnamed: 0,Pass,Semantic,Syntax
Pass,737 (100.0%),0 (0.0%),0 (0.0%)
Semantic,54 (28.3%),53 (27.7%),84 (44.0%)
Syntax,17 (5.6%),6 (2.0%),279 (92.4%)


Share of *Pass* results by refinement count:
  r=0: 91.2%  (737 of 808)
  r=1:  7.3%  (59 of 808)
  r=2:  1.5%  (12 of 808)
  r=3:  0.0%  (0 of 808)
Improved 6.3%, Worse 6.8%, Same 86.9%, Final-Pass 65.7%
📊  Saved bar chart → /root/Thesis_project/experiments/Refinement/fox/testing_runs/RAG/regular/3_fold/regular_RAG_5_shot_phi4:14b-fp16/pass_share_regular_phi4_14b-fp16.png

regular | phi4:14b-fp16  (n=1230)


  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [


Unnamed: 0,Pass,Semantic,Syntax
Pass,936 (100.0%),0 (0.0%),0 (0.0%)
Semantic,44 (24.3%),111 (61.3%),26 (14.4%)
Syntax,14 (12.4%),5 (4.4%),94 (83.2%)


Share of *Pass* results by refinement count:
  r=0: 94.2%  (936 of 994)
  r=1:  5.0%  (50 of 994)
  r=2:  0.8%  (8 of 994)
  r=3:  0.0%  (0 of 994)
Improved 5.1%, Worse 2.1%, Same 92.8%, Final-Pass 80.8%
📊  Saved bar chart → /root/Thesis_project/experiments/Refinement/fox/testing_runs/RAG/regular/3_fold/regular_RAG_5_shot_llama3.3:70b-instruct-fp16/pass_share_regular_llama3.3_70b-instruct-fp16.png

regular | llama3.3:70b-instruct-fp16  (n=1230)


  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [


Unnamed: 0,Pass,Semantic,Syntax
Pass,943 (100.0%),0 (0.0%),0 (0.0%)
Semantic,22 (13.6%),114 (70.4%),26 (16.0%)
Syntax,119 (95.2%),2 (1.6%),4 (3.2%)


Share of *Pass* results by refinement count:
  r=0: 87.0%  (943 of 1084)
  r=1:  9.6%  (104 of 1084)
  r=2:  3.4%  (37 of 1084)
  r=3:  0.0%  (0 of 1084)
Improved 11.6%, Worse 2.1%, Same 86.3%, Final-Pass 88.1%
📊  Saved bar chart → /root/Thesis_project/experiments/Refinement/fox/testing_runs/RAG/signature/3_fold/signature_RAG_5_shot_llama3.2:3b-instruct-fp16/pass_share_signature_llama3.2_3b-instruct-fp16.png

signature | llama3.2:3b-instruct-fp16  (n=1230)


  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [


Unnamed: 0,Pass,Tests,Semantic,Syntax
Pass,312 (100.0%),0 (0.0%),0 (0.0%),0 (0.0%)
Tests,14 (4.5%),69 (22.0%),24 (7.6%),207 (65.9%)
Semantic,21 (10.7%),10 (5.1%),74 (37.8%),91 (46.4%)
Syntax,14 (3.4%),3 (0.7%),14 (3.4%),377 (92.5%)


Share of *Pass* results by refinement count:
  r=0: 86.4%  (312 of 361)
  r=1:  9.1%  (33 of 361)
  r=2:  4.4%  (16 of 361)
  r=3:  0.0%  (0 of 361)
Improved 6.2%, Worse 26.2%, Same 67.6%, Final-Pass 29.3%
📊  Saved bar chart → /root/Thesis_project/experiments/Refinement/fox/testing_runs/RAG/signature/3_fold/signature_RAG_5_shot_phi4:14b-fp16/pass_share_signature_phi4_14b-fp16.png

signature | phi4:14b-fp16  (n=1230)


  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [


Unnamed: 0,Pass,Tests,Semantic,Syntax
Pass,619 (100.0%),0 (0.0%),0 (0.0%),0 (0.0%)
Tests,55 (35.0%),70 (44.6%),10 (6.4%),22 (14.0%)
Semantic,21 (10.9%),19 (9.8%),107 (55.4%),46 (23.9%)
Syntax,4 (1.5%),4 (1.5%),17 (6.5%),236 (90.5%)


Share of *Pass* results by refinement count:
  r=0: 88.6%  (619 of 699)
  r=1:  9.4%  (66 of 699)
  r=2:  2.0%  (14 of 699)
  r=3:  0.0%  (0 of 699)
Improved 9.8%, Worse 6.3%, Same 83.9%, Final-Pass 56.8%
📊  Saved bar chart → /root/Thesis_project/experiments/Refinement/fox/testing_runs/RAG/signature/3_fold/signature_RAG_5_shot_llama3.3:70b-instruct-fp16/pass_share_signature_llama3.3_70b-instruct-fp16.png

signature | llama3.3:70b-instruct-fp16  (n=1230)


  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [
  out.loc[idx] = [


Unnamed: 0,Pass,Tests,Semantic,Syntax
Pass,650 (100.0%),0 (0.0%),0 (0.0%),0 (0.0%)
Tests,77 (52.0%),57 (38.5%),6 (4.1%),8 (5.4%)
Semantic,24 (17.4%),29 (21.0%),83 (60.1%),2 (1.5%)
Syntax,46 (15.6%),62 (21.1%),36 (12.2%),150 (51.1%)


Share of *Pass* results by refinement count:
  r=0: 81.6%  (650 of 797)
  r=1: 14.8%  (118 of 797)
  r=2:  3.6%  (29 of 797)
  r=3:  0.0%  (0 of 797)
Improved 22.3%, Worse 1.3%, Same 76.4%, Final-Pass 64.8%
