In [40]:
import pandas as pd
from IPython.display import display

# Configuration

In [41]:
# =========================
# Config
# =========================
# Input CSVs to compare. Both are relative paths, so they are resolved
# against the kernel's current working directory when the files are read.
LLM_CSV_PATH = "sample/sample_llm_results.csv"     # LLM results
GT_CSV_PATH  = "sample/sample_ground_truth.csv"    # ground truth

In [42]:
# =========================
# Config continued (edit these only if needed).
# =========================

# Name given to the paper-identifier column after loading. The identifier is
# taken from the FIRST column of each CSV by position, so the header used in
# the files themselves does not have to match this value.
PAPER_ID_COL = "paper_id"

# Predefined criteria and paper IDs.
CRITERIA = [
    "criteria a.1", "criteria a.2", "criteria a.3", "criteria a.4", "criteria a.5",
    "criteria b.1", "criteria b.2", "criteria b.3", "criteria b.4", "criteria b.5",
    "criteria c",
    "criteria d.1", "criteria d.2", "criteria d.3",
    "criteria e.1", "criteria e.2", "criteria e.3", "criteria e.4",
]

PAPER_IDS = [
    "Allcott, H (2011) (1)",
    "Ayres, Raseman, Shih, 2012",
    "Bager, S; Mundaca, L (2017)",
    "Carroll, J; Lyons, S; Denny, E (2014)",
    "Houde, S; Todd, A; Sudarshan, A; Flora, JA; Armel, KC (2013)",
    "Marangoni and Tavoni, 2021",
    "Matsukawa, I. (2018)",
]

# NOTE: the notebook uses the predefined PAPER_IDS and CRITERIA above instead
# of taking these values from either dataset. The names written in the CSVs
# are therefore not important, but the ORDER of papers (rows) and criteria
# (columns) is: it has to be the same in both datasets and match the lists above.

# Labels are used as index/column keys for scalar lookups, so a duplicated
# entry would silently turn a single cell into a multi-row selection.
assert len(set(CRITERIA)) == len(CRITERIA), "CRITERIA contains duplicate entries"
assert len(set(PAPER_IDS)) == len(PAPER_IDS), "PAPER_IDS contains duplicate entries"

# Presentation
DISPLAY_SEPARATOR = " / "  # string between LLM and GT values in a cell.

# Result Comparison

In [43]:
# =========================
# Load data & helpers
# =========================
# Read both CSVs with pandas' default parsing. These two frames are the raw
# inputs; the helpers below derive the aligned/canonicalized views from them.
df_llm = pd.read_csv(LLM_CSV_PATH)
df_gt = pd.read_csv(GT_CSV_PATH)

def _canon_label(x: object) -> str:
    """Lowercase; map 'probably yes/no' to 'yes/no'; empty for NaN."""
    if pd.isna(x):
        return ""
    s = str(x).strip().lower()
    if s == "probably yes":
        s = "yes"
    elif s == "probably no":
        s = "no"
    return s

def _clean_col(s: str) -> str:
    """Simple normalization for column names (used in string mode)."""
    return " ".join(str(s).strip().lower().split())

def load_by_position(dataframe: pd.DataFrame, path: str, paper_ids: list[str], criteria: list[str], paper_id_col: str) -> pd.DataFrame:
    """
    Align an already-loaded frame to the predefined papers/criteria by position.

    Assumes:
    - First column (by position) is the paper id, and rows appear in the same order as `paper_ids`
    - Next len(criteria) columns (by position) correspond to `criteria` in that exact order
    Any extra columns after criteria are ignored. Column *names* and the id
    *values* found in the frame don't matter: both are overwritten.

    Args:
        dataframe: Frame as read from the CSV. It is not modified.
        path: Source path of the frame; only used to label error messages.
        paper_ids: Canonical paper identifiers, one per row, in row order.
        criteria: Canonical criterion names, one per data column, in column order.
        paper_id_col: Name given to the identifier column / resulting index.

    Returns:
        A new DataFrame indexed by `paper_id_col` (values = `paper_ids`) with one
        column per criterion, every cell passed through `_canon_label`.

    Raises:
        ValueError: If the frame has fewer than 1 + len(criteria) columns, or
            its row count differs from len(paper_ids).
    """
    # Basic shape checks, done up front so nothing is built from a bad frame.
    need_cols = 1 + len(criteria)
    if dataframe.shape[1] < need_cols:
        raise ValueError(f"{path}: expected at least {need_cols} columns (id + {len(criteria)} criteria), got {dataframe.shape[1]}.")
    if dataframe.shape[0] != len(paper_ids):
        raise ValueError(f"{path}: expected exactly {len(paper_ids)} rows (one per paper), got {dataframe.shape[0]}.")

    # Slice columns by position. The explicit copy matters: assigning into a
    # bare iloc slice triggers SettingWithCopyWarning and leaves it ambiguous
    # whether the caller's frame is being written to.
    df = dataframe.iloc[:, :need_cols].copy()

    # Rename columns to canonical names (first col -> paper_id_col, rest -> criteria)
    df.columns = [paper_id_col] + list(criteria)

    # Set index to predefined order (position-based: the ids found in the file
    # are deliberately *overwritten* with paper_ids).
    df[paper_id_col] = list(paper_ids)
    df = df.set_index(paper_id_col)

    # Canonicalize labels
    for c in criteria:
        df[c] = df[c].apply(_canon_label)

    return df

def build_comparison(llm_df: pd.DataFrame, gt_df: pd.DataFrame, paper_ids: list[str], criteria: list[str], sep: str) -> pd.DataFrame:
    """Build the side-by-side table of LLM vs. ground-truth labels.

    The result has one row per criterion and one column per paper; each cell
    is the string '<llm value><sep><ground-truth value>', with missing values
    rendered as empty strings. Both inputs are expected to be indexed by paper
    id with one column per criterion.
    """
    def _as_text(value: object) -> str:
        # Missing cells are shown as an empty side of the pair.
        if pd.isna(value):
            return ""
        return str(value)

    table = pd.DataFrame(index=criteria, columns=paper_ids, dtype=object)
    for paper in paper_ids:
        for criterion in criteria:
            llm_text = _as_text(llm_df.loc[paper, criterion])
            gt_text = _as_text(gt_df.loc[paper, criterion])
            table.at[criterion, paper] = "{}{}{}".format(llm_text, sep, gt_text)
    return table

In [44]:
# Align both raw frames to the predefined papers/criteria (position-based).
llm = load_by_position(df_llm, LLM_CSV_PATH, PAPER_IDS, CRITERIA, PAPER_ID_COL)
gt = load_by_position(df_gt, GT_CSV_PATH, PAPER_IDS, CRITERIA, PAPER_ID_COL)

# =========================
# Build and display comparison
# =========================
comparison = build_comparison(llm, gt, PAPER_IDS, CRITERIA, DISPLAY_SEPARATOR)

# Per-criterion accuracy column (one value per row of `comparison`).
# Accuracy = share of papers whose LLM label equals the ground-truth label.
# Only papers with a non-empty ground-truth label count towards the denominator.
acc_values = []
for criterion in CRITERIA:
    llm_labels = llm.loc[PAPER_IDS, criterion]
    gt_labels = gt.loc[PAPER_IDS, criterion]
    has_truth = gt_labels.astype(str).str.len() > 0
    n_scored = int(has_truth.sum())
    if n_scored == 0:
        # Nothing to score against for this criterion.
        acc_values.append("n/a")
        continue
    n_correct = int((llm_labels[has_truth] == gt_labels[has_truth]).sum())
    pct_correct = n_correct / n_scored * 100.0
    acc_values.append(f"{n_correct}/{n_scored} ({pct_correct:.1f}%)")

comparison["accuracy"] = acc_values

print("=== Criteria x Papers (LLM / Ground Truth) with per-criterion accuracy ===")
display(comparison)

# Optional: uncomment to persist the table.
# comparison.to_csv("rob_comparison_llm_vs_gt_based.csv", index=True)

=== Criteria x Papers (LLM / Ground Truth) with per-criterion accuracy ===


Unnamed: 0,"Allcott, H (2011) (1)","Ayres, Raseman, Shih, 2012","Bager, S; Mundaca, L (2017)","Carroll, J; Lyons, S; Denny, E (2014)","Houde, S; Todd, A; Sudarshan, A; Flora, JA; Armel, KC (2013)","Marangoni and Tavoni, 2021","Matsukawa, I. (2018)",accuracy
criteria a.1,yes / yes,no / no,no / yes,no / no,no / no,no / no,yes / yes,6/7 (85.7%)
criteria a.2,yes / yes,yes / yes,yes / yes,yes / yes,yes / yes,no / yes,no / yes,5/7 (71.4%)
criteria a.3,yes / no,yes / no,no / no,yes / yes,yes / yes,yes / no,no / no,4/7 (57.1%)
criteria a.4,no / yes,no / no,yes / yes,no / yes,no / yes,yes / no,no / no,3/7 (42.9%)
criteria a.5,no / no,no / yes,yes / no,yes / no,yes / yes,yes / no,yes / yes,3/7 (42.9%)
criteria b.1,no / yes,yes / no,yes / no,yes / yes,no / no,no / no,yes / no,3/7 (42.9%)
criteria b.2,yes / yes,yes / no,no / no,no / yes,no / no,no / yes,no / yes,3/7 (42.9%)
criteria b.3,yes / no,no / yes,yes / no,no / no,no / no,no / no,yes / yes,4/7 (57.1%)
criteria b.4,no / yes,no / yes,yes / no,no / no,no / no,yes / yes,yes / no,3/7 (42.9%)
criteria b.5,yes / no,yes / yes,no / no,no / no,no / no,no / yes,no / yes,4/7 (57.1%)


# Individual Datasets

In [45]:
# =========================
# show formatted LLM and GT DataFrames
# =========================
# `llm` comes out of load_by_position with every cell already canonicalized,
# so it is shown directly. Re-applying _canon_label cell by cell would not
# change any value, and DataFrame.applymap is deprecated in recent pandas.
print("=== LLM DataFrame (formatted) ===")
display(llm)

=== LLM DataFrame (formatted) ===


Unnamed: 0_level_0,criteria a.1,criteria a.2,criteria a.3,criteria a.4,criteria a.5,criteria b.1,criteria b.2,criteria b.3,criteria b.4,criteria b.5,criteria c,criteria d.1,criteria d.2,criteria d.3,criteria e.1,criteria e.2,criteria e.3,criteria e.4
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
"Allcott, H (2011) (1)",yes,yes,yes,no,no,no,yes,yes,no,yes,yes,yes,no,no,yes,no,no,no
"Ayres, Raseman, Shih, 2012",no,yes,yes,no,no,yes,yes,no,no,yes,yes,yes,no,yes,yes,yes,yes,yes
"Bager, S; Mundaca, L (2017)",no,yes,no,yes,yes,yes,no,yes,yes,no,yes,yes,no,yes,no,yes,no,yes
"Carroll, J; Lyons, S; Denny, E (2014)",no,yes,yes,no,yes,yes,no,no,no,no,no,yes,no,yes,yes,no,yes,yes
"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Armel, KC (2013)",no,yes,yes,no,yes,no,no,no,no,no,yes,no,no,yes,no,no,yes,no
"Marangoni and Tavoni, 2021",no,no,yes,yes,yes,no,no,no,yes,no,no,no,yes,yes,yes,yes,yes,yes
"Matsukawa, I. (2018)",yes,no,no,no,yes,yes,no,yes,yes,no,no,yes,no,yes,yes,no,yes,yes


In [46]:
# `gt` comes out of load_by_position with every cell already canonicalized,
# so it is shown directly. Re-applying _canon_label cell by cell would not
# change any value, and DataFrame.applymap is deprecated in recent pandas.
print("=== Ground Truth DataFrame (formatted) ===")
display(gt)

=== Ground Truth DataFrame (formatted) ===


Unnamed: 0_level_0,criteria a.1,criteria a.2,criteria a.3,criteria a.4,criteria a.5,criteria b.1,criteria b.2,criteria b.3,criteria b.4,criteria b.5,criteria c,criteria d.1,criteria d.2,criteria d.3,criteria e.1,criteria e.2,criteria e.3,criteria e.4
paper_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
"Allcott, H (2011) (1)",yes,yes,no,yes,no,yes,yes,no,yes,no,no,no,no,yes,no,no,no,yes
"Ayres, Raseman, Shih, 2012",no,yes,no,no,yes,no,no,yes,yes,yes,no,no,yes,yes,yes,no,yes,yes
"Bager, S; Mundaca, L (2017)",yes,yes,no,yes,no,no,no,no,no,no,no,no,yes,yes,no,yes,no,no
"Carroll, J; Lyons, S; Denny, E (2014)",no,yes,yes,yes,no,yes,yes,no,no,no,no,no,no,no,yes,no,no,yes
"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Armel, KC (2013)",no,yes,yes,yes,yes,no,no,no,no,no,no,no,yes,no,no,yes,no,yes
"Marangoni and Tavoni, 2021",no,yes,no,no,no,no,yes,no,yes,yes,yes,yes,yes,no,no,yes,yes,no
"Matsukawa, I. (2018)",yes,yes,no,no,yes,no,yes,yes,no,yes,yes,no,yes,no,yes,no,no,yes


In [47]:
# =========================
# show raw LLM and GT DataFrames
# =========================
# df_llm is the frame exactly as read from the CSV: labels such as
# 'probably yes' / 'probably no' are still present here, unlike in `llm`.
print("=== LLM DataFrame (raw) ===")
df_llm

=== LLM DataFrame (raw) ===


Unnamed: 0,paper_id,criteria a.1,criteria a.2,criteria a.3,criteria a.4,criteria a.5,criteria b.1,criteria b.2,criteria b.3,criteria b.4,criteria b.5,criteria c,criteria d.1,criteria d.2,criteria d.3,criteria e.1,criteria e.2,criteria e.3,criteria e.4
0,"Allcott, H (2011) (1)",yes,yes,probably yes,no,no,no,yes,yes,probably no,yes,yes,yes,no,no,yes,no,probably no,no
1,"Ayres, Raseman, Shih, 2012",probably no,probably yes,yes,no,probably no,probably yes,probably yes,no,no,probably yes,yes,yes,probably no,yes,probably yes,probably yes,probably yes,yes
2,"Bager, S; Mundaca, L (2017)",probably no,yes,probably no,yes,probably yes,probably yes,no,yes,yes,no,probably yes,yes,no,yes,probably no,probably yes,probably no,probably yes
3,"Carroll, J; Lyons, S; Denny, E (2014)",no,probably yes,probably yes,no,probably yes,yes,no,no,no,probably no,probably no,probably yes,no,probably yes,yes,no,yes,probably yes
4,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",probably no,probably yes,yes,no,probably yes,no,probably no,probably no,probably no,no,probably yes,no,no,probably yes,probably no,probably no,probably yes,no
5,"Marangoni and Tavoni, 2021",no,probably no,yes,yes,yes,no,no,probably no,yes,probably no,probably no,probably no,probably yes,yes,yes,probably yes,probably yes,yes
6,"Matsukawa, I. (2018)",probably yes,probably no,no,probably no,yes,probably yes,no,yes,probably yes,no,no,probably yes,no,yes,probably yes,probably no,yes,yes


In [48]:
# df_gt is the frame exactly as read from the CSV: labels such as
# 'probably yes' / 'probably no' are still present here, unlike in `gt`.
print("=== Ground Truth DataFrame (raw) ===")
df_gt

=== Ground Truth DataFrame (raw) ===


Unnamed: 0,paper_id,criteria a.1,criteria a.2,criteria a.3,criteria a.4,criteria a.5,criteria b.1,criteria b.2,criteria b.3,criteria b.4,criteria b.5,criteria c,criteria d.1,criteria d.2,criteria d.3,criteria e.1,criteria e.2,criteria e.3,criteria e.4
0,"Allcott, H (2011) (1)",probably yes,probably yes,no,yes,no,yes,yes,probably no,yes,no,no,probably no,no,probably yes,probably no,no,no,probably yes
1,"Ayres, Raseman, Shih, 2012",probably no,probably yes,probably no,probably no,yes,no,no,yes,probably yes,yes,no,no,yes,yes,yes,no,yes,yes
2,"Bager, S; Mundaca, L (2017)",probably yes,yes,no,probably yes,probably no,no,no,probably no,no,probably no,probably no,no,yes,yes,probably no,probably yes,probably no,probably no
3,"Carroll, J; Lyons, S; Denny, E (2014)",probably no,yes,yes,yes,probably no,probably yes,yes,no,no,no,probably no,no,probably no,no,probably yes,probably no,no,yes
4,"Houde, S; Todd, A; Sudarshan, A; Flora, JA; Ar...",probably no,yes,yes,yes,yes,no,no,probably no,probably no,probably no,no,probably no,yes,no,probably no,yes,probably no,probably yes
5,"Marangoni and Tavoni, 2021",probably no,probably yes,probably no,probably no,no,no,probably yes,no,yes,yes,probably yes,yes,yes,probably no,no,yes,yes,no
6,"Matsukawa, I. (2018)",yes,yes,no,probably no,yes,no,yes,yes,probably no,probably yes,probably yes,no,probably yes,no,probably yes,probably no,no,probably yes


# Risk Level Comparison (coming soon)