In [2]:
"""
Supplementary Material: Inter-rater Reliability (Cohen's Kappa + Observed Agreement)

This script calculates inter-rater agreement between two reviewers.

It computes:
1) Cohen’s kappa for each binary coding item:
   - ISO/IEC 25010 attributes: FS, PE, C, U, R, S, M, P
   - Quality appraisal items: Q1–Q6
2) Observed agreement (%) for each item
3) An overall Cohen’s kappa and observed agreement across the ISO/IEC 25010 block (FS–P)
   by pooling all ISO decisions across studies and attributes.

Expected Excel columns:
- PS_ID, Review_ID
- ISO binary columns: FS, PE, C, U, R, S, M, P
- QA binary columns: Q1, Q2, Q3, Q4, Q5, Q6
- Review_ID identifies the reviewer; both "R_1" and "R_2" must be present, with at most one row per study (PS_ID) for each reviewer.
"""

import os
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

# -----------------------
# 1) Load extraction file
# -----------------------
# Spreadsheet holding the coding decisions of both reviewers.
FILE_PATH = "SLR_PS1_78.xlsx"   # <-- recommended repo path
df = pd.read_excel(FILE_PATH)

# -----------------------
# 2) Configuration
# -----------------------
# Binary coding items: ISO/IEC 25010 attributes, then quality appraisal items.
ISO_COLS = ["FS", "PE", "C", "U", "R", "S", "M", "P"]
QAS_COLS = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6"]
ALL_BINARY_COLS = [*ISO_COLS, *QAS_COLS]

# Review_ID labels of the two reviewers being compared.
R1, R2 = "R_1", "R_2"

# -----------------------
# 2.1) Basic checks
# -----------------------
# Fail early if the spreadsheet lacks any identifier or coding column.
required_cols = {"PS_ID", "Review_ID", *ALL_BINARY_COLS}
missing = required_cols.difference(df.columns)
if missing:
    raise ValueError(f"Missing required columns in Excel file: {sorted(missing)}")

# Both reviewer labels must occur in Review_ID (blank cells are ignored).
reviewers = set(df["Review_ID"].dropna().unique())
if not {R1, R2}.issubset(reviewers):
    raise ValueError(f"Expected reviewers '{R1}' and '{R2}' in Review_ID. Found: {sorted(reviewers)}")

# ------------------------------------------
# 3) Agreement for a single binary column
# ------------------------------------------
def aligned_ratings(df: pd.DataFrame, col: str, r1: str = R1, r2: str = R2):
    """Return the paired ratings of two reviewers for one coding column.

    Only rows whose ``Review_ID`` equals ``r1`` or ``r2`` are used, so blank
    rows or rows belonging to any other reviewer cannot interfere with the
    pairing. The remaining rows are pivoted to one row per ``PS_ID`` and
    studies that lack a rating from either reviewer are dropped.

    Args:
        df: Extraction table containing ``PS_ID``, ``Review_ID`` and ``col``.
        col: Name of the coding column to align.
        r1: ``Review_ID`` value of the first reviewer.
        r2: ``Review_ID`` value of the second reviewer.

    Returns:
        Tuple ``(y1, y2)`` of integer Series sharing the same ``PS_ID`` index:
        the ratings of ``r1`` and ``r2`` for studies rated by both.

    Raises:
        ValueError: If the rows cannot be pivoted, which happens when a
            reviewer has more than one row for the same ``PS_ID``.
        KeyError: If ``r1`` or ``r2`` has no rows in ``df``.
    """
    # Restrict to the two reviewers of interest before reshaping.
    pair = df[df["Review_ID"].isin([r1, r2])]

    try:
        piv = pair.pivot(index="PS_ID", columns="Review_ID", values=col)
    except ValueError as err:
        # Same exception type as pandas raises, but with actionable context.
        raise ValueError(
            f"Cannot pair ratings for column '{col}': each reviewer must have "
            f"at most one row per PS_ID ({err})"
        ) from err

    # Keep only the studies for which both reviewers supplied a rating.
    mask = piv[r1].notna() & piv[r2].notna()
    y1 = piv.loc[mask, r1].astype(int)
    y2 = piv.loc[mask, r2].astype(int)
    return y1, y2

def kappa_for_column(df: pd.DataFrame, col: str, r1: str = R1, r2: str = R2) -> float:
    """Compute Cohen's kappa between two reviewers for a single column.

    Returns NaN when there is no study with a rating from both reviewers.
    """
    ratings_a, ratings_b = aligned_ratings(df, col, r1, r2)
    has_pairs = len(ratings_a) > 0
    return cohen_kappa_score(ratings_a, ratings_b) if has_pairs else np.nan

def observed_agreement_for_column(df: pd.DataFrame, col: str, r1: str = R1, r2: str = R2) -> float:
    """Compute the percentage of studies on which both reviewers agree.

    Returns NaN when there is no study with a rating from both reviewers.
    """
    ratings_a, ratings_b = aligned_ratings(df, col, r1, r2)
    if len(ratings_a) > 0:
        # Share of identical ratings, expressed on a 0-100 scale.
        matches = ratings_a == ratings_b
        return float(matches.mean() * 100.0)
    return np.nan

# ----------------------------------------------------------
# 4) Overall ISO agreement by pooling all ISO decisions
# ----------------------------------------------------------
def overall_iso_agreement(df: pd.DataFrame, iso_cols=ISO_COLS, r1: str = R1, r2: str = R2):
    """Compute kappa and observed agreement over the pooled ISO decisions.

    The paired ratings of every column in ``iso_cols`` are concatenated into
    one long sequence per reviewer before the statistics are computed.

    Returns:
        Tuple ``(kappa, observed_agreement_percent)``; both are NaN when no
        paired rating exists in any of the columns.
    """
    pairs = [aligned_ratings(df, attribute, r1, r2) for attribute in iso_cols]
    pooled_a = [value for first, _ in pairs for value in first.tolist()]
    pooled_b = [value for _, second in pairs for value in second.tolist()]

    if not pooled_a:
        return np.nan, np.nan

    matches = np.asarray(pooled_a) == np.asarray(pooled_b)
    return cohen_kappa_score(pooled_a, pooled_b), float(matches.mean() * 100.0)

# -----------------------
# 5) Run computations
# -----------------------
# One result row per coded item: ISO attributes first, then QA items.
rows = []
for col in ALL_BINARY_COLS:
    rows.append({
        "Item": col,
        "Cohens_Kappa": kappa_for_column(df, col),
        "Observed_Agreement_%": observed_agreement_for_column(df, col)
    })

results = pd.DataFrame(rows)

# Pooled agreement across all ISO attribute decisions.
iso_kappa, iso_obs = overall_iso_agreement(df)

# Add overall ISO row (nice for tables)
results = pd.concat([
    results,
    pd.DataFrame([{
        "Item": "Overall_ISO_FS_to_P",
        "Cohens_Kappa": iso_kappa,
        "Observed_Agreement_%": iso_obs
    }])
], ignore_index=True)

# -----------------------
# 6) Output
# -----------------------
print("\n=== Agreement by item (kappa + observed agreement) ===")
print(results.to_string(index=False))

print("\n=== Overall ISO (FS–P) ===")
# Fixed precision keeps the summary consistent with the observed-agreement
# line below; the CSV still stores the unrounded values.
print(f"Cohen's kappa: {iso_kappa:.3f}")
print(f"Observed agreement (%): {iso_obs:.2f}")

# Save outputs for replication package
os.makedirs("output", exist_ok=True)
results.to_csv("output/agreement_results.csv", index=False)
print("\nSaved: output/agreement_results.csv")



=== Agreement by item (kappa + observed agreement) ===
               Item  Cohens_Kappa  Observed_Agreement_%
                 FS      1.000000            100.000000
                 PE      0.945871             97.435897
                  C      0.692375             88.461538
                  U      0.656388             92.307692
                  R      0.702290             88.461538
                  S      0.961576             98.717949
                  M      0.808260             93.589744
                  P      0.469388             92.307692
                 Q1      1.000000            100.000000
                 Q2      0.000000             89.743590
                 Q3      0.135662             57.692308
                 Q4      0.383399             89.743590
                 Q5      0.627151             87.179487
                 Q6      0.304813             74.358974
Overall_ISO_FS_to_P      0.848076             93.910256

=== Overall ISO (FS–P) ===
Cohen's kappa: 0.848