In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score
from statsmodels.stats.proportion import proportion_confint

# === 1. Load manually labeled data ===
# NOTE(review): file_path is an empty string here — point it at the labeled
# Excel workbook before running, otherwise read_excel has nothing to open.
file_path = ""
df = pd.read_excel(io=file_path)

# One row per labeled sample, so the row count is the sample count.
print("Total samples:", df.shape[0])


Total samples: 300


In [55]:
# -----------------------------
# 2. Sanity check
# -----------------------------
EXPECTED_N = 300  # number of rows the validation sample is expected to contain

assert "human_substance_use" in df.columns, \
    "Column 'human_substance_use' not found."

total_n = len(df)
assert total_n == EXPECTED_N, f"Expected {EXPECTED_N} samples, got {total_n}"

# -----------------------------
# 3. Precision calculation
# -----------------------------
# Precision = rows a human marked 1 / rows a human actually labeled.
# Rows with a missing label are left out of the denominator (instead of being
# silently counted as negatives), matching how the context and emotion
# accuracy cells count only non-null labels.
substance_labels = df["human_substance_use"]
numerator = int((substance_labels == 1).sum())
denominator = int(substance_labels.notna().sum())
assert denominator > 0, "No human labels found in 'human_substance_use'."
precision = numerator / denominator

# -----------------------------
# 4. 95% Wilson confidence interval
# -----------------------------
# alpha=0.05 gives the 95% interval; method="wilson" selects the Wilson score
# interval for a binomial proportion.
ci_low, ci_high = proportion_confint(
    count=numerator,
    nobs=denominator,
    alpha=0.05,
    method="wilson"
)

# -----------------------------
# 5. Report
# -----------------------------
print(f"Keyword filtering precision: {numerator}/{denominator}")
print(f"Precision: {precision:.3f}")
print(f"95% Wilson CI: [{ci_low:.3f}, {ci_high:.3f}]")

Keyword filtering precision: 270/300
Precision: 0.900
95% Wilson CI: [0.861, 0.929]


In [59]:
# Human-validated accuracy per context: among rows that carry a human label,
# the fraction whose label equals 1.
results = []

for ctx in ("family", "peer", "school"):
    context_labels = df[f"human_context_{ctx}"]
    n_evaluated = context_labels.notna().sum()  # rows with a non-null label
    n_correct = context_labels.eq(1).sum()      # rows labeled 1

    record = {
        "Context": ctx.capitalize(),
        "Human-validated accuracy": n_correct / n_evaluated,
        "Correct (1)": n_correct,
        "Total evaluated": n_evaluated,
    }
    results.append(record)

df_context_acc = pd.DataFrame(results)
print(df_context_acc)

  Context  Human-validated accuracy  Correct (1)  Total evaluated
0  Family                  0.953333          286              300
1    Peer                  0.956667          287              300
2  School                  0.976667          293              300


In [60]:
# Emotion-context validation: fraction of labeled rows whose label equals 1.
emotion_labels = df["human_emotion_context"]
n_emotion = emotion_labels.notna().sum()      # rows with a non-null label
correct_emotion = emotion_labels.eq(1).sum()  # rows labeled 1
emotion_acc = correct_emotion / n_emotion

print(f"Emotion validation accuracy: {correct_emotion}/{n_emotion} = {emotion_acc:.3f}")


Emotion validation accuracy: 273/300 = 0.910


In [61]:
# -----------------------------
# Step 1: LLM binary prediction
# -----------------------------
# An LLM output cell counts as a positive prediction (1) when its text contains
# the literal marker "[1]"; missing cells are replaced by "" and so become 0.
LLM_OUTPUT_COLUMNS = {
    "family": "Family Influence",
    "peer": "Peer Influence",
    "school": "School Environment",
}

for ctx, llm_col in LLM_OUTPUT_COLUMNS.items():
    df[f"llm_context_{ctx}"] = (
        df[llm_col].fillna("").str.contains(r"\[1\]").astype(int)
    )

# -----------------------------
# Step 2: Human-derived ground truth
# -----------------------------
def derive_gt(llm_pred, human_correct):
    """Reconstruct the ground-truth label from the LLM prediction and the human check.

    Parameters
    ----------
    llm_pred : int
        Binary LLM prediction (0 or 1).
    human_correct : scalar
        Human judgement of the prediction; 1 means "the LLM was correct".

    Returns
    -------
    int or float
        ``llm_pred`` when ``human_correct == 1``; ``1 - llm_pred`` for any other
        non-missing value; NaN when ``human_correct`` is missing, so that an
        unreviewed row is not turned into a fabricated "LLM was wrong" label.
    """
    if pd.isna(human_correct):
        return np.nan
    return llm_pred if human_correct == 1 else 1 - llm_pred

for ctx in LLM_OUTPUT_COLUMNS:
    # Pair the two columns positionally instead of a row-wise DataFrame.apply.
    df[f"human_gt_{ctx}"] = [
        derive_gt(pred, flag)
        for pred, flag in zip(df[f"llm_context_{ctx}"], df[f"human_context_{ctx}"])
    ]

# -----------------------------
# Step 3: Agreement metrics
# -----------------------------
# Because human_gt is rebuilt from the LLM prediction plus the human
# correct/incorrect flag, y_true equals y_pred exactly on the rows flagged 1,
# so Accuracy here equals the share of labeled rows the human marked correct.
results = []

for ctx in LLM_OUTPUT_COLUMNS:
    # Score only rows that actually received a human judgement.
    labeled = df[f"human_gt_{ctx}"].notna()
    y_pred = df.loc[labeled, f"llm_context_{ctx}"]
    y_true = df.loc[labeled, f"human_gt_{ctx}"].astype(int)

    results.append({
        "Context": ctx.capitalize(),
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred),
        "Cohen_kappa": cohen_kappa_score(y_true, y_pred)
    })

df_results = pd.DataFrame(results)
print(df_results)

  Context  Accuracy        F1  Cohen_kappa
0  Family  0.953333  0.901408     0.871126
1    Peer  0.956667  0.970522     0.888838
2  School  0.976667  0.885246     0.872356
