# Setup and global variables

In [None]:
from pathlib import Path

import os
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from IPython.display import display, HTML
from tqdm import tqdm
from scipy.stats import entropy, kendalltau

from itertools import combinations
from sklearn.metrics import cohen_kappa_score

from src.prioritization import *
from src.feature_engineering import add_heuristic_scores

In [None]:
# Select the configuration profile. The environment variable is set before
# `config` is imported (presumably the loader reads CONFIG_ENV — confirm).
use_production_config = False  # flip to True to use the production profile
os.environ["CONFIG_ENV"] = "production" if use_production_config else "debug"

from config import load_config

config = load_config()
paths = config['PATHS']

RESOLUTION = config['DEFAULTS']['resolution']

# input data
HOLD_OUT_DATA_PATH = paths['teacher_hold_out_set']
STORAGE_PATH = paths['storage']
RAW_SURVEY_RESPONSES_PATH = paths['raw_survey_responses']
RAW_SURVEY_FEEDBACK_PATH = paths['raw_survey_feedback']
CACHED_PRIORITIZATIONS_PATH = paths['teacher_hold_out_prioritizations']

# output data
BENCHMARK_OUTPUT_PATH = paths['benchmark_dataset']
IMAGE_DIR = paths['images'] / 'survey_results'

# Create both output locations up front so later writes cannot fail on a
# missing directory.
for output_dir in (BENCHMARK_OUTPUT_PATH, IMAGE_DIR):
    os.makedirs(output_dir, exist_ok=True)

***

# Loading data

In [None]:
def _read_survey_csv(path):
    """Read a semicolon-separated survey export, parsing its `timestamp` column."""
    return pd.read_csv(path, sep=';', parse_dates=['timestamp'])


# item and defect tables, indexed by their first column
items, defects = (
    pd.read_csv(STORAGE_PATH / file_name, index_col=0)
    for file_name in ('items.csv', 'defects.csv')
)

# raw survey exports
responses = _read_survey_csv(RAW_SURVEY_RESPONSES_PATH)
feedback = _read_survey_csv(RAW_SURVEY_FEEDBACK_PATH)

# hold-out submissions and their per-defect columns
log = pd.read_csv(HOLD_OUT_DATA_PATH / 'log.csv', index_col=0, parse_dates=['time'])
defect_log = pd.read_csv(HOLD_OUT_DATA_PATH / 'defect_log.csv', index_col=0)
# column labels are read as strings; later cells treat them as integer defect ids
defect_log.columns = defect_log.columns.astype(int)

# load heuristic scores as features
discrete_features, continuous_features = (
    pd.read_csv(CACHED_PRIORITIZATIONS_PATH / file_name, index_col=0, sep=';')
    for file_name in ('discrete_scores.csv', 'continuous_scores.csv')
)

In [None]:
# model metadata: every prioritization model is built from the same two tables
data = items, defects

model_classes = (
    TaskCommonModel,
    TaskCharacteristicModel,
    StudentCommonModel,
    StudentCharacteristicModel,
    StudentEncounteredBeforeModel,
    DefectMultiplicityModel,
    SeverityModel,
)
models = [model_class(*data) for model_class in model_classes]

# shift -2-2 scale to 1-5 scale
# NOTE: not idempotent — re-running this cell shifts the columns again.
for model in models:
    if model.get_discretization_scale() != '-2-2':
        continue
    name = model.get_model_name()
    discrete_features[name] = discrete_features[name].add(3)

In [None]:
# Simplify respondent ids: raw ids become R1, R2, ... numbered in sorted order
# of the raw ids found in `responses`.
unique_ids = sorted(responses["respondent"].unique())
id_map = {raw_id: f"R{position}" for position, raw_id in enumerate(unique_ids, start=1)}

# Ids that appear only in `feedback` have no entry in the map and become NaN.
for frame in (responses, feedback):
    frame["respondent"] = frame["respondent"].map(id_map)

***

# Quality filtering

In [None]:
# keep only a single response per user-item pair: the first recorded row.
# groupby(...).first() is avoided here because it returns the first non-null
# value of each column separately, so one kept "response" could combine
# values (e.g. answer and comment) from different duplicate rows.
key_columns = ['submission id', 'respondent']
responses = (
    responses
    .dropna(subset=key_columns)                       # rows without a key cannot be attributed
    .drop_duplicates(subset=key_columns, keep='first')
    .sort_values(key_columns)
    .reset_index(drop=True)
)
# key columns first, remaining columns in their original order
responses = responses[key_columns + [c for c in responses.columns if c not in key_columns]]

# keep only survey submissions
survey_submissions = responses['submission id'].unique()
log = log.loc[survey_submissions]
defect_log = defect_log.loc[log.index]
discrete_features = discrete_features[discrete_features['submission id'].isin(survey_submissions)]
continuous_features = continuous_features[continuous_features['submission id'].isin(survey_submissions)]

***
# Analysis

## Basic stats

In [None]:
# Column dtypes and non-null counts of the filtered survey responses.
responses.info()

In [None]:
# Column dtypes and non-null counts of the survey feedback table.
feedback.info()

In [None]:
# Headline counts for the survey.
respondent_column = responses['respondent']
submission_column = responses['submission id']

n_annotations = len(responses.index)
n_respondents = respondent_column.nunique()
n_submissions = submission_column.nunique()

# Number of rows per group; reused by the summary and coverage cells.
responses_per_submission = responses.groupby('submission id').size()
responses_per_respondent = responses.groupby('respondent').size()

In [None]:
# Headline numbers first, then one describe() summary per grouping.
print(f"Respondents: {n_respondents}")
print(f"Survey submissions seen by respondents: {n_submissions}")
print(f"Total annotations: {n_annotations}")

summaries = (
    ("respondent", responses_per_respondent),
    ("submission", responses_per_submission),
)
for grouping, per_group_counts in summaries:
    print()
    print(f"Responses per {grouping} (summary):")
    print(per_group_counts.describe().to_string())

## Feedback

In [None]:
# Schema overview of the feedback table (same call as in the "Basic stats" section).
feedback.info()

In [None]:
# Peek at the first few feedback rows.
display(feedback.head())

In [None]:
# Size of the feedback table and how many distinct respondents left feedback
# (nunique ignores NaN respondents, e.g. ids that were not in the id map).
print("Number of feedback entries:", len(feedback))
print("Unique respondents:", feedback['respondent'].nunique())

## Comments

In [None]:
# Responses that carry a comment, reduced to the columns needed for reading them.
has_comment = responses["comment"].notna()
comments = responses.loc[has_comment, ["submission id", "respondent", "comment"]]

comment_share = round(100 * len(comments) / len(responses), 1)

print("Number of comments:", len(comments))
print("Percentage of responses with comments:", comment_share, "%")
print("Unique commenters:", comments["respondent"].nunique())

In [None]:
# List the comments submission by submission, one line per respondent.
for submission_id, submission_comments in comments.groupby("submission id"):
    print(f"\n=== Submission {submission_id} ===")
    authors_and_texts = zip(submission_comments["respondent"], submission_comments["comment"])
    for author, text in authors_and_texts:
        print(f"- {author}: {text}")

In [None]:
# List the same comments respondent by respondent, tagged with the submission id.
for respondent, respondent_comments in comments.groupby("respondent"):
    print(f"\n=== {respondent} ===")
    ids_and_texts = zip(respondent_comments["submission id"], respondent_comments["comment"])
    for commented_submission, text in ids_and_texts:
        print(f"[{commented_submission}] {text}")

## Coverage

In [None]:
# How many submissions received exactly 1, 2, 3, ... responses.
submissions_per_level = responses_per_submission.value_counts().sort_index()
level_sizes = submissions_per_level.values

coverage_df = pd.DataFrame({
    'n_responses': submissions_per_level.index,
    'n_submissions': level_sizes,
    'pct_submissions': 100 * level_sizes / level_sizes.sum(),
})
coverage_df = coverage_df.reset_index(drop=True)
coverage_df = coverage_df.sort_values('n_responses')
display(coverage_df)

In [None]:
# Cumulative share of submissions that have at most x responses.
plt.figure(figsize=(8, 5))
plt.plot(
    coverage_df['n_responses'],
    coverage_df['pct_submissions'].cumsum(),  # accumulate only the plotted column
    marker='o'
)

plt.xlabel("Number of Responses per Submission")
plt.ylabel("Cumulative % of Submissions")
plt.title("Cumulative Coverage of Survey Submissions")
plt.grid(True, alpha=0.3)
plt.ylim(0, 100)

plt.tight_layout()
# use the configured resolution like every other figure in this notebook
plt.savefig(IMAGE_DIR / "cumulative_coverage.png", dpi=RESOLUTION)
plt.close()


## Per-submission agreement

In [None]:
def _summarise_votes(submission_id, answers):
    """Build the agreement record for one submission from its answer column."""
    counts = answers.value_counts().sort_index()
    n_votes = counts.sum()
    k = len(counts)

    # Majority vote: every answer that reaches the highest count
    is_leader = (counts == counts.max()).to_numpy()
    top_answers = counts.index[is_leader].tolist()

    # Entropy of the vote distribution, in bits
    H = entropy(counts / n_votes, base=2)

    # Normalized by the maximum for the k answers actually chosen;
    # a unanimous vote (k == 1) keeps divisor 1, giving 0
    max_H = 1 if k <= 1 else math.log2(k)

    return {
        "submission id": submission_id,
        "n_votes": n_votes,
        "n_choices": k,
        "majority": ", ".join(map(str, top_answers)),
        "tie_for_top": len(top_answers) > 1,
        "entropy_bits": H,
        "entropy_norm": H / max_H,
    }


rows = [
    _summarise_votes(submission_id, group["answer"])
    for submission_id, group in responses.groupby("submission id")
]

results = pd.DataFrame(rows).set_index("submission id")


In [None]:
# Share of submissions where two or more answers share the highest vote count
# (mean of the boolean tie flag, scaled to percent).
pct_tied = 100 * results["tie_for_top"].mean()
print(f"Percentage of submissions with tie for top choice: {pct_tied:.2f}%")

In [None]:
# highest disagreement
# Submissions ordered from highest to lowest normalized entropy; as the last
# expression of the cell the sorted frame is rendered as its output.
results.sort_values("entropy_norm", ascending=False)

In [None]:
# Bar chart of normalized entropy, submissions ordered from lowest to highest.
entropy_axes = results["entropy_norm"].sort_values().plot.bar()
entropy_axes.set_title("Normalized Entropy per Submission")
entropy_axes.set_ylabel("Entropy (0 = agreement, 1 = maximum disagreement)")
entropy_axes.set_xlabel("Submission ID")
plt.tight_layout()
plt.savefig(IMAGE_DIR / "submission_entropy.png", dpi=RESOLUTION)
plt.close()


In [None]:
# Bar chart of the vote count per submission, ordered from fewest to most votes.
votes_axes = results["n_votes"].sort_values().plot.bar()
votes_axes.set_title("Number of Responses per Submission")
votes_axes.set_ylabel("Votes")
votes_axes.set_xlabel("Submission ID")
plt.tight_layout()
plt.savefig(IMAGE_DIR / "submission_n_responses.png", dpi=RESOLUTION)
plt.close()


In [None]:
# Histogram of normalized vote entropy across submissions.
_, hist_axes = plt.subplots(figsize=(6, 4))
hist_axes.hist(results["entropy_norm"].dropna(), bins=12)
hist_axes.set_xlabel('Normalized vote entropy (0=perfect agreement, 1=max)')
hist_axes.set_ylabel('Number of submissions')
hist_axes.set_title('Distribution of submission vote entropy (agreement)')
plt.tight_layout()
plt.savefig(IMAGE_DIR / 'submission_entropy_distribution.png', dpi=RESOLUTION)
plt.close()

## Inter-annotator agreement

In [None]:
# One answer series per respondent, indexed by submission id. Built once so
# the pair loop does not re-filter `responses` for every combination.
answers_by_respondent = {
    respondent: group.set_index('submission id')['answer']
    for respondent, group in responses.groupby('respondent')
}

pair_kappas = []

for a, b in combinations(id_map.values(), 2):
    a_answers = answers_by_respondent.get(a)
    b_answers = answers_by_respondent.get(b)
    if a_answers is None or b_answers is None:  # respondent without any response
        continue

    # Only consider overlapping submissions
    common = a_answers.index.intersection(b_answers.index)
    if len(common) < 2:  # too few items
        continue

    k = cohen_kappa_score(a_answers.loc[common], b_answers.loc[common])
    # Kappa can come back as NaN (0/0) when both raters used one and the same
    # label on every shared item; a single NaN would turn the mean and median
    # below into NaN, so such pairs are left out.
    if np.isnan(k):
        continue
    pair_kappas.append(k)

pair_kappas = np.array(pair_kappas)

print("Pairwise Cohen's kappa (overlapping annotator pairs):")
print(f"  Number of pairs considered: {len(pair_kappas)}")
if len(pair_kappas) > 0:
    print(f"  Mean kappa:   {pair_kappas.mean():.3f}")
    print(f"  Median kappa: {np.median(pair_kappas):.3f}")
if len(pair_kappas) > 1:
    # the sample standard deviation (ddof=1) needs at least two pairs
    print(f"  Std kappa:    {pair_kappas.std(ddof=1):.3f}")


In [None]:
# Histogram (with KDE overlay) of the pairwise kappa values.
_, kappa_axes = plt.subplots(figsize=(6, 4))
sns.histplot(pair_kappas, bins=10, kde=True, ax=kappa_axes)
kappa_axes.set_xlabel("Cohen's kappa")
kappa_axes.set_ylabel("Number of annotator pairs")
kappa_axes.set_title("Distribution of Pairwise Cohen's Kappa")
plt.tight_layout()
plt.savefig(IMAGE_DIR / 'annotator_pairwise_kappa.png', dpi=RESOLUTION)
plt.close()


In [None]:
def fleiss_kappa(table):
    """Compute Fleiss' kappa from a count table (N_items x k_categories).

    The number of ratings may differ between items. Items with fewer than two
    ratings carry no agreement information (their pairwise agreement is 0/0)
    and are excluded from both the observed and the expected agreement.

    Args:
        table: 2-D array-like of counts; ``table[i][j]`` is the number of
            raters who assigned item ``i`` to category ``j``.

    Returns:
        Fleiss' kappa as a float, or ``nan`` when it is undefined: no item has
        at least two ratings, or all ratings fall into a single category.

    Raises:
        ValueError: if ``table`` is not two-dimensional.
    """
    table = np.asarray(table, dtype=float)
    if table.ndim != 2:
        raise ValueError("table must be 2-dimensional (items x categories)")

    n_raters_per_item = table.sum(axis=1)

    # Keep only items on which agreement between raters can be measured.
    rated = n_raters_per_item >= 2
    if not rated.any():
        return np.nan
    table = table[rated]
    n_raters_per_item = n_raters_per_item[rated]

    # Probabilities per category
    p_j = table.sum(axis=0) / table.sum()
    # Agreement per item: share of rater pairs that chose the same category
    P_i = (table * (table - 1)).sum(axis=1) / (n_raters_per_item * (n_raters_per_item - 1))

    P_bar = P_i.mean()
    P_e = (p_j ** 2).sum()

    chance_free = 1 - P_e
    return (P_bar - P_e) / chance_free if chance_free != 0 else np.nan


In [None]:
# Build contingency table: one row per submission, one column per distinct answer
all_answers = sorted(responses['answer'].unique())
answer_to_idx = dict(zip(all_answers, range(len(all_answers))))
submission_ids = responses['submission id'].unique()
submission_idx_map = {sid: position for position, sid in enumerate(submission_ids)}

table = np.zeros((len(submission_ids), len(all_answers)), dtype=int)

for sid, grp in responses.groupby('submission id'):
    tallies = grp['answer'].value_counts()
    columns = [answer_to_idx[ans] for ans in tallies.index]
    # write all answer counts of this submission in one assignment
    table[submission_idx_map[sid], columns] = tallies.to_numpy()

fleiss = fleiss_kappa(table)
print(f"Fleiss' kappa (all submissions): {fleiss:.3f}")


***
# Dataset Construction

## Defect pairs

In [None]:
# extract defect pairs
# Long format: one row per (submission, defect) with a positive count.
long_defects = defect_log.melt(var_name='defect id', value_name='count', ignore_index=False).reset_index(names=['submission id'])
long_defects = long_defects[long_defects['count'] > 0]

# Pair each response's chosen defect ("left") with every defect recorded for
# the same submission ("right"). An inner join is used so that a response
# whose submission has no recorded defect cannot produce a pair with a
# missing (NaN) right-hand defect.
defect_pairs = (
    responses
    .merge(long_defects, on="submission id", how="inner")
    .rename(columns={
        "answer": "left",
        "defect id": "right"
    })[["submission id", "left", "right"]]
)
# remove self-pairs; copy so the column assignments below act on an
# independent frame rather than on a filtered slice
defect_pairs = defect_pairs[defect_pairs["left"] != defect_pairs["right"]].copy()

# add item id
defect_pairs['item'] = log['item'].loc[defect_pairs['submission id']].values

# add negated pairs: the same comparison with the sides swapped and the
# outcome flipped
defect_pairs['left won'] = True
negated_pairs = defect_pairs.rename(columns={"left": "right", "right": "left"})
negated_pairs['left won'] = False
defect_pairs = pd.concat([defect_pairs, negated_pairs]).reset_index(drop=True)


## Heuristics as features

In [None]:
# combine to a single dataframe
# add_heuristic_scores is project code (src.feature_engineering); presumably it
# attaches the discrete and continuous heuristic scores to each defect pair —
# TODO confirm against its implementation.
df = add_heuristic_scores(defect_pairs, discrete_features, continuous_features)

## Weigh all submissions equally

In [None]:
# Each row is weighted by 1 / (number of rows of its submission), so the
# weights of every submission add up to 1.
submission_column = df['submission id']
submission_counts = submission_column.groupby(submission_column).transform('count')
df['weight'] = submission_counts.rdiv(1)

## Export

In [None]:
# Write the benchmark dataset without the positional row index.
df.to_csv(BENCHMARK_OUTPUT_PATH / 'benchmark_dataset.csv', index=False)

In [None]:
# Render the exported dataframe as the cell output for a visual check.
df