# Setup and global variables

In [None]:
from pathlib import Path

import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from IPython.display import display, HTML
from tqdm import tqdm

from src.prioritization import *

In [None]:
# Select the "debug" configuration profile.
# NOTE(review): presumably read by load_config() — keep this assignment ahead
# of the config import/call below.
os.environ["CONFIG_ENV"] = "debug"

from config import load_config
config = load_config()

RESOLUTION = config['DEFAULTS']['resolution']

# input data
# Locations that are joined with `/` further down are wrapped in Path so those
# joins work whether the config yields plain strings or Path objects.
HOLD_OUT_DATA_PATH = Path(config['PATHS']['hold_out_set'])
STORAGE_PATH = Path(config['PATHS']['storage'])
RAW_SURVEY_RESPONSES_PATH = config['PATHS']['raw_survey_responses']
RAW_SURVEY_FEEDBACK_PATH = config['PATHS']['raw_survey_feedback']
CACHED_PRIORITIZATIONS_PATH = Path(config['PATHS']['hold_out_prioritizations'])

# output data
BENCHMARK_OUTPUT_PATH = Path(config['PATHS']['benchmark_dataset'])
IMAGE_DIR = config['PATHS']['images']

# make sure both output directories exist before anything is written
os.makedirs(BENCHMARK_OUTPUT_PATH, exist_ok=True)
os.makedirs(IMAGE_DIR, exist_ok=True)

***

# Loading data

In [None]:
def _read_survey_export(path):
    """Read a ';'-separated survey export and parse its 'timestamp' column."""
    return pd.read_csv(path, sep=';', parse_dates=['timestamp'])


# items and defects tables from the storage directory (first column is the index)
items, defects = (
    pd.read_csv(STORAGE_PATH / file_name, index_col=0)
    for file_name in ('items.csv', 'defects.csv')
)

# raw survey exports
responses = _read_survey_export(RAW_SURVEY_RESPONSES_PATH)
feedback = _read_survey_export(RAW_SURVEY_FEEDBACK_PATH)

# hold-out logs; the defect log's column labels are cast to int
log = pd.read_csv(
    HOLD_OUT_DATA_PATH / 'log.csv',
    index_col=0,
    parse_dates=['time'],
)
defect_log = pd.read_csv(
    HOLD_OUT_DATA_PATH / 'defect_log.csv',
    index_col=0,
)
defect_log.columns = defect_log.columns.astype(int)

# collapse to one row per (submission id, respondent) pair via groupby(...).first()
responses = (
    responses
    .groupby(['submission id', 'respondent'])
    .first()
    .reset_index()
)

# restrict both logs to the submissions that occur in the survey;
# the defect log is aligned to the filtered log's index
survey_submissions = responses['submission id'].unique()
log = log.loc[survey_submissions]
defect_log = defect_log.loc[log.index]

In [None]:
# heuristic scores are used as features; both cached tables are ';'-separated
# and use their first column as the index
discrete_features, continuous_features = (
    pd.read_csv(CACHED_PRIORITIZATIONS_PATH / score_file, index_col=0, sep=';')
    for score_file in ('discrete_scores.csv', 'continuous_scores.csv')
)

# drop score rows whose submission does not occur in the survey
discrete_features = discrete_features.loc[
    discrete_features['submission id'].isin(survey_submissions)
]
continuous_features = continuous_features.loc[
    continuous_features['submission id'].isin(survey_submissions)
]

***
# Analysis

## Validation

In [None]:
# print column dtypes and non-null counts of the survey responses
responses.info()

In [None]:
# print column dtypes and non-null counts of the survey feedback
feedback.info()

In [None]:
def _top_vote_is_shared(answer_counts):
    """Return True when more than one answer reaches the group's highest vote count."""
    highest = answer_counts['count'].max()
    return (answer_counts['count'] == highest).sum() > 1


# number of votes each answer received within each submission
vote_counts = (
    responses
    .groupby(['submission id', 'answer'])
    .size()
    .reset_index(name='count')
)

# per submission: is the winning answer tied with at least one other answer?
ties = (
    vote_counts
    .groupby('submission id', group_keys=False)
    .apply(_top_vote_is_shared, include_groups=False)
)

In [None]:
# distinct respondents in the survey
print('Number of respondents:', responses['respondent'].nunique())

# mean count of non-null answers, grouped by respondent and by submission
print(
    'Average number of responses:',
    responses.groupby('respondent')['answer'].count().mean(),
)
print(
    'Average number of answers per submission:',
    responses.groupby('submission id')['answer'].count().mean(),
)

# share of submissions whose top answer is tied, as a percentage with two decimals
print('Percentage of tied results:', np.round(ties.mean() * 100, 2), '%')

***
# Dataset Construction

## Defect pairs

In [None]:
# extract defect pairs
# long format: one row per (submission id, defect id) with a positive count
long_defects = (
    defect_log
    .melt(var_name='defect id', value_name='count', ignore_index=False)
    .reset_index(names=['submission id'])
    .loc[lambda frame: frame['count'] > 0]
)

# pair each response's answer ("left") with every defect listed for the same
# submission ("right"); the left merge keeps every response row
defect_pairs = responses.merge(long_defects, on="submission id", how="left")
defect_pairs = defect_pairs.rename(columns={"answer": "left", "defect id": "right"})
defect_pairs = defect_pairs[["submission id", "left", "right"]]

# remove self-pairs
is_self_pair = defect_pairs["left"] == defect_pairs["right"]
defect_pairs = defect_pairs[~is_self_pair]

# add negated pairs: the same rows with the sides swapped and the label flipped
defect_pairs = defect_pairs.assign(**{'left won': True})
negated_pairs = (
    defect_pairs
    .rename(columns={"left": "right", "right": "left"})
    .assign(**{'left won': False})
)
defect_pairs = pd.concat([defect_pairs, negated_pairs]).reset_index(drop=True)


## Heuristics as features

In [None]:
def construct_features(defect_pairs: pd.DataFrame, features: pd.DataFrame, which: str="left", suffix: str=""):
    """
    Look up the heuristic scores for one side of every defect pair.

    Each pair's ("submission id", `which`) values are matched against the
    ("submission id", "defect id") rows of `features` with a left merge, so
    the output follows the order of the pairs and pairs without a match get
    NaN scores. The key columns are dropped afterwards.

    Args:
        defect_pairs (pd.DataFrame): Defect pairs; must contain "submission id" and the `which` column.
        features (pd.DataFrame): Heuristic scores keyed by "submission id" and "defect id".
        which (str): Name of the pair column holding the defect to look up.
        suffix (str): Text appended to every returned column name.
    Returns:
        pd.DataFrame: The remaining (score) columns, renamed with `suffix`.
    Raises:
        ValueError: If `which` is not a column of `defect_pairs`.
    """
    if which not in defect_pairs.columns:
        raise ValueError(f"Column '{which}' not found in defect pairs.")

    pair_keys = ["submission id", which]
    feature_keys = ["submission id", "defect id"]

    matched = pd.merge(
        defect_pairs[pair_keys],
        features,
        how="left",
        left_on=pair_keys,
        right_on=feature_keys,
    )
    # only the score columns should survive; every key column is removed
    scores_only = matched.drop(columns=["defect id", "submission id", which])
    return scores_only.add_suffix(suffix)

In [None]:
# combine to a single dataframe: the pairs, followed by one block of score
# columns per (score table, pair side) combination
feature_blocks = [
    construct_features(defect_pairs, scores, side, label)
    for scores, side, label in (
        (discrete_features, "left", " (Left Discrete)"),
        (discrete_features, "right", " (Right Discrete)"),
        (continuous_features, "left", " (Left Continuous)"),
        (continuous_features, "right", " (Right Continuous)"),
    )
]
df = pd.concat([defect_pairs, *feature_blocks], axis=1)

## Export

In [None]:
# write the combined benchmark table to the output directory, omitting the row index
df.to_csv(BENCHMARK_OUTPUT_PATH / 'benchmark_dataset.csv', index=False)