In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load the pair table, keep only the columns used in this notebook (in this
# order), and make sure pair_id is an integer key.
all_pair_df = (
    pd.read_csv("../database/data_assignment/pair_information.csv")
    .loc[:, ["pair_id", "video1", "video2", "pair_feature", "dist"]]
    .astype({"pair_id": int})
)

In [3]:
def disagreements(
    mode, pair_df: "pd.DataFrame | None" = None, database_dir="../database"
) -> pd.DataFrame:
    """Score every pair by how much the labelers disagreed on it.

    Reads ``{database_dir}/{mode}/all_user_pref.csv``, averages repeated
    labels that share the same pair and ``user_id``, and attaches two scores
    to each row of the pair table:

    * ``pair_disagreement_score`` -- population variance (``ddof=0``) of the
      per-user mean ``pref`` within the row's ``pair_id``.
    * ``cluster_disagreement_score`` -- the same variance, taken over all
      per-user means that share the row's ``pair_feature``.

    A ``pair_id`` / ``pair_feature`` with no labels at all gets ``np.inf``.

    Parameters
    ----------
    mode : str
        Sub-directory of ``database_dir`` that holds ``all_user_pref.csv``
        (this notebook uses ``"baseline"`` and ``"experiment"``).
    pair_df : pd.DataFrame, optional
        Pair table with at least ``pair_id`` and ``pair_feature`` columns.
        Defaults to the notebook-level ``all_pair_df``.
    database_dir : str, optional
        Root directory containing the per-mode sub-directories.

    Returns
    -------
    pd.DataFrame
        A copy of the pair table with the two score columns added. The input
        table (including the global ``all_pair_df``) is left untouched, so
        calls for different modes cannot leak into each other.

    Raises
    ------
    FileNotFoundError
        If the preference file for ``mode`` does not exist.
    """
    all_user_pref_path = f"{database_dir}/{mode}/all_user_pref.csv"
    if not os.path.exists(all_user_pref_path):
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise FileNotFoundError(
            f"no preference file for mode {mode!r}: {all_user_pref_path}"
        )

    # pair_id must be int to line up with the pair table; pref must be float
    # so that it can be averaged.
    all_user_pref = pd.read_csv(all_user_pref_path).astype(
        {"pair_id": int, "pref": float}
    )

    # Work on a private copy so the caller's table is never mutated.
    scored = (all_pair_df if pair_df is None else pair_df).copy()

    # Collapse repeated labels of the same pair by the same user into one
    # mean pref, so each user contributes a single value per pair.
    per_user_pref = all_user_pref.groupby(
        ["pair_id", "video1", "video2", "user_id", "pair_feature"],
        as_index=False,
    )["pref"].mean()

    score_keys = (
        ("pair_disagreement_score", "pair_id"),
        ("cluster_disagreement_score", "pair_feature"),
    )
    for score_column, key in score_keys:
        # ddof=0 is the population variance, i.e. the same estimator as
        # ndarray.var(); unlike ndarray.var(), missing prefs are skipped
        # instead of turning the whole group's variance into NaN.
        variance_by_key = per_user_pref.groupby(key)["pref"].var(ddof=0)
        # fill_value only applies to keys that have no labels at all.
        scored[score_column] = variance_by_key.reindex(
            pd.Index(scored[key]), fill_value=np.inf
        ).to_numpy()

    return scored

In [4]:
# Build one scored pair table per mode (baseline first, then experiment).
baseline_disagreement = disagreements(mode="baseline")
experiment_disagreement = disagreements(mode="experiment")

In [5]:
# Mean (pair-level, cluster-level) disagreement score for the baseline mode.
(
    baseline_disagreement["pair_disagreement_score"].mean(),
    baseline_disagreement["cluster_disagreement_score"].mean(),
)

(0.08349748563218393, 0.1201527132381496)

In [6]:
# Mean (pair-level, cluster-level) disagreement score for the experiment mode.
(
    experiment_disagreement["pair_disagreement_score"].mean(),
    experiment_disagreement["cluster_disagreement_score"].mean(),
)

(0.09171368135376758, 0.1409160740249854)

In [7]:
# Summary statistics of both disagreement scores for the baseline mode.
baseline_disagreement.loc[
    :, ["pair_disagreement_score", "cluster_disagreement_score"]
].describe()

Unnamed: 0,pair_disagreement_score,cluster_disagreement_score
count,435.0,435.0
mean,0.083497,0.120153
std,0.086129,0.082036
min,0.0,0.0
25%,0.0,0.013156
50%,0.04,0.153452
75%,0.16,0.205836
max,0.25,0.215556


In [8]:
# Summary statistics of both disagreement scores for the experiment mode.
experiment_disagreement.loc[
    :, ["pair_disagreement_score", "cluster_disagreement_score"]
].describe()

Unnamed: 0,pair_disagreement_score,cluster_disagreement_score
count,435.0,435.0
mean,0.091714,0.140916
std,0.095481,0.085735
min,0.0,0.00637
25%,0.0,0.027901
50%,0.046875,0.187904
75%,0.1875,0.211241
max,0.25,0.232862
