## Evaluate GPT Judgements for all modalities

In [26]:
import pandas as pd
from tools import read_trec_qrels
from statsmodels.stats.inter_rater import fleiss_kappa, cohens_kappa, aggregate_raters, to_table

In [3]:
# Column names given to the records returned by read_trec_qrels; the last one
# ("rel_t") holds the judgement read from this first run file.
columns = ["topic_id", "it", "json_loc", "rel_t"]

qrels_path = "runs/first_run_with_balanced_sample_bin.txt"
df = pd.DataFrame(
    read_trec_qrels(qrels_path),
    columns=columns,
)

In [4]:
# Preview the first rows of the loaded judgements as a sanity check.
df.head()

Unnamed: 0,topic_id,it,json_loc,rel_t
0,23,0,45/1438042989897.84_20150728002309-00081-ip-10...,1
1,23,0,28/1438042989301.17_20150728002309-00289-ip-10...,1
2,20,0,37/1438042982013.25_20150728002302-00168-ip-10...,0
3,29,0,36/1438042988598.68_20150728002308-00147-ip-10...,0
4,14,0,29/1438042987628.47_20150728002307-00142-ip-10...,0


In [5]:
# Each modality maps to (qrels file path, one-letter-ish suffix of the
# "rel_<suffix>" column that its judgements are stored under in df).
# NOTE: "table" points at the same file df was built from, so "rel_t" is simply
# rewritten from that file.
additional_modalities = {
    "entity": ("runs/balanced_sample_bin_entity.txt", "e"),
    "page_title": ("runs/balanced_sample_bin_page_title.txt", "pt"),
    "text_before": ("runs/balanced_sample_bin_text_before.txt", "tb"),
    "text_after": ("runs/balanced_sample_bin_text_after.txt", "ta"),
    "table": ("runs/first_run_with_balanced_sample_bin.txt", "t")
}

# The judgement columns are combined row by row, so every qrels file has to
# list the same (topic_id, json_loc) pairs in the same order as df. Verify that
# explicitly: a silent misalignment would leave NaNs or mismatched judgements
# and make the agreement statistics below meaningless.
key_cols = ["topic_id", "json_loc"]
rel_prefix = columns[-1][:-1]  # "rel_t" without its trailing suffix character

for mod, (mod_path, suffix) in additional_modalities.items():
    rel_col = rel_prefix + suffix
    mod_cols = columns[:-1] + [rel_col]
    temp_df = pd.DataFrame(read_trec_qrels(mod_path), columns=mod_cols)

    # Length is checked first so the element-wise comparison always sees
    # arrays of identical shape.
    rows_match = len(temp_df) == len(df) and bool(
        (temp_df[key_cols].to_numpy() == df[key_cols].to_numpy()).all()
    )
    if not rows_match:
        raise ValueError(
            f"qrels for modality '{mod}' ({mod_path}) do not list the same "
            f"{key_cols} rows, in the same order, as the base run"
        )

    df[rel_col] = temp_df[rel_col]

## Cohen's Kappa between any two fields

In [30]:
# Pairwise Cohen's kappa between the judgement columns of every pair of
# modalities. The result is a list of rows (one per modality, in the order of
# additional_modalities), each holding one kappa value per modality.
rel_columns = ["rel_" + suffix for _, suffix in additional_modalities.values()]

two_field_agreement_data = []
for col_a in rel_columns:
    kappa_row = []
    for col_b in rel_columns:
        # Union of the labels used by either rater, so the contingency table
        # is square even when one rater never used some label.
        categories = sorted(set(df[col_a].unique()) | set(df[col_b].unique()))

        # Square array with counts of ratings for both raters.
        table = (
            pd.crosstab(df[col_a], df[col_b], rownames=[col_a], colnames=[col_b])
            .reindex(index=categories, columns=categories, fill_value=0)
            .to_numpy()
        )

        # return_results=False makes cohens_kappa return the plain kappa value
        # instead of a results object, which keeps the matrix numeric.
        kappa_row.append(cohens_kappa(table, weights=None, return_results=False))
    two_field_agreement_data.append(kappa_row)

In [20]:
# Render the pairwise kappa matrix labelled by modality name on both axes,
# highlighting the cells whose value lies between 0.25 and 0.4.
modality_names = list(additional_modalities)
kappa_matrix = pd.DataFrame(
    two_field_agreement_data,
    index=modality_names,
    columns=modality_names,
)
kappa_matrix.style.highlight_between(axis=0, left=0.25, right=0.4)

Unnamed: 0,entity,page_title,text_before,text_after,table
entity,1.0,0.070675,0.037538,0.189554,0.350361
page_title,0.070675,1.0,0.306167,0.254409,0.161745
text_before,0.037538,0.306167,1.0,0.15577,0.130528
text_after,0.189554,0.254409,0.15577,1.0,0.239662
table,0.350361,0.161745,0.130528,0.239662,1.0


## Fleiss' Kappa between all fields

In [44]:
# Fleiss' kappa across all raters at once. Every column from position 3 onward
# is treated as one rater.
# NOTE(review): this presumes only the rel_* judgement columns sit at
# positions >= 3 — confirm nothing else was appended to df.
ratings = df[df.columns[3:]]
# aggregate_raters returns a tuple; its first element is the table passed on
# to fleiss_kappa.
category_counts = aggregate_raters(ratings)[0]
fleiss_kappa(category_counts)

0.1611969303377024