## Evaluate GPT Judgements for all modalities

In [25]:
import pandas as pd
from LLmsfJiT import read_trec_qrels
from statsmodels.stats.inter_rater import fleiss_kappa, cohens_kappa, aggregate_raters, to_table
from sklearn.metrics import cohen_kappa_score
from itertools import repeat

In [26]:
# Column names given to the four fields of every qrels row.
columns = ["topic_id", "it", "json_loc", "rel_t"]

# GPT judgements from the "no_sys" run, one row per qrels line.
no_sys_rows = read_trec_qrels("../gpt_judgements/ex_qrels_bin_no_sys.txt")
df = pd.DataFrame(no_sys_rows, columns=columns)

In [27]:
# Display the judgements frame `df`.
df

Unnamed: 0,topic_id,it,json_loc,rel_t
0,55,0,5/1438042988718.8_20150728002308-00148-ip-10-2...,1
1,25,0,11/1438042981460.12_20150728002301-00113-ip-10...,0
2,24,0,13/1438042989891.18_20150728002309-00180-ip-10...,1
3,55,0,39/1438043062723.96_20150728002422-00055-ip-10...,0
4,32,0,48/1438042987775.70_20150728002307-00036-ip-10...,0
...,...,...,...,...
95,24,0,32/1438042990603.54_20150728002310-00051-ip-10...,1
96,14,0,5/1438042988718.8_20150728002308-00150-ip-10-2...,1
97,49,0,46/1438042988922.24_20150728002308-00185-ip-10...,0
98,57,0,31/1438042989018.48_20150728002309-00112-ip-10...,0


In [28]:
# Modality name -> (GPT qrels file, suffix of that modality's "rel_<suffix>" column).
# The "table" entry points at the same file `df` was first loaded from, so it
# rewrites `rel_t` as integers.
additional_modalities = {
    "entity": ("../gpt_judgements/ex_qrels_bin_entity.txt", "e"),
    "page_title": ("../gpt_judgements/ex_qrels_bin_page_title.txt", "pt"),
    "text_before": ("../gpt_judgements/ex_qrels_bin_text_before.txt", "tb"),
    "text_after": ("../gpt_judgements/ex_qrels_bin_text_after.txt", "ta"),
    "table": ("../gpt_judgements/ex_qrels_bin_no_sys.txt", "t"),
}

# The qrels layout is spelled out here instead of being derived from the global
# `columns`: later cells rebind `columns`, which would break this cell whenever
# it is re-run out of order.
qrels_key_cols = ["topic_id", "it", "json_loc"]

for qrels_path, suffix in additional_modalities.values():
    rel_col = "rel_" + suffix
    temp_df = pd.DataFrame(
        read_trec_qrels(qrels_path),
        columns=[*qrels_key_cols, rel_col],
    )
    # The assignment aligns on the row index only.
    # NOTE(review): this assumes every qrels file lists the same documents in
    # the same order as `df` -- confirm against the files.
    df[rel_col] = temp_df[rel_col].astype(int)

## Collect Qrels data for all modalities

In [29]:
# (WTR qrels file, judgement column name, display label) for every modality.
# NOTE: the label "enitity" is misspelled; it is kept verbatim because it is the
# row label used by the kappa and confusion tables built from this list.
wtr_modalities = [
    ("../rel_files/rel_entity_qrels.txt", "rel_e", "enitity"),
    ("../rel_files/rel_PageTitle_qrels.txt", "rel_pt", "page_title"),
    ("../rel_files/rel_table_qrels.txt", "rel_t", "table"),
    ("../rel_files/rel_textAfter_qrels.txt", "rel_ta", "text_after"),
    ("../rel_files/rel_textBefore_qrels.txt", "rel_tb", "text_before")
]

# Columns that identify a judged (topic, document) pair.
key_cols = ["topic_id", "json_loc"]

# Start from the three identifying columns of `df`; one WTR column is added per modality.
df_wtr = df[df.columns[:3]].copy()
for path, name, mod in wtr_modalities:
    q = read_trec_qrels(path)
    temp_df = pd.DataFrame(q, columns=["topic_id", "it", "json_loc", name])
    temp_df[name] = temp_df[name].astype(float).astype(int)
    # Collapse every label above 1 to 1 so the labels are comparable with the
    # 0/1 GPT judgements.
    temp_df.loc[temp_df[name] > 1, name] = 1

    # A left join on de-duplicated keys yields exactly one row per row of `df`,
    # in `df`'s order. The previous inner join dropped unmatched rows (and
    # multiplied duplicated ones), which silently shifted every following label
    # onto the wrong document when the result was assigned by index.
    wtr_labels = temp_df.drop_duplicates(subset=key_cols)[key_cols + [name]]
    temp_df_merged = df[key_cols].merge(wtr_labels, on=key_cols, how="left")

    # Positional assignment: row i of the merge belongs to row i of `df`.
    # Pairs without a WTR judgement show up as NaN instead of being misaligned.
    df_wtr[name] = temp_df_merged[name].to_numpy()

# Same column order as `df`, so positional column slices agree between the two frames.
df_wtr = df_wtr[df.columns.tolist()]

## Cohen's $\kappa$ for every modality

In [33]:
# Cohen's kappa between the WTR labels and the GPT judgements, one value per
# entry of `wtr_modalities` (same order).
kappas = []
for _, name, _ in wtr_modalities:
    # Two-column frame: WTR label first, GPT judgement second.
    wtr_vs_gpt = pd.concat([df_wtr[name], df[name]], axis=1)
    # to_table returns (contingency table, bins); only the table is needed.
    contingency, _bins = to_table(wtr_vs_gpt)
    kappas.append(cohens_kappa(contingency).kappa)

In [50]:
# Row labels: the display label (third field) of every WTR modality.
index = tuple(label for _, _, label in wtr_modalities)
# NOTE: this rebinds the notebook-level name `columns`.
columns = ["Cohen's K"]
cohens_df = pd.DataFrame(data=kappas, index=index, columns=columns)

In [73]:
# Show the per-modality Cohen's kappa values rounded to two decimals.
cohens_df.round(2)

Unnamed: 0,Cohen's K
enitity,0.2
page_title,0.57
table,0.33
text_after,0.36
text_before,0.16


## Append all ratings and compute Cohen's $\kappa$

In [35]:
def _collapse_modalities(frame):
    """Return *frame* in long format, one row per (document, modality) judgement.

    The first three columns of *frame* identify the judged document and are
    kept as they are. Every remaining column is treated as one modality: its
    value goes into ``fields_collapsed`` and its column name into
    ``modalities``. The original row index is repeated once per modality.
    """
    # Take every column after the three identifiers. The earlier `[3:7]` slice
    # kept only four of the five modality columns and dropped the last one.
    modality_cols = frame.columns[3:].tolist()
    collapsed = frame[frame.columns[:3]].copy()
    collapsed["fields_collapsed"] = frame[modality_cols].values.tolist()
    # One copy of the modality names per row, sized from the frame itself
    # rather than a hard-coded 100.
    collapsed["modalities"] = [modality_cols] * len(frame)
    # Exploding both list columns together keeps each value next to its name.
    return collapsed.explode(["fields_collapsed", "modalities"])


# WTR labels and GPT judgements in the same long layout, so the two
# `fields_collapsed` columns line up row by row.
df_collapsed_wtr = _collapse_modalities(df_wtr)
df_collapsed = _collapse_modalities(df)

In [37]:
# Side-by-side view of every collapsed judgement: GPT in the first column,
# WTR in the second.
all_ratings = pd.concat(
    [df_collapsed["fields_collapsed"], df_collapsed_wtr["fields_collapsed"]],
    axis=1,
    keys=["gpt", "wtr"],
)

## Confusion matrix for all modalities

In [69]:
# One contingency table per modality: after the transpose, rows are the GPT
# judgements and columns are the WTR labels.
conf_matricies = [
    pd.crosstab(
        df_wtr[name].rename("WTR"),
        df[name].rename("GPT-3.5 Judgements"),
    ).T
    for _, name, _ in wtr_modalities
]

# Column labels (modality, WTR label) for the concatenated table.
# NOTE: the labels are hard-coded as (0, 1), so every modality is expected to
# contain both WTR labels. This also rebinds the notebook-level name `columns`.
columns = [
    (mod, wtr_label)
    for _, _, mod in wtr_modalities
    for wtr_label in (0, 1)
]

conf_comp = pd.concat(conf_matricies, axis=1)
conf_comp.columns = pd.MultiIndex.from_tuples(columns, names=["Modalities", "WTR judgements"])
# Transposed for display: one row per (modality, WTR label), one column per GPT label.
conf_comp.T

Unnamed: 0_level_0,GPT-3.5 Judgements,0,1
Modalities,WTR judgements,Unnamed: 2_level_1,Unnamed: 3_level_1
enitity,0,45,27
enitity,1,11,17
page_title,0,58,9
page_title,1,10,23
table,0,45,20
table,1,12,23
text_after,0,64,5
text_after,1,19,12
text_before,0,66,2
text_before,1,27,5


In [74]:
# Show the per-modality Cohen's kappa table again, rounded to two decimals.
cohens_df.round(2)

Unnamed: 0,Cohen's K
enitity,0.2
page_title,0.57
table,0.33
text_after,0.36
text_before,0.16


## Cohen's $\kappa$ between any two fields

In [39]:
# GPT judgement column of every field, in the order of `additional_modalities`.
field_cols = ["rel_" + suffix for _, suffix in additional_modalities.values()]

# Square matrix (list of rows) of Cohen's kappa between the GPT judgements of
# every pair of fields.
two_field_agreement_data = []
for row_col in field_cols:
    kappa_row = []
    for other_col in field_cols:
        # All labels used by either field, so the contingency table is square
        # even when one field never uses a label.
        labels = sorted(set(df[row_col].unique()) | set(df[other_col].unique()))

        counts = pd.crosstab(
            df[row_col],
            df[other_col],
            rownames=[row_col],
            colnames=[other_col],
        )
        counts = counts.reindex(index=labels, columns=labels, fill_value=0)

        # return_results=False makes cohens_kappa return the bare kappa value
        # instead of a results object.
        kappa_row.append(
            cohens_kappa(counts.values, weights=None, return_results=False)
        )
    two_field_agreement_data.append(kappa_row)

In [42]:
# Label both axes of the pairwise kappa matrix with the modality names.
field_names = list(additional_modalities)
pairwise_kappa = pd.DataFrame(
    two_field_agreement_data,
    index=field_names,
    columns=field_names,
)
# Render values between 0.25 and 0.4 in bold.
pairwise_kappa.style.highlight_between(
    axis=0,
    left=0.25,
    right=0.4,
    props="font-weight:bold",
)

Unnamed: 0,entity,page_title,text_before,text_after,table
entity,1.0,0.247492,0.085638,0.10947,0.328723
page_title,0.247492,1.0,0.159907,0.239244,0.347094
text_before,0.085638,0.159907,1.0,0.259944,0.135971
text_after,0.10947,0.239244,0.259944,1.0,0.250771
table,0.328723,0.347094,0.135971,0.250771,1.0


## Fleiss' $\kappa$ between all fields

In [41]:
# Every column after the three identifiers is passed as one rater.
field_ratings = df[df.columns[3:]]
# aggregate_raters returns (per-row category counts, categories); only the
# counts are passed on to fleiss_kappa.
category_counts, _categories = aggregate_raters(field_ratings)
fleiss_kappa(category_counts)

0.1968815498227266