## Evaluate GPT Judgements for all modalities

In [1]:
import pandas as pd
from LLmsfJiT import read_trec_qrels
from statsmodels.stats.inter_rater import fleiss_kappa, cohens_kappa, aggregate_raters, to_table
from itertools import repeat

In [2]:
# Column names given to the records returned by read_trec_qrels; the last one
# holds the relevance label read from the table judgement file.
columns = ["topic_id", "it", "json_loc", "rel_t"]
table_judgements = read_trec_qrels("../gpt_judgements/first_run_with_balanced_sample_bin.txt")
df = pd.DataFrame(table_judgements, columns=columns)

In [3]:
# Preview the first five judgement rows.
df.head(n=5)

Unnamed: 0,topic_id,it,json_loc,rel_t
0,23,0,45/1438042989897.84_20150728002309-00081-ip-10...,1
1,23,0,28/1438042989301.17_20150728002309-00289-ip-10...,1
2,20,0,37/1438042982013.25_20150728002302-00168-ip-10...,0
3,29,0,36/1438042988598.68_20150728002308-00147-ip-10...,0
4,14,0,29/1438042987628.47_20150728002307-00142-ip-10...,0


In [4]:
# Maps each modality name to (path of its judgement file, suffix of its "rel_<suffix>" column).
additional_modalities = {
    "entity": ("../gpt_judgements/balanced_sample_bin_entity.txt", "e"),
    "page_title": ("../gpt_judgements/balanced_sample_bin_page_title.txt", "pt"),
    "text_before": ("../gpt_judgements/balanced_sample_bin_text_before.txt", "tb"),
    "text_after": ("../gpt_judgements/balanced_sample_bin_text_after.txt", "ta"),
    "table": ("../gpt_judgements/first_run_with_balanced_sample_bin.txt", "t")
}

# Columns that identify a judged row across the modality files.
modality_key_cols = ["topic_id", "json_loc"]

for path, suffix in additional_modalities.values():
    rel_col = f"rel_{suffix}"
    # Spell the column names out instead of deriving them from the global
    # `columns`, so this cell still works if that name is rebound later on.
    modality_df = pd.DataFrame(
        read_trec_qrels(path), columns=["topic_id", "it", "json_loc", rel_col]
    )
    # The labels are attached to `df` by row position, so every file must list
    # exactly the same rows in the same order; fail loudly instead of silently
    # pairing a judgement with the wrong row.
    if not modality_df[modality_key_cols].equals(df[modality_key_cols]):
        raise ValueError(
            f"{path} does not list the same (topic_id, json_loc) rows in the same order as df"
        )
    # For "table" this overwrites the existing rel_t column with its integer version.
    df[rel_col] = modality_df[rel_col].astype(int)

## Collect Qrels data for all modalities

In [5]:
# (path of the WTR qrels file, relevance column name, display label) per modality.
# NOTE(review): "enitity" is a typo for "entity"; it is only used as a display
# label and is kept unchanged here so the stored outputs still match.
wtr_modalities = [
    ("../rel_files/rel_entity_qrels.txt", "rel_e", "enitity"),
    ("../rel_files/rel_PageTitle_qrels.txt", "rel_pt", "page_title"),
    ("../rel_files/rel_table_qrels.txt", "rel_t", "table"),
    ("../rel_files/rel_textAfter_qrels.txt", "rel_ta", "text_after"),
    ("../rel_files/rel_textBefore_qrels.txt", "rel_tb", "text_before")
]

# Columns used to look up the WTR label of each row of `df`.
wtr_key_cols = ["topic_id", "json_loc"]

df_wtr = df[df.columns[:3]].copy()
for path, name, mod in wtr_modalities:
    wtr_qrels = pd.DataFrame(
        read_trec_qrels(path), columns=["topic_id", "it", "json_loc", name]
    )
    # Binarise the labels: every grade above 1 is collapsed to 1.
    wtr_qrels[name] = wtr_qrels[name].astype(float).astype(int).clip(upper=1)

    # A left merge keeps exactly one output row per row of `df`, in the same
    # order; `validate` rejects duplicate keys in the qrels file. An inner merge
    # would silently drop or duplicate rows and shift every label that follows.
    aligned = df[wtr_key_cols].merge(
        wtr_qrels[wtr_key_cols + [name]],
        on=wtr_key_cols,
        how="left",
        validate="many_to_one",
    )
    n_missing = int(aligned[name].isna().sum())
    if n_missing:
        raise ValueError(f"{path} has no judgement for {n_missing} row(s) of df")

    # Assign by position: `aligned` is row-for-row parallel to `df`.
    df_wtr[name] = aligned[name].astype(int).to_numpy()

# Use the same column order as `df` so positional column slices match in both frames.
df_wtr = df_wtr[df.columns.tolist()]

## $Cohen's K$ for every modality

In [6]:
# Cohen's kappa between the WTR label and the label in `df`, one value per
# modality, in the order of `wtr_modalities`.
kappas = [
    cohens_kappa(
        # to_table returns (contingency table, bins); only the table is needed.
        to_table(pd.concat([df_wtr[rel_col], df[rel_col]], axis=1))[0]
    ).kappa
    for _path, rel_col, _label in wtr_modalities
]

In [7]:
# Show the per-modality kappas as a one-column table, labelled by modality.
# The global `columns` is deliberately not rebound here: the cell that loads
# the modality files reads it, and overwriting it would break a re-run.
modality_labels = [label for _path, _rel_col, label in wtr_modalities]
pd.DataFrame({"Cohen's K": kappas}, index=modality_labels)

Unnamed: 0,Cohen's K
enitity,0.309623
page_title,0.329692
table,0.470588
text_after,0.29652
text_before,0.317111


## Append all ratings and compute a single $Cohen's K$

In [8]:
df_collapsed_wtr = df_wtr[df_wtr.columns[0:3]].copy()
df_collapsed_wtr["fields_collapsed"] = df_wtr[df_wtr.columns[3:7]].values.tolist()
df_collapsed_wtr["modalities"] = list(repeat(df_wtr.columns[3:7].values.tolist(), times=90))
df_collapsed_wtr = df_collapsed_wtr.explode(["fields_collapsed", "modalities"])

In [9]:
df_collapsed = df[df.columns[0:3]].copy()
df_collapsed["fields_collapsed"] = df[df.columns[3:7]].values.tolist()
df_collapsed["modalities"] = list(repeat(df.columns[3:7].values.tolist(), times=90))
df_collapsed = df_collapsed.explode(["fields_collapsed", "modalities"])

In [10]:
# Put the two collapsed rating columns side by side, one column per rater.
all_ratings = pd.concat(
    [df_collapsed["fields_collapsed"], df_collapsed_wtr["fields_collapsed"]],
    axis=1,
    keys=["gpt", "wtr"],
)
# to_table returns (contingency table, bins); only the table is needed.
ratings_table, _bins = to_table(all_ratings)
cohens_kappa(ratings_table)

{'kind': 'Simple',
 'kappa': 0.37222222222222223,
 'kappa_max': 0.9944444444444445,
 'weights': None,
 'var_kappa': 0.0023928445258387106,
 'var_kappa0': 0.002777692043895747,
 'alpha': 0.025,
 'alpha_ci': '  95',
 'std_kappa': 0.048916710088053865,
 'std_kappa0': 0.0527038143201775,
 'z_value': 7.062529098196942,
 'pvalue_one_sided': 8.174953155841656e-13,
 'pvalue_two_sided': 1.6349906311683311e-12,
 'kappa_low': 0.2763472322074495,
 'kappa_upp': 0.46809721223699496,
 'distribution_kappa': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x7f2d0fae35f0>,
 'distribution_zero_null': <scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x7f2d0b662db0>}

## Confusion matrix for all modalities

In [60]:
# One confusion matrix per modality, placed side by side:
# rows are the GPT-3.5 labels, columns are the WTR labels.
conf_matricies = []
# Kept under its own name so the global `columns` used by earlier cells is not rebound.
conf_columns = []
for path, name, mod in wtr_modalities:
    pred = pd.Series(df[name], name="GPT-3.5 Judgements")
    actu = pd.Series(df_wtr[name], name="WTR")
    matrix = pd.crosstab(actu, pred).T
    conf_matricies.append(matrix)

    # Build the header from the labels actually present, so a modality that
    # lacks one class does not desynchronise the header from the data.
    conf_columns.extend((mod, label) for label in matrix.columns)

# A label missing from one modality's rows shows up as NaN after the concat;
# it means "no such pair", i.e. a count of zero.
conf_comp = pd.concat(conf_matricies, axis=1).fillna(0).astype(int)
conf_comp.columns = pd.MultiIndex.from_tuples(conf_columns, names=["Modalities", "WTR judgements"])
conf_comp.style.highlight_max(props="font-weight:bold", axis=0)

Modalities,enitity,enitity,page_title,page_title,table,table,text_after,text_after,text_before,text_before
WTR judgements,0,1,0,1,0,1,0,1,0,1
GPT-3.5 Judgements,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,26,5,36,20,19,10,38,21,43,21
1,28,31,10,24,11,50,10,21,8,18


## $Cohen's K$ between any two fields

In [6]:
# Cohen's kappa for every ordered pair of rating columns in `df`, stored as a
# list of rows in the order of `additional_modalities`.
two_field_agreement_data = []
for _path_a, suffix_a in additional_modalities.values():
    kappa_row = []
    for _path_b, suffix_b in additional_modalities.values():
        col_a, col_b = f"rel_{suffix_a}", f"rel_{suffix_b}"

        # Every category used by either column, so the table is always square
        # even when one column never uses a category.
        categories = sorted(set(df[col_a].unique()) | set(df[col_b].unique()))

        # Square array with the counts of each (col_a label, col_b label) pair.
        table = (
            pd.crosstab(df[col_a], df[col_b], rownames=[col_a], colnames=[col_b])
            .reindex(index=categories, columns=categories, fill_value=0)
            .values
        )
        # return_results=False returns the bare kappa value instead of the full
        # results object; this was previously requested via a positional None.
        kappa_row.append(cohens_kappa(table, weights=None, return_results=False))
    two_field_agreement_data.append(kappa_row)

In [8]:
# Pairwise kappa matrix labelled by modality; values between 0.25 and 0.4 are shown in bold.
modality_names = list(additional_modalities)
two_field_agreement = pd.DataFrame(
    two_field_agreement_data,
    index=modality_names,
    columns=modality_names,
)
two_field_agreement.style.highlight_between(
    axis=0, left=0.25, right=0.4, props="font-weight:bold"
)


Unnamed: 0,entity,page_title,text_before,text_after,table
entity,1.0,0.070675,0.037538,0.189554,0.350361
page_title,0.070675,1.0,0.306167,0.254409,0.161745
text_before,0.037538,0.306167,1.0,0.15577,0.130528
text_after,0.189554,0.254409,0.15577,1.0,0.239662
table,0.350361,0.161745,0.130528,0.239662,1.0


## Fleiss' Kappa across all fields

In [44]:
# Treat every rating column of `df` (everything after the first three columns) as one rater.
field_ratings = df[df.columns[3:]]
# aggregate_raters returns (per-row category counts, categories); only the counts are needed.
category_counts = aggregate_raters(field_ratings)[0]
fleiss_kappa(category_counts)

0.1611969303377024