In [1]:
from tools import read_trec_qrels
import pandas as pd
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from itertools import repeat

## Read Files

In [13]:
# Human (WTR assessor) judgements form the reference table; every GPT-3.5 run
# is joined onto it below.
human_qrels = read_trec_qrels("rel_files/rel_table_qrels_sample_balanced.txt")
df = pd.DataFrame(human_qrels, columns=["Topic", "It", "Doc", "Rel_Human"])

gpt_qrels = read_trec_qrels("runs/first_run_with_balanced_sample.txt")
gpt_qrels_bin = read_trec_qrels("runs/first_run_with_balanced_sample_bin.txt")
gpt_qrels_bin_sys = read_trec_qrels("runs/first_run_with_balanced_sample_bin_sys.txt")
gpt_qrels_sys = read_trec_qrels("runs/first_run_with_balanced_sample_sys.txt")
gpt_qrels_bin_no_sys = read_trec_qrels("runs/first_run_with_balanced_sample_bin_no_sys.txt")

gpt_df = pd.DataFrame(gpt_qrels, columns=["Topic", "It", "Doc", "Rel_GPT"])
gpt_bin_df = pd.DataFrame(gpt_qrels_bin, columns=["Topic", "It", "Doc", "Rel_GPT_bin"])
gpt_bin_sys_df = pd.DataFrame(gpt_qrels_bin_sys, columns=["Topic", "It", "Doc", "Rel_GPT_bin_sys"])
gpt_sys_df = pd.DataFrame(gpt_qrels_sys, columns=["Topic", "It", "Doc", "Rel_GPT_sys"])
gpt_bin_no_sys_df = pd.DataFrame(gpt_qrels_bin_no_sys, columns=["Topic", "It", "Doc", "Rel_GPT_bin_no_sys"])

df["Rel_Human"] = df["Rel_Human"].astype(int)

# Attach each run by its (Topic, Doc) key instead of by row position: a plain
# column assignment aligns on the RangeIndex, so a run file written in a
# different row order would silently pair labels with the wrong document.
# NOTE(review): assumes (Topic, Doc) is unique within every file; the
# one-to-one validation below raises a MergeError if that does not hold.
for run_df, rel_col in [
    (gpt_df, "Rel_GPT"),
    (gpt_bin_df, "Rel_GPT_bin"),
    (gpt_bin_sys_df, "Rel_GPT_bin_sys"),
    (gpt_sys_df, "Rel_GPT_sys"),
    (gpt_bin_no_sys_df, "Rel_GPT_bin_no_sys"),
]:
    # A left join keeps the human table's rows and their order.
    df = df.merge(
        run_df[["Topic", "Doc", rel_col]],
        on=["Topic", "Doc"],
        how="left",
        validate="one_to_one",
    )
    # Fail loudly rather than carry NaN labels into the agreement statistics.
    if df[rel_col].isna().any():
        raise ValueError(
            f"{rel_col}: run file has no judgement for some (Topic, Doc) pairs of the human qrels"
        )
    df[rel_col] = df[rel_col].astype(int)

## Baseline Rating (irr, rel, high_rel)

In [14]:
# Agreement between the WTR assessors and GPT-3.5 on the graded labels.
#
# Both axes share one sorted label list. confusion_matrix sizes its output by
# the union of the labels found in y_true and y_pred, so deriving the row and
# column indexes from each rater's own unique values fails with a shape
# mismatch as soon as one rater never uses one of the grades.
labels = sorted(set(df["Rel_Human"]) | set(df["Rel_GPT"]))

cm = confusion_matrix(y_true=df["Rel_Human"], y_pred=df["Rel_GPT"], labels=labels)
cohens_k = round(cohen_kappa_score(df["Rel_Human"], df["Rel_GPT"]), 3)

idx_rows = pd.MultiIndex.from_product([["WTR Assesors"], labels])
idx_cols = pd.MultiIndex.from_product([["GPT-3.5"], labels])

cm_df = pd.DataFrame(
    data=cm,
    index=idx_rows,
    columns=idx_cols,
)

# Transposed for display: GPT-3.5 labels become rows, assessor labels columns.
display(cm_df.T.style.highlight_max(axis=1, props='font-weight:bold'))
print("Cohen's K:", cohens_k)
# For a LaTeX export, chain this onto the Styler instead of displaying it:
#     .to_latex(
#         hrules=True,
#         multirow_align="t",
#         caption="Rater agreement between GPT-3.5 and WTR Assesors",
#     )


Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2
GPT-3.5,0,13,4,0
GPT-3.5,1,13,19,19
GPT-3.5,2,4,7,11


Cohen's K: 0.217


## Baseline Rating (irr, rel, high_rel) mapped to binary

In [15]:
# Collapse both graded ratings to binary: grade 0 stays 0, any positive grade
# becomes 1. The comparison is vectorised, so no NaN/float intermediate column
# is created. Note: a negative grade, should one ever appear, also maps to 0.
df["Rel_Human_bin"] = (df["Rel_Human"] > 0).astype(int)
df["Rel_GPT_bin_mapped"] = (df["Rel_GPT"] > 0).astype(int)

# One shared, sorted label list for both axes: confusion_matrix sizes its
# output by the union of labels in y_true and y_pred, so per-rater label lists
# would break the DataFrame construction if one rater never used a label.
labels = sorted(set(df["Rel_Human_bin"]) | set(df["Rel_GPT_bin_mapped"]))
idx_rows = pd.MultiIndex.from_product([["WTR Assesors"], labels])
idx_cols = pd.MultiIndex.from_product([["GPT-3.5"], labels])

# Transposed for display: GPT-3.5 labels become rows, assessor labels columns.
display(pd.DataFrame(
    confusion_matrix(y_true=df["Rel_Human_bin"], y_pred=df["Rel_GPT_bin_mapped"], labels=labels),
    index=idx_rows,
    columns=idx_cols
).T.style.highlight_max(axis=1, props='font-weight:bold'))

print("Cohen's K", cohen_kappa_score(df["Rel_GPT_bin_mapped"], df["Rel_Human_bin"]))



Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
GPT-3.5,0,13,4
GPT-3.5,1,17,56


Cohen's K 0.41121495327102797


## Baseline Rating (irr, rel, high_rel), only GPT mapped to binary

In [16]:
# Count how GPT's binarised verdict (rows: 1, then 0) is distributed over the
# assessors' original graded label (columns: 2, 1, 0).
human_grades = [2, 1, 0]

# One row of counts per GPT verdict; each entry is the number of rows of `df`
# carrying that verdict together with the given human grade.
gpt_rel, gpt_irrel = (
    [
        int(((df["Rel_GPT_bin_mapped"] == verdict) & (df["Rel_Human"] == grade)).sum())
        for grade in human_grades
    ]
    for verdict in (1, 0)
)

pd.DataFrame(
    [gpt_rel, gpt_irrel],
    columns=human_grades,
    index=[1, 0],
).style.highlight_max(axis=0, props='font-weight:bold')

Unnamed: 0,2,1,0
1,30,26,17
0,0,4,13


## Baseline Rating (binary), only human labels mapped to binary

In [17]:
# Collapse the assessors' grades to binary (0 stays 0, any positive grade
# becomes 1; a negative grade, should one ever appear, also maps to 0).
# Rel_GPT_bin is compared exactly as loaded, without any mapping.
df["Rel_Human_bin"] = (df["Rel_Human"] > 0).astype(int)

# One shared, sorted label list for both axes: confusion_matrix sizes its
# output by the union of labels in y_true and y_pred, so per-rater label lists
# would break the DataFrame construction if one rater never used a label.
labels = sorted(set(df["Rel_Human_bin"]) | set(df["Rel_GPT_bin"]))
idx_rows = pd.MultiIndex.from_product([["WTR Assesors"], labels])
idx_cols = pd.MultiIndex.from_product([["GPT-3.5"], labels])

# Transposed for display: GPT-3.5 labels become rows, assessor labels columns.
display(pd.DataFrame(
    confusion_matrix(y_true=df["Rel_Human_bin"], y_pred=df["Rel_GPT_bin"], labels=labels),
    index=idx_rows,
    columns=idx_cols
).T.style.highlight_max(axis=1, props='font-weight:bold'))

print("Cohen's K", cohen_kappa_score(df["Rel_GPT_bin"], df["Rel_Human_bin"]))



Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
GPT-3.5,0,19,10
GPT-3.5,1,11,50


Cohen's K 0.47058823529411764


## Baseline Rating (binary), topic in system instructions, only human labels mapped to binary

In [18]:
# Compare the binarised assessor labels (Rel_Human_bin, set by an earlier cell)
# with the Rel_GPT_bin_sys run.
#
# One shared, sorted label list for both axes: confusion_matrix sizes its
# output by the union of labels in y_true and y_pred, so per-rater label lists
# would break the DataFrame construction if one rater never used a label.
labels = sorted(set(df["Rel_Human_bin"]) | set(df["Rel_GPT_bin_sys"]))
idx_rows = pd.MultiIndex.from_product([["WTR Assesors"], labels])
idx_cols = pd.MultiIndex.from_product([["GPT-3.5"], labels])

# Transposed for display: GPT-3.5 labels become rows, assessor labels columns.
display(pd.DataFrame(
    confusion_matrix(y_true=df["Rel_Human_bin"], y_pred=df["Rel_GPT_bin_sys"], labels=labels),
    index=idx_rows,
    columns=idx_cols
).T.style.highlight_max(axis=1, props='font-weight:bold'))

print("Cohen's K", cohen_kappa_score(df["Rel_GPT_bin_sys"], df["Rel_Human_bin"]))



Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
GPT-3.5,0,24,24
GPT-3.5,1,6,36


Cohen's K 0.34782608695652173


## Baseline Rating, topic in system instructions

In [19]:
# Compare the assessors' graded labels with the graded Rel_GPT_sys run.
#
# One shared, sorted label list for both axes: confusion_matrix sizes its
# output by the union of labels in y_true and y_pred, so per-rater label lists
# would break the DataFrame construction if one rater never used a grade.
labels = sorted(set(df["Rel_Human"]) | set(df["Rel_GPT_sys"]))
idx_rows = pd.MultiIndex.from_product([["WTR Assesors"], labels])
idx_cols = pd.MultiIndex.from_product([["GPT-3.5"], labels])

# Transposed for display: GPT-3.5 labels become rows, assessor labels columns.
display(pd.DataFrame(
    confusion_matrix(y_true=df["Rel_Human"], y_pred=df["Rel_GPT_sys"], labels=labels),
    index=idx_rows,
    columns=idx_cols
).T.style.highlight_max(axis=1, props='font-weight:bold'))

print("Cohen's K", cohen_kappa_score(df["Rel_GPT_sys"], df["Rel_Human"]))



Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2
GPT-3.5,0,18,11,2
GPT-3.5,1,10,9,10
GPT-3.5,2,2,10,18


Cohen's K 0.25


## Baseline Rating, topic in system instructions, mapped to binary

In [20]:
# Collapse the graded Rel_GPT_sys run to binary: grade 0 stays 0, any positive
# grade becomes 1 (a negative grade, should one ever appear, also maps to 0).
# Rel_Human_bin is reused from an earlier cell.
df["Rel_GPT_sys_bin"] = (df["Rel_GPT_sys"] > 0).astype(int)

# One shared, sorted label list for both axes: confusion_matrix sizes its
# output by the union of labels in y_true and y_pred, so per-rater label lists
# would break the DataFrame construction if one rater never used a label.
labels = sorted(set(df["Rel_Human_bin"]) | set(df["Rel_GPT_sys_bin"]))
idx_rows = pd.MultiIndex.from_product([["WTR Assesors"], labels])
idx_cols = pd.MultiIndex.from_product([["GPT-3.5"], labels])

# Transposed for display: GPT-3.5 labels become rows, assessor labels columns.
display(pd.DataFrame(
    confusion_matrix(y_true=df["Rel_Human_bin"], y_pred=df["Rel_GPT_sys_bin"], labels=labels),
    index=idx_rows,
    columns=idx_cols
).T.style.highlight_max(axis=1, props='font-weight:bold'))

print("Cohen's K", cohen_kappa_score(df["Rel_GPT_sys_bin"], df["Rel_Human_bin"]))



Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
GPT-3.5,0,18,13
GPT-3.5,1,12,47


Cohen's K 0.3801652892561984


## Baseline Rating no system instructions in binary

In [23]:
# Compare the binarised assessor labels (Rel_Human_bin, set by an earlier cell)
# with the Rel_GPT_bin_no_sys run.
#
# One shared, sorted label list for both axes: confusion_matrix sizes its
# output by the union of labels in y_true and y_pred, so per-rater label lists
# would break the DataFrame construction if one rater never used a label.
labels = sorted(set(df["Rel_Human_bin"]) | set(df["Rel_GPT_bin_no_sys"]))
idx_rows = pd.MultiIndex.from_product([["WTR Assesors"], labels])
idx_cols = pd.MultiIndex.from_product([["GPT-3.5"], labels])

# Transposed for display: GPT-3.5 labels become rows, assessor labels columns.
display(pd.DataFrame(
    confusion_matrix(y_true=df["Rel_Human_bin"], y_pred=df["Rel_GPT_bin_no_sys"], labels=labels),
    index=idx_rows,
    columns=idx_cols
).T.style.highlight_max(axis=1, props='font-weight:bold'))

print("Cohen's K", cohen_kappa_score(df["Rel_GPT_bin_no_sys"], df["Rel_Human_bin"]))


Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
GPT-3.5,0,17,8
GPT-3.5,1,13,52


Cohen's K 0.4521739130434783
