## Confusion matrix for larger evaluation (rejudging)

In [2]:
from LLmsfJiT import read_trec_qrels
import pandas as pd
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from itertools import repeat

In [2]:
human_qrels = read_trec_qrels("../rel_files/rel_table_qrels.txt")
df = pd.DataFrame(human_qrels, columns=["Topic", "It", "Doc", "Rel_Human"])

gpt = read_trec_qrels("../gpt_judgements/rel_table_qrels.txt")
gpt_df = pd.DataFrame(gpt, columns=["Topic", "It", "Doc", "Rel_GPT"])

df["Rel_GPT"] = gpt_df["Rel_GPT"]

df["Rel_GPT"] = df["Rel_GPT"].astype(float)
df["Rel_Human"] = df["Rel_Human"].astype(float)

df["Rel_GPT"] = df["Rel_GPT"].astype(int)
df["Rel_Human"] = df["Rel_Human"].astype(int)

df.loc[df["Rel_Human"] == 0, "Rel_Human_bin"] = 0.
df.loc[df["Rel_Human"] > 0, "Rel_Human_bin"] = 1.
df["Rel_Human_bin"] = df["Rel_Human_bin"].astype(int)

In [3]:
df

Unnamed: 0,Topic,It,Doc,Rel_Human,Rel_GPT,Rel_Human_bin
0,1,0,5/1438042988718.8_20150728002308-00068-ip-10-2...,2,0,1
1,1,0,28/1438042990112.92_20150728002310-00241-ip-10...,2,1,1
2,1,0,41/1438042986451.45_20150728002306-00283-ip-10...,2,1,1
3,1,0,2/1438042981856.5_20150728002301-00265-ip-10-2...,2,1,1
4,1,0,21/1438042987174.71_20150728002307-00309-ip-10...,2,0,1
...,...,...,...,...,...,...
6944,60,0,25/1438042987155.85_20150728002307-00277-ip-10...,0,1,0
6945,60,0,25/1438042987155.85_20150728002307-00015-ip-10...,0,0,0
6946,60,0,23/1438042990609.0_20150728002310-00259-ip-10-...,0,1,0
6947,60,0,23/1438042989043.35_20150728002309-00291-ip-10...,0,1,0


## Confusion matrix

In [4]:
cm = confusion_matrix(y_true=df["Rel_Human"], y_pred=df["Rel_GPT"]).T
cohens_k = round(cohen_kappa_score(df["Rel_Human"], df["Rel_GPT"]), 2)

idx_rows = pd.MultiIndex.from_tuples(list(zip(repeat("WTR Assesors"), sorted(df["Rel_Human"].unique()))))
idx_cols = pd.MultiIndex.from_tuples(list(zip(repeat("GPT-3.5"), sorted(df["Rel_GPT"].unique()))))

a = pd.DataFrame(
    data=cm[:2],
    index = idx_cols,
    columns=idx_rows
)

display(a.style.highlight_max(axis=1, props='font-weight:bold'))
print("Cohen's K:", cohens_k)
        # .to_latex(
        #     hrules=True,
        #     multirow_align="t",
        #     caption="Rater agreement between GPT-3.5 and WTR Assesors",
        # )


Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2
GPT-3.5,0,3068,415,171
GPT-3.5,1,1635,856,804


Cohen's K: 0.22


## Confusion matrix with human values mapped to binary

In [5]:
cm = confusion_matrix(y_true=df["Rel_Human_bin"], y_pred=df["Rel_GPT"]).T
cohens_k = cohen_kappa_score(df["Rel_Human"], df["Rel_GPT"])

idx_rows = pd.MultiIndex.from_tuples(list(zip(repeat("WTR Assesors"), sorted(df["Rel_Human_bin"].unique()))))
idx_cols = pd.MultiIndex.from_tuples(list(zip(repeat("GPT-3.5"), sorted(df["Rel_GPT"].unique()))))

diffs_col = []
diffs_col.append(cm[0][1] / sum(cm[0]))    
diffs_col.append(cm[1][0] / sum(cm[1]))

diffs_row = []
diffs_row.append(cm[:, 0][1] / sum(cm[:, 0]))
diffs_row.append(cm[:, 1][0] / sum(cm[:, 1]))

diff_wtr = pd.DataFrame(diffs_col, columns=["diff to wtr(%)"]).round(2)
diff_human = pd.DataFrame([diffs_row], index=["diff to human (%)"])
a = pd.DataFrame(
    data=cm,
    index = idx_cols,
    columns=idx_rows
)

display(a.style.highlight_max(axis=1, props='font-weight:bold'))
print("Cohen's K:", cohens_k)


Unnamed: 0_level_0,Unnamed: 1_level_0,WTR Assesors,WTR Assesors
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1
GPT-3.5,0,3068,586
GPT-3.5,1,1635,1660


Cohen's K: 0.21902185377903471


In [9]:
def get_freqs(df, col):
    vals = df[col].value_counts()
    vals_uniq = df[col].unique()

    return {val: vals[val] / sum(vals) for val in vals_uniq}

In [10]:
get_freqs(df, "Rel_GPT")

{0: 0.5258310548280328, 1: 0.47416894517196717}

In [11]:
get_freqs(df, "Rel_Human")

{2: 0.1403079579795654, 1: 0.1829040149661822, 0: 0.6767880270542525}

In [12]:
(df["Rel_GPT"].value_counts()[1] - df["Rel_Human_bin"].value_counts()[1] )

1049

In [13]:
df["Rel_GPT"].value_counts()[1]

3295

In [14]:
df["Rel_Human_bin"].value_counts()[1]

2246