In [7]:
import pandas as pd
from scipy.stats import mannwhitneyu, brunnermunzel
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np

from transformers import (
    AutoTokenizer,
)

In [8]:
df_norationale = pd.read_json('learned-NQ-test-norationale-predicted.jsonl', lines=True)
df_rationale = pd.read_json('learned-NQ-test-rationale-predicted.jsonl', lines=True)

In [9]:
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

In [5]:
llama_tokenizer("123").input_ids

[1, 29871, 29896, 29906, 29941]

In [10]:
y_true = []
y_scores = []
for i, row in df_rationale.iterrows():
    y_true.append(row['golden_judge'])
    score = row['rationale_log_prob'] - df_norationale.iloc[i]['norationale_log_prob']
    seq_length = len(llama_tokenizer(row['golden_answer']).input_ids)
    score = score / seq_length
    y_scores.append(score)

In [11]:
pos_scores = [score for score, label in zip(y_scores, y_true) if label == 1]
neg_scores = [score for score, label in zip(y_scores, y_true) if label == 0]

print("Mann-Whitney U test")
print(mannwhitneyu(pos_scores, neg_scores, alternative='less'))

print(np.mean(pos_scores))
print(np.mean(neg_scores))

Mann-Whitney U test
MannwhitneyuResult(statistic=4818571.5, pvalue=1.2513239616879862e-18)
1.0309257502721478
1.394922664116375


In [13]:
precs, recalls, ths = metrics.precision_recall_curve(y_true, y_scores)
# rank by best F1
fscores = []
for prec, recall in zip(precs, recalls):
    if prec + recall == 0:
        fscores.append(0)
    else:
        fscores.append(2 * prec * recall / (prec + recall))
accs = []
for th in ths:
    y_pred = [1 if score > th else 0 for score in y_scores]
    accs.append(metrics.accuracy_score(y_true, y_pred))
print("Best acc", max(accs))


fscores = np.array(fscores)
best_th = ths[np.argmax(fscores)]
print("Best threshold", best_th)

print(max(fscores))

for prec, recall, th in zip(precs, recalls, ths):
    if th == best_th:
        print("Best precision", prec)
        print("Best recall", recall)
        break

y_pred = [1 if score > best_th else 0 for score in y_scores]
print(metrics.classification_report(y_true, y_pred))

Best acc 0.7350993377483444
Best threshold -4.749399185175001
0.8474162277688726
Best precision 0.7352317880794702
Best recall 1.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1999
           1       0.74      1.00      0.85      5551

    accuracy                           0.74      7550
   macro avg       0.37      0.50      0.42      7550
weighted avg       0.54      0.74      0.62      7550



In [23]:
(precs + recalls)

array([1.73523179, 1.73501657, 1.73480134, ..., 0.        , 0.        ,
       1.        ])

In [28]:
print(max(accs))

0.7350993377483444
