In [8]:
from metrics import distances, eval, llm, overlaps, word_movers_distance
import json
import numpy as np
from pprint import pprint
# Load the pairwise-evaluation records used by every test below.
# A context manager is used so the file handle is closed as soon as parsing
# finishes; the encoding is pinned to UTF-8, the standard JSON text encoding.
with open('data/pairwise_evaluation_w_embeddings.json', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)
# dataset format:
# [
#   {
#     article_id: str, writer_id: str, evaluator_id: str,
#     article_text: str, writer_summary: str, text-davinci-002_summary: str,
#     overall_writer_better: bool | "Equally Good", informative_writer_better: bool | "Equally Good",
#     full_embedding: [float], writer_summary_embedding: [float], llm_summary_embedding: [float]
#   },
#   ...
# ]

# test scripts
def test_cosine_distance(dataset, epsilon=0.001):
    """Evaluate the cosine-distance metric against both human labels.

    For every record the article embedding is compared with the writer-summary
    embedding and with the LLM-summary embedding via
    ``distances.cosine_distance``; the two distances are turned into a
    prediction by ``distances.cosine_distance_writer_better``.

    Args:
        dataset: iterable of records providing ``full_embedding``,
            ``writer_summary_embedding``, ``llm_summary_embedding`` and the
            ``overall_writer_better`` / ``informative_writer_better`` labels.
        epsilon: forwarded unchanged to
            ``distances.cosine_distance_writer_better`` (presumably the margin
            within which the summaries count as a tie -- confirm in ``metrics``).

    Returns:
        ``(overall, informative)``: two ``eval.llmConfusionMatrix`` objects fed
        with the same predictions but different ground-truth labels.
    """
    overall_cm = eval.llmConfusionMatrix()
    informative_cm = eval.llmConfusionMatrix()
    embedding_keys = ('full_embedding', 'writer_summary_embedding', 'llm_summary_embedding')
    for record in dataset:
        article_vec, writer_vec, llm_vec = (np.array(record[key]) for key in embedding_keys)
        # Distance of each summary from the source article.
        writer_distance = distances.cosine_distance(article_vec, writer_vec)
        llm_distance = distances.cosine_distance(article_vec, llm_vec)
        prediction = distances.cosine_distance_writer_better(writer_distance, llm_distance, epsilon)
        # The same prediction is scored against both label columns.
        overall_cm.add(record['overall_writer_better'], prediction)
        informative_cm.add(record['informative_writer_better'], prediction)
    return overall_cm, informative_cm

def test_linear_regression(dataset, epsilon=0.001):
    """Evaluate the linear-regression distance metric against both human labels.

    All embeddings are stacked into arrays (one row per record), then
    ``distances.linear_regression_distance`` is called once for the
    article/writer pair and once for the article/LLM pair. The resulting
    distances are paired up element by element and passed to
    ``distances.linear_regression_writer_better`` to obtain predictions.

    Args:
        dataset: re-iterable collection of records (it is traversed several
            times) providing the three ``*_embedding`` fields and the
            ``overall_writer_better`` / ``informative_writer_better`` labels.
        epsilon: forwarded unchanged to
            ``distances.linear_regression_writer_better``.

    Returns:
        ``(overall, informative)``: two ``eval.llmConfusionMatrix`` objects fed
        with the same predictions but different ground-truth labels.
    """
    overall_cm = eval.llmConfusionMatrix()
    informative_cm = eval.llmConfusionMatrix()

    def stack(field):
        # One row per record for the requested embedding field.
        return np.array([record[field] for record in dataset])

    article_matrix = stack('full_embedding')
    writer_matrix = stack('writer_summary_embedding')
    llm_matrix = stack('llm_summary_embedding')

    # Distances for article -> writer summary and article -> LLM summary.
    writer_distances = distances.linear_regression_distance(article_matrix, writer_matrix)
    llm_distances = distances.linear_regression_distance(article_matrix, llm_matrix)

    predictions = [
        distances.linear_regression_writer_better(writer_d, llm_d, epsilon)
        for writer_d, llm_d in zip(writer_distances, llm_distances)
    ]
    overall_labels = [record['overall_writer_better'] for record in dataset]
    informative_labels = [record['informative_writer_better'] for record in dataset]
    overall_cm.addList(overall_labels, predictions)
    informative_cm.addList(informative_labels, predictions)

    return overall_cm, informative_cm

# def test_mover_score(dataset, epsilon=0.001):
#     confusion_matrix_overall = eval.llmConfusionMatrix()
#     confusion_matrix_informative = eval.llmConfusionMatrix()
#     for datum in dataset:
#         full_text = datum['article_text']
#         writer_summary = datum['writer_summary']
#         llm_summary = datum['text-davinci-002_summary']
#         # writer summary mover score
#         writer_mover_score = moverscore.corpus_score(full_text, writer_summary)
#         llm_mover_score = moverscore.corpus_score(full_text, llm_summary)
#         pred = moverscore.predict_writer_better(writer_mover_score, llm_mover_score, epsilon)
#         confusion_matrix_overall.add(datum['overall_writer_better'], pred)
#         confusion_matrix_informative.add(datum['informative_writer_better'], pred)
#     return confusion_matrix_overall, confusion_matrix_informative

def test_rogue(dataset, epsilon=0.001):
    """Evaluate the overlap score from ``overlaps.rogue_score`` against both human labels.

    Each summary is scored against the article text, the two scores are
    printed (one line per record), and ``overlaps.rogue_writer_better`` turns
    them into a prediction.

    Args:
        dataset: iterable of records providing ``article_text``,
            ``writer_summary``, ``text-davinci-002_summary`` and the
            ``overall_writer_better`` / ``informative_writer_better`` labels.
        epsilon: forwarded unchanged to ``overlaps.rogue_writer_better``.

    Returns:
        ``(overall, informative)``: two ``eval.llmConfusionMatrix`` objects fed
        with the same predictions but different ground-truth labels.
    """
    overall_cm = eval.llmConfusionMatrix()
    informative_cm = eval.llmConfusionMatrix()
    text_keys = ('article_text', 'writer_summary', 'text-davinci-002_summary')
    for record in dataset:
        article, writer_text, llm_text = (record[key] for key in text_keys)
        writer_score = overlaps.rogue_score(article, writer_text)
        llm_score = overlaps.rogue_score(article, llm_text)
        # Per-record trace of the two raw scores.
        print(writer_score, llm_score)
        prediction = overlaps.rogue_writer_better(writer_score, llm_score, epsilon)
        overall_cm.add(record['overall_writer_better'], prediction)
        informative_cm.add(record['informative_writer_better'], prediction)
    return overall_cm, informative_cm

def test_wmd(dataset, wmd_scorer, epsilon=0.001):
    """Evaluate a word-mover's-distance scorer against both human labels.

    Args:
        dataset: iterable of records providing ``article_text``,
            ``writer_summary``, ``text-davinci-002_summary`` and the
            ``overall_writer_better`` / ``informative_writer_better`` labels.
        wmd_scorer: object exposing ``distance(text, summary)`` and
            ``predict_writer_better(writer_distance, llm_distance, epsilon)``.
        epsilon: forwarded unchanged to ``wmd_scorer.predict_writer_better``.

    Returns:
        ``(overall, informative)``: two ``eval.llmConfusionMatrix`` objects fed
        with the same predictions but different ground-truth labels.
    """
    overall_cm = eval.llmConfusionMatrix()
    informative_cm = eval.llmConfusionMatrix()
    text_keys = ('article_text', 'writer_summary', 'text-davinci-002_summary')
    for record in dataset:
        article, writer_text, llm_text = (record[key] for key in text_keys)
        # Distance of each summary from the article text.
        writer_distance = wmd_scorer.distance(article, writer_text)
        llm_distance = wmd_scorer.distance(article, llm_text)
        prediction = wmd_scorer.predict_writer_better(writer_distance, llm_distance, epsilon)
        overall_cm.add(record['overall_writer_better'], prediction)
        informative_cm.add(record['informative_writer_better'], prediction)
    return overall_cm, informative_cm


def test_llm(dataset, api_key):
    """Evaluate the LLM-based judge against both human labels.

    An ``llm.LLMEvaluator`` is built from ``api_key`` and asked, record by
    record, whether the writer summary is better than the LLM summary. Each
    prediction is printed together with its record index.

    Args:
        dataset: iterable of records providing ``article_text``,
            ``writer_summary``, ``text-davinci-002_summary`` and the
            ``overall_writer_better`` / ``informative_writer_better`` labels.
        api_key: credential handed to ``llm.LLMEvaluator``.

    Returns:
        ``(overall, informative)``: two ``eval.llmConfusionMatrix`` objects fed
        with the same predictions but different ground-truth labels.
    """
    overall_cm = eval.llmConfusionMatrix()
    informative_cm = eval.llmConfusionMatrix()
    judge = llm.LLMEvaluator(api_key)
    text_keys = ('article_text', 'writer_summary', 'text-davinci-002_summary')
    for position, record in enumerate(dataset):
        article, writer_text, llm_text = (record[key] for key in text_keys)
        prediction = judge.predict_writer_better(article, writer_text, llm_text)
        # Progress trace: record index and the judge's verdict.
        print(position, prediction)
        overall_cm.add(record['overall_writer_better'], prediction)
        informative_cm.add(record['informative_writer_better'], prediction)
    return overall_cm, informative_cm


In [5]:
# Cosine-distance metric: sweep epsilon and print one row per value.
# Row layout: epsilon, overall accuracy, informative accuracy, then the
# overall per-label accuracy for True, False and "Equally Good".
epsilons = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 0]
for epsilon in epsilons:
    confusion_matrix_overall, confusion_matrix_informative = test_cosine_distance(
        dataset, epsilon=epsilon
    )
    print(
        epsilon,
        confusion_matrix_overall.get_accuracy(),
        confusion_matrix_informative.get_accuracy(),
        *[
            confusion_matrix_overall.get_accuracy_by_label(label=truth)
            for truth in (True, False, "Equally Good")
        ]
    )

0.01 0.330550918196995 0.32053422370617696 0.5432098765432098 0.09623430962343096 0.36752136752136755
0.001 0.3572621035058431 0.327212020033389 0.691358024691358 0.1799163179916318 0.02564102564102564
0.0001 0.3656093489148581 0.335559265442404 0.7078189300411523 0.19665271966527198 0.0
1e-05 0.3656093489148581 0.335559265442404 0.7078189300411523 0.19665271966527198 0.0
1e-06 0.3656093489148581 0.335559265442404 0.7078189300411523 0.19665271966527198 0.0
0 0.3656093489148581 0.335559265442404 0.7078189300411523 0.19665271966527198 0.0


In [3]:
# Linear-regression metric: sweep epsilon and print one row per value.
# Row layout: epsilon, overall accuracy, informative accuracy, then the
# overall per-label accuracy for True, False and "Equally Good".
epsilons = [10, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 0]
for epsilon in epsilons:
    confusion_matrix_overall, confusion_matrix_informative = test_linear_regression(
        dataset, epsilon=epsilon
    )
    # Uncomment to inspect the full matrix:
    # print(confusion_matrix_overall)
    print(
        epsilon,
        confusion_matrix_overall.get_accuracy(),
        confusion_matrix_informative.get_accuracy(),
        *[
            confusion_matrix_overall.get_accuracy_by_label(label=truth)
            for truth in (True, False, "Equally Good")
        ]
    )

10 0.2020033388981636 0.2287145242070117 0.0205761316872428 0.03347280334728033 0.9230769230769231
1 0.3789649415692821 0.38564273789649417 0.43209876543209874 0.38493723849372385 0.2564102564102564
0.1 0.39065108514190316 0.38731218697829717 0.4897119341563786 0.47280334728033474 0.017094017094017096
0.01 0.39232053422370616 0.38898163606010017 0.49794238683127573 0.4769874476987448 0.0
0.001 0.39232053422370616 0.38898163606010017 0.49794238683127573 0.4769874476987448 0.0
0.0001 0.39232053422370616 0.38898163606010017 0.49794238683127573 0.4769874476987448 0.0
1e-05 0.39232053422370616 0.38898163606010017 0.49794238683127573 0.4769874476987448 0.0
1e-06 0.39232053422370616 0.38898163606010017 0.49794238683127573 0.4769874476987448 0.0
0 0.39232053422370616 0.38898163606010017 0.49794238683127573 0.4769874476987448 0.0


In [None]:
# test with mover score
# 0.01 0.2637729549248748 0.24874791318864775
# 0.001 0.2888146911519199 0.2687813021702838
# 0.0001 0.29549248747913187 0.27545909849749584
# 1e-05 0.2988313856427379 0.27879799666110183
# 1e-06 0.2988313856427379 0.27879799666110183
# 0 0.2988313856427379 0.27879799666110183

In [9]:
# Overlap metric from test_rogue.
# Only the slice epsilons[2:3] (the single value 0.01) is evaluated, and the
# summary row is disabled, so the only output of this cell is the per-record
# score trace printed inside test_rogue.
epsilons = [
    0.1,
    0.02,
    0.01,
    0.005,
    0.001,
    0.0001,
    0,
]
for epsilon in epsilons[2:3]:
    confusion_matrix_overall, confusion_matrix_informative = test_rogue(
        dataset, epsilon=epsilon
    )
    # Disabled reporting, kept for re-enabling:
    # print(confusion_matrix_overall)
    # print(epsilon, confusion_matrix_overall.get_accuracy(), confusion_matrix_informative.get_accuracy(), confusion_matrix_overall.get_accuracy_by_label(label=True), confusion_matrix_overall.get_accuracy_by_label(label=False), confusion_matrix_overall.get_accuracy_by_label(label="Equally Good"))

0.12224938875305623 0.19753086419753085
0.08033826638477802 0.16535433070866146
0.12332439678284182 0.18428184281842816
0.1090909090909091 0.1793372319688109
0.08097165991902834 0.12195121951219512
0.050089445438282656 0.04864864864864866
0.07734806629834254 0.10546139359698682
0.06172839506172839 0.09690721649484536
0.050089445438282656 0.04864864864864866
0.14868105515587532 0.27488151658767773
0.12332439678284182 0.18428184281842816
0.08033826638477802 0.16535433070866146
0.15121951219512195 0.12500000000000003
0.14285714285714285 0.08716707021791767
0.0632411067193676 0.052589641434262945
0.06762295081967212 0.0411522633744856
0.061403508771929835 0.11534276387377583
0.07814149947201689 0.09371671991480297
0.0632411067193676 0.052589641434262945
0.11608623548922058 0.12162162162162163
0.0826032540675845 0.11645569620253166
0.0654911838790932 0.11645569620253166
0.1088709677419355 0.11570247933884298
0.08098987626546682 0.06422018348623854
0.1297071129707113 0.11764705882352941
0.12

In [4]:
# todo: check imbalance
# Group the records by their overall human label and print the group sizes
# in the order True, False, "Equally Good".
from collections import defaultdict

# Context manager closes the file handle once parsing is done; the encoding is
# pinned to UTF-8, the standard JSON text encoding.
with open('data/pairwise_evaluation_w_embeddings.json', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

label_counts = defaultdict(list)
for datum in dataset:
    label = datum['overall_writer_better']
    label_counts[label].append(datum)
print(len(label_counts[True]), len(label_counts[False]), len(label_counts['Equally Good']))

243 239 117


In [6]:
# Word-mover's-distance metric: sweep epsilon and print one row per value.
# Row layout: epsilon, overall accuracy, informative accuracy, then the
# overall per-label accuracy for True, False and "Equally Good".
epsilons = [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 0]
# The scorer is built once and reused for every epsilon.
wmd_scorer = word_movers_distance.WordMoversDistance()
for epsilon in epsilons:
    confusion_matrix_overall, confusion_matrix_informative = test_wmd(
        dataset, wmd_scorer, epsilon=epsilon
    )
    # Uncomment to inspect the full matrix:
    # print(confusion_matrix_overall)
    print(
        epsilon,
        confusion_matrix_overall.get_accuracy(),
        confusion_matrix_informative.get_accuracy(),
        *[
            confusion_matrix_overall.get_accuracy_by_label(label=truth)
            for truth in (True, False, "Equally Good")
        ]
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samytlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1 0.19532554257095158 0.22036727879799667 0.0 0.0 1.0
0.1 0.28213689482470783 0.3055091819699499 0.04526748971193416 0.23430962343096234 0.8717948717948718
0.01 0.46243739565943237 0.4741235392320534 0.411522633744856 0.6359832635983264 0.21367521367521367
0.001 0.4707846410684474 0.4674457429048414 0.4444444444444444 0.7154811715481172 0.02564102564102564
0.0001 0.4741235392320534 0.4707846410684474 0.4567901234567901 0.7154811715481172 0.017094017094017096
1e-05 0.4741235392320534 0.4707846410684474 0.46502057613168724 0.7154811715481172 0.0
1e-06 0.4741235392320534 0.4707846410684474 0.46502057613168724 0.7154811715481172 0.0
0 0.4741235392320534 0.4707846410684474 0.46502057613168724 0.7154811715481172 0.0


In [2]:
# Read the API key from the local "api_key" file. The context manager closes
# the handle, and strip() removes surrounding whitespace such as the trailing
# newline most editors append, which would otherwise become part of the key.
with open("api_key") as key_file:
    api_key = key_file.read().strip()
# test with llm evaluator
confusion_matrix_overall, confusion_matrix_informative = test_llm(dataset, api_key)
# print(confusion_matrix_overall)
# Output: overall accuracy followed by informative accuracy.
print(confusion_matrix_overall.get_accuracy(), confusion_matrix_informative.get_accuracy())

0 Equally Good
1 False
2 Equally Good
3 Equally Good
4 Equally Good
5 Equally Good
6 Equally Good
7 Equally Good
8 Equally Good
9 False
10 Equally Good
11 False
12 Equally Good
13 True
14 False
15 Equally Good
16 Equally Good
17 Equally Good
18 False
19 False
20 Equally Good
21 True
22 True
23 Equally Good
24 True
25 False
26 Equally Good
27 Equally Good
28 Equally Good
29 Equally Good
30 True
31 True
32 Equally Good
33 False
34 True
35 Equally Good
36 Equally Good
37 Equally Good
38 False
39 Equally Good
40 Equally Good
41 True
42 Equally Good
43 False
44 True
45 False
46 Equally Good
47 Equally Good
48 Equally Good
49 False
50 True
51 Equally Good
52 Equally Good
53 Equally Good
54 Equally Good
55 True
56 False
57 False
58 False
59 Equally Good
60 Equally Good
61 True
62 True
63 Equally Good
64 Equally Good
65 Equally Good
66 Equally Good
67 Equally Good
68 Equally Good
69 Equally Good
70 True
71 Equally Good
72 False
73 False
74 Equally Good
75 True
76 False
77 Equally Good
78 True
