In [1]:
from metrics import distances, eval, llm, overlaps, word_movers_distance
import json
import numpy as np
from pprint import pprint
# Load the pairwise-evaluation records. The context manager closes the file
# handle as soon as parsing finishes; JSON text is decoded explicitly as UTF-8
# instead of relying on the platform default encoding.
with open('data/pairwise_evaluation_w_embeddings.json', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)
# dataset format:
# [
#   {
#     article_id: str, writer_id: str, evaluator_id: str,
#     article_text: str, writer_summary: str, text-davinci-002_summary: str,
#     overall_writer_better: bool | "Equally Good", informative_writer_better: bool | "Equally Good",
#     full_embedding: [float], writer_summary_embedding: [float], llm_summary_embedding: [float]
#   },
#   ...
# ]

# test scripts
def test_cosine_distance(dataset, epsilon=0.001):
    confusion_matrix_overall = eval.llmConfusionMatrix()
    confusion_matrix_informative = eval.llmConfusionMatrix()
    for datum in dataset:
        full_embedding = np.array(datum['full_embedding'])
        writer_summary_embedding = np.array(datum['writer_summary_embedding'])
        llm_summary_embedding = np.array(datum['llm_summary_embedding'])
        # cosine distance
        d1 = distances.cosine_distance(full_embedding, writer_summary_embedding)
        d2 = distances.cosine_distance(full_embedding, llm_summary_embedding)
        pred = distances.cosine_distance_writer_better(d1, d2, epsilon)
        confusion_matrix_overall.add(datum['overall_writer_better'], pred)
        confusion_matrix_informative.add(datum['informative_writer_better'], pred)
    return confusion_matrix_overall, confusion_matrix_informative

def test_linear_regression(dataset, epsilon=0.001):
    confusion_matrix_overall = eval.llmConfusionMatrix()
    confusion_matrix_informative = eval.llmConfusionMatrix()
    # linear regression
    full_embeddings = np.array([datum['full_embedding'] for datum in dataset])
    writer_summary_embeddings = np.array([datum['writer_summary_embedding'] for datum in dataset])
    llm_summary_embeddings = np.array([datum['llm_summary_embedding'] for datum in dataset])
    # fit linear regression between full and writer summary and calculate distance
    writer_distances = distances.linear_regression_distances(full_embeddings, writer_summary_embeddings)

    # fit linear regression between full and llm summary and calculate distance
    llm_distances = distances.linear_regression_distances(full_embeddings, llm_summary_embeddings)
    predictions = list(map(lambda writer_distance, llm_distance: distances.linear_regression_writer_better(writer_distance, llm_distance, epsilon), writer_distances, llm_distances))
    overall_writer_better_labels = [datum['overall_writer_better'] for datum in dataset]
    informative_writer_better_labels = [datum['informative_writer_better'] for datum in dataset]
    confusion_matrix_overall.addList(overall_writer_better_labels, predictions)
    confusion_matrix_informative.addList(informative_writer_better_labels, predictions)

    return confusion_matrix_overall, confusion_matrix_informative

# def test_mover_score(dataset, epsilon=0.001):
#     confusion_matrix_overall = eval.llmConfusionMatrix()
#     confusion_matrix_informative = eval.llmConfusionMatrix()
#     for datum in dataset:
#         full_text = datum['article_text']
#         writer_summary = datum['writer_summary']
#         llm_summary = datum['text-davinci-002_summary']
#         # writer summary mover score
#         writer_mover_score = moverscore.corpus_score(full_text, writer_summary)
#         llm_mover_score = moverscore.corpus_score(full_text, llm_summary)
#         pred = moverscore.predict_writer_better(writer_mover_score, llm_mover_score, epsilon)
#         confusion_matrix_overall.add(datum['overall_writer_better'], pred)
#         confusion_matrix_informative.add(datum['informative_writer_better'], pred)
#     return confusion_matrix_overall, confusion_matrix_informative

def test_rogue(dataset, epsilon=0.001):
    confusion_matrix_overall = eval.llmConfusionMatrix()
    confusion_matrix_informative = eval.llmConfusionMatrix()
    for datum in dataset:
        full_text = datum['article_text']
        writer_summary = datum['writer_summary']
        llm_summary = datum['text-davinci-002_summary']
        # rogues
        writer_rogue = overlaps.rogue_score(full_text, writer_summary)
        llm_rogue = overlaps.rogue_score(full_text, llm_summary)
        # print(writer_rogue, llm_rogue)
        pred = overlaps.writer_better(writer_rogue, llm_rogue, higher_better=True, epsilon=epsilon)
        confusion_matrix_overall.add(datum['overall_writer_better'], pred)
        confusion_matrix_informative.add(datum['informative_writer_better'], pred)
    return confusion_matrix_overall, confusion_matrix_informative

def test_wmd(dataset, wmd_scorer, epsilon=0.001):
    confusion_matrix_overall = eval.llmConfusionMatrix()
    confusion_matrix_informative = eval.llmConfusionMatrix()
    for datum in dataset:
        full_text = datum['article_text']
        writer_summary = datum['writer_summary']
        llm_summary = datum['text-davinci-002_summary']
        # rogues
        writer_wmd = wmd_scorer.distance(full_text, writer_summary)
        llm_wmd = wmd_scorer.distance(full_text, llm_summary)
        pred = wmd_scorer.predict_writer_better(writer_wmd, llm_wmd, epsilon)
        confusion_matrix_overall.add(datum['overall_writer_better'], pred)
        confusion_matrix_informative.add(datum['informative_writer_better'], pred)
    return confusion_matrix_overall, confusion_matrix_informative


def test_llm(dataset, api_key):
    confusion_matrix_overall = eval.llmConfusionMatrix()
    confusion_matrix_informative = eval.llmConfusionMatrix()
    llmEvaluator = llm.LLMEvaluator(api_key)
    for index, datum in enumerate(dataset):
        full_text = datum['article_text']
        writer_summary = datum['writer_summary']
        llm_summary = datum['text-davinci-002_summary']
        # writer summary mover score
        pred = llmEvaluator.predict_writer_better(full_text, writer_summary, llm_summary)
        print(index, pred)
        confusion_matrix_overall.add(datum['overall_writer_better'], pred)
        confusion_matrix_informative.add(datum['informative_writer_better'], pred)
    return confusion_matrix_overall, confusion_matrix_informative


In [None]:
# test with cosine distance
# For each epsilon, print one row: epsilon, overall accuracy, informative
# accuracy, then the overall matrix's accuracy for the labels True, False and
# "Equally Good".
epsilons = [0.01, 0.001, 0.0001, 0.00001, 0.000001, 0]
for epsilon in epsilons:
    confusion_matrix_overall, confusion_matrix_informative = test_cosine_distance(dataset, epsilon=epsilon)
    overall_accuracy = confusion_matrix_overall.get_accuracy()
    informative_accuracy = confusion_matrix_informative.get_accuracy()
    per_label_accuracy = [
        confusion_matrix_overall.get_accuracy_by_label(label=verdict)
        for verdict in (True, False, "Equally Good")
    ]
    print(epsilon, overall_accuracy, informative_accuracy, *per_label_accuracy)

In [2]:
# test with linear regression
# Each printed row holds: epsilon, overall accuracy, informative accuracy, then
# the overall matrix's accuracy for the labels True, False and "Equally Good".
epsilons = [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0]
for epsilon in epsilons:
    confusion_matrix_overall, confusion_matrix_informative = test_linear_regression(dataset, epsilon=epsilon)
    overall_accuracy = confusion_matrix_overall.get_accuracy()
    informative_accuracy = confusion_matrix_informative.get_accuracy()
    per_label_accuracy = [
        confusion_matrix_overall.get_accuracy_by_label(label=verdict)
        for verdict in (True, False, "Equally Good")
    ]
    print(epsilon, overall_accuracy, informative_accuracy, *per_label_accuracy)
    # NOTE(review): this break ends the sweep after the first epsilon (10), so
    # the remaining values are never evaluated. Remove it to run the full sweep.
    break

(599,)
(599,)
10 0.19532554257095158 0.22036727879799667 0.0 0.0 1.0


In [None]:
# test with mover score
# 0.01 0.2637729549248748 0.24874791318864775
# 0.001 0.2888146911519199 0.2687813021702838
# 0.0001 0.29549248747913187 0.27545909849749584
# 1e-05 0.2988313856427379 0.27879799666110183
# 1e-06 0.2988313856427379 0.27879799666110183
# 0 0.2988313856427379 0.27879799666110183

In [None]:
# test with rogue
# For each epsilon, print one row: epsilon, overall accuracy, informative
# accuracy, then the overall matrix's accuracy for the labels True, False and
# "Equally Good".
epsilons = [0.1, 0.02, 0.01, 0.005, 0.001, 0.0001, 0]
for epsilon in epsilons:
    confusion_matrix_overall, confusion_matrix_informative = test_rogue(dataset, epsilon=epsilon)
    overall_accuracy = confusion_matrix_overall.get_accuracy()
    informative_accuracy = confusion_matrix_informative.get_accuracy()
    per_label_accuracy = [
        confusion_matrix_overall.get_accuracy_by_label(label=verdict)
        for verdict in (True, False, "Equally Good")
    ]
    print(epsilon, overall_accuracy, informative_accuracy, *per_label_accuracy)

In [None]:
# todo: check imbalance
from collections import defaultdict
# Reload the records inside a context manager so the file handle is closed
# right after parsing; JSON text is decoded explicitly as UTF-8.
with open('data/pairwise_evaluation_w_embeddings.json', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)
# Group the records by their human "overall" verdict so the size of each
# class (True / False / "Equally Good") can be compared.
label_counts = defaultdict(list)
for datum in dataset:
    label = datum['overall_writer_better']
    label_counts[label].append(datum)
# Printed order: writer better, LLM better, equally good.
print(len(label_counts[True]), len(label_counts[False]), len(label_counts['Equally Good']))

In [None]:
# test with wmd
# One scorer instance is shared by every epsilon in the sweep. Each printed row
# holds: epsilon, overall accuracy, informative accuracy, then the overall
# matrix's accuracy for the labels True, False and "Equally Good".
epsilons = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0]
wmd_scorer = word_movers_distance.WordMoversDistance()
for epsilon in epsilons:
    confusion_matrix_overall, confusion_matrix_informative = test_wmd(dataset, wmd_scorer, epsilon=epsilon)
    overall_accuracy = confusion_matrix_overall.get_accuracy()
    informative_accuracy = confusion_matrix_informative.get_accuracy()
    per_label_accuracy = [
        confusion_matrix_overall.get_accuracy_by_label(label=verdict)
        for verdict in (True, False, "Equally Good")
    ]
    print(epsilon, overall_accuracy, informative_accuracy, *per_label_accuracy)

In [None]:
# Read the API key from the local "api_key" file. The context manager closes
# the handle, and strip() removes the trailing newline that editors and
# `echo` usually leave, so it is not passed on as part of the key.
with open("api_key") as api_key_file:
    api_key = api_key_file.read().strip()
# test with llm evaluator
confusion_matrix_overall, confusion_matrix_informative = test_llm(dataset, api_key)
# Printed order: overall accuracy, informative accuracy.
print(confusion_matrix_overall.get_accuracy(), confusion_matrix_informative.get_accuracy())