# Notebook: Compare LLM and Human Annotations

## Packages

In [71]:
import pandas as pd
import itertools
import json
import sys
import os

In [60]:
# Extend the module search path with the sibling training directory. This has
# to run before the two imports below.
# NOTE(review): TASD and constants are presumably located in '../07 train models/' - confirm.
sys.path.append(os.path.abspath('../07 train models/'))
from TASD.evaluation import calculate_metrics_for_examples
import constants

## Settings

In [48]:
# Models whose annotations are compared; each name is also a directory name
# in the path of the synthetic split files.
LLMS = [
    "Llama70B",
    "GPT-3",
]

# Few-shot conditions to evaluate; matched against the condition stored with
# each human annotation.
FEW_SHOT_CONDITIONS = [
    "random",
    "fixed",
]

## Code

### Load Data

#### Human Annotations

In [49]:
# Load the human-annotated examples. The encoding is given explicitly so the
# file is decoded as UTF-8 regardless of the platform's default locale.
with open("annotation_datasets/annotated_synth_dataset.json", "r", encoding="utf-8") as json_file:
    human_annotations = json.load(json_file)

#### LLM Annotations (Synthetic Data)

In [50]:
# Number of split files (split_0.json ... split_4.json) read per LLM and few-shot condition.
N_SPLITS = 5

# Gather the synthetic examples of every LLM, few-shot condition and split
# into one flat list.
llm_annotations = []

for llm in LLMS:
    # Use the shared setting so this cell and the evaluation cover the same conditions.
    for fs in FEW_SHOT_CONDITIONS:
        for split_id in range(N_SPLITS):
            split_path = f"../07 train models/synth/{llm}/{fs}/split_{split_id}.json"
            # Explicit encoding: do not depend on the platform's default locale.
            with open(split_path, "r", encoding="utf-8") as json_file:
                llm_annotations.extend(json.load(json_file))

In [51]:
# Sanity check: number of loaded LLM annotations and human annotations.
len(llm_annotations), len(human_annotations)

(34750, 2293)

In [52]:
# Reduce every example to its aspect triplets plus the fields needed to match
# LLM and human annotations later on.

# LLM examples become (aspects, id); the aspect term is taken over unchanged.
llm_annotations_aspects = [
    (
        [
            {
                "aspect_category": t["label"],
                "aspect_polarity": t["polarity"],
                "aspect_term": t["text"],
            }
            for t in ex["tags"]
        ],
        ex["id"],
    )
    for ex in llm_annotations
]

# Human examples become (aspects, id, model, few-shot condition); an aspect
# term equal to the string 'NULL' is replaced by None.
human_annotations_aspects = [
    (
        [
            {
                "aspect_category": t["label"],
                "aspect_polarity": t["polarity"],
                "aspect_term": None if t["text"] == 'NULL' else t["text"],
            }
            for t in ex["tags"]
        ],
        ex["id"],
        ex["model"],
        # The key is read exactly as spelled here ("condtion").
        ex["few_shot_condtion"],
    )
    for ex in human_annotations
]

In [54]:
# Inspect the first converted human annotation: (aspects, id, model, few-shot condition).
human_annotations_aspects[0]

([{'aspect_category': 'GENERAL-IMPRESSION',
   'aspect_polarity': 'POSITIVE',
   'aspect_term': None}],
 'a49a6f01-1ecc-4da0-b76b-f283f518fc60',
 'Llama70B',
 'random')

### Analyse Quality

#### Triplets

In [77]:
def get_example_with_id(id, dataset):
    """Return the aspect list of the first entry in ``dataset`` whose id equals ``id``.

    Args:
        id: Identifier to look for; compared against position 1 of each entry.
        dataset: Iterable of tuples laid out as ``(aspects, example_id, ...)``.

    Returns:
        Position 0 (the aspect list) of the first matching entry.

    Raises:
        IndexError: If no entry carries the requested id.
    """
    # Stop at the first hit instead of collecting every match in a list.
    match = next((example for example in dataset if example[1] == id), None)
    if match is None:
        # Same exception type as indexing an empty result list, but the
        # message names the id that could not be found.
        raise IndexError(f"No example with id {id!r} found in dataset")
    return match[0]

# Index the LLM annotations by example id once, so every human annotation is
# matched with a dictionary lookup instead of a scan over the whole list.
# setdefault keeps the first entry per id, i.e. the entry a front-to-back
# search through the list would return.
llm_aspects_by_id = {}
for llm_aspects, llm_example_id in llm_annotations_aspects:
    llm_aspects_by_id.setdefault(llm_example_id, llm_aspects)

for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        # Human-annotated examples for this LLM and few-shot condition
        # (tuple positions: 2 = model, 3 = few-shot condition).
        selected = [example for example in human_annotations_aspects
                    if example[2] == llm and example[3] == fs]
        human_annotations_aspects_ids = [example[1] for example in selected]
        human_annotations_samples = [example[0] for example in selected]
        # Both sample lists share the same order; an id without an LLM
        # annotation raises KeyError, which names the id.
        llm_annotations_samples = [llm_aspects_by_id[example_id]
                                   for example_id in human_annotations_aspects_ids]

        print(llm, fs, "\n", "----- ----- -----")
        print(calculate_metrics_for_examples(
            human_annotations_samples, llm_annotations_samples))

Llama70B random 
 ----- ----- -----
{'f1': 0.4669312169312169, 'recall': 0.500709219858156, 'precision': 0.43742255266418834, 'accuracy': 0.30457290767903367, 'tp': 353, 'tn': 0, 'fp': 454, 'fn': 352}
Llama70B fixed 
 ----- ----- -----
{'f1': 0.4382227632379793, 'recall': 0.46272493573264784, 'precision': 0.4161849710982659, 'accuracy': 0.2805923616523772, 'tp': 360, 'tn': 0, 'fp': 505, 'fn': 418}
GPT-3 random 
 ----- ----- -----
{'f1': 0.572644376899696, 'recall': 0.576499388004896, 'precision': 0.5688405797101449, 'accuracy': 0.40119250425894376, 'tp': 471, 'tn': 0, 'fp': 357, 'fn': 346}
GPT-3 fixed 
 ----- ----- -----
{'f1': 0.5832414553472989, 'recall': 0.5977401129943503, 'precision': 0.5694294940796556, 'accuracy': 0.41167315175097274, 'tp': 529, 'tn': 0, 'fp': 400, 'fn': 356}
