# Notebook: Compare LLM and Human Annotations

## Packages

In [49]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import json
import sys
import os

In [50]:
# Put the sibling "07 train models" directory on the import path before the
# project-local imports below (presumably where the TASD package and the
# constants module live; verify against the repository layout).
sys.path.append(os.path.abspath('../07 train models/'))
from TASD.evaluation import calculate_metrics_for_examples
import constants

## Settings

In [63]:
# Models whose annotations are compared with the human labels; the names are
# also used as directory names when the synthetic splits are loaded.
LLMS = ["Llama70B", "GPT-3"]

# Few-shot conditions; matched against the condition stored with each human annotation.
FEW_SHOT_CONDITIONS = ["random", "fixed"]

# Every "<aspect category>_<polarity>" label, category-major (all polarities of
# the first category, then all polarities of the second, ...).
AC_POLARITY_COMBINATIONS = [
    category + "_" + sentiment
    for category in constants.ASPECT_CATEGORIES
    for sentiment in constants.POLARITIES
]

## Code

### Load Data

#### Human Annotations

In [52]:
# Read the human-annotated examples from disk (path is relative to the notebook).
with open("annotation_datasets/annotated_synth_dataset.json") as json_file:
    human_annotations = json.load(json_file)

#### Load Synthetic Data (LLM Annotations)

In [53]:
# Gather every LLM-annotated example from all models, few-shot conditions and
# splits into one flat list.
N_SPLITS = 5  # split files are numbered split_0.json ... split_4.json

llm_annotations = []

for llm in LLMS:
    # Use the shared FEW_SHOT_CONDITIONS setting instead of a second hard-coded
    # copy, so loading cannot drift from the analysis cells that iterate it.
    for fs in FEW_SHOT_CONDITIONS:
        for split_id in range(N_SPLITS):
            split_path = f"../07 train models/synth/{llm}/{fs}/split_{split_id}.json"
            with open(split_path, 'r') as json_file:
                synthetic_data_split = json.load(json_file)
            # The file is fully parsed at this point, so the handle can be
            # closed before the examples are added.
            llm_annotations.extend(synthetic_data_split)

In [54]:
def _tags_to_aspects(tags):
    """Convert raw annotation tags into aspect dicts.

    Each tag's label/polarity/text/start/end fields are copied into a dict
    keyed aspect_category/aspect_polarity/aspect_term/start/end. A text of
    'NULL' is stored as a None aspect term.
    """
    aspects = []
    for tag in tags:
        aspects.append({
            "aspect_category": tag["label"],
            "aspect_polarity": tag["polarity"],
            "aspect_term": None if tag["text"] == 'NULL' else tag["text"],
            "start": tag["start"],
            "end": tag["end"],
        })
    return aspects


# LLM side: (aspects, example id)
llm_annotations_aspects = [
    (_tags_to_aspects(example["tags"]), example["id"])
    for example in llm_annotations
]

# Human side: (aspects, example id, model, few-shot condition).
# The "few_shot_condtion" key is read exactly as written here; it must match the
# spelling used in the annotation file.
human_annotations_aspects = [
    (
        _tags_to_aspects(example["tags"]),
        example["id"],
        example["model"],
        example["few_shot_condtion"],
    )
    for example in human_annotations
]

In [55]:
# Show the first converted human annotation: (aspects, example id, model, few-shot condition).
human_annotations_aspects[0]

([{'aspect_category': 'GENERAL-IMPRESSION',
   'aspect_polarity': 'POSITIVE',
   'aspect_term': None,
   'start': 0,
   'end': 0}],
 'a49a6f01-1ecc-4da0-b76b-f283f518fc60',
 'Llama70B',
 'random')

In [56]:
# Container for the paired annotations; populated further down as
# dataset[llm][few-shot condition]["human_annotation" | "llm_annotation"].
dataset = {}

In [57]:
def get_example_with_id(id, dataset):
    """Return the payload of the first entry in ``dataset`` with a matching id.

    Args:
        id: Identifier to look for; compared with ``==`` against item 1 of
            each entry.
        dataset: Iterable of tuples whose item 0 is the payload to return and
            whose item 1 is the example id.

    Returns:
        Item 0 of the first entry whose id equals ``id``.

    Raises:
        IndexError: If no entry carries the requested id. The message names
            the missing id.
    """
    for example in dataset:
        if example[1] == id:
            # Return on the first hit rather than collecting every match.
            return example[0]
    raise IndexError(f"no example with id {id!r} found in dataset")

# Index the LLM annotations by example id once, so each lookup below is a dict
# access instead of a scan over the whole list. setdefault keeps the first
# entry for an id, i.e. the same entry a first-match linear search would return.
llm_aspects_by_id = {}
for llm_aspects, llm_example_id in llm_annotations_aspects:
    llm_aspects_by_id.setdefault(llm_example_id, llm_aspects)

for llm in LLMS:
    dataset[llm] = {}
    for fs in FEW_SHOT_CONDITIONS:
        # Human-annotated examples recorded for this model and few-shot
        # condition; filtered once and reused for both ids and aspect lists.
        selected_examples = [
            example for example in human_annotations_aspects
            if example[2] == llm and example[3] == fs
        ]
        human_annotations_aspects_ids = [example[1] for example in selected_examples]
        human_annotations_samples = [example[0] for example in selected_examples]

        # LLM annotations in the same order as the human ones, paired by id.
        # A human-annotated id with no LLM counterpart raises KeyError here.
        llm_annotations_samples = [
            llm_aspects_by_id[example_id] for example_id in human_annotations_aspects_ids
        ]

        dataset[llm][fs] = {
            "human_annotation": human_annotations_samples,
            "llm_annotation": llm_annotations_samples,
        }

### Analyse Quality

#### Triplets

In [59]:
# Print the metrics returned by calculate_metrics_for_examples for every
# model / few-shot condition pair.
for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        print(llm, fs, "\n", "----- ----- -----")
        paired = dataset[llm][fs]
        triplet_metrics = calculate_metrics_for_examples(
            paired["human_annotation"], paired["llm_annotation"])
        print(triplet_metrics)

Llama70B random 
 ----- ----- -----
{'f1': 0.4669312169312169, 'recall': 0.500709219858156, 'precision': 0.43742255266418834, 'accuracy': 0.30457290767903367, 'tp': 353, 'tn': 0, 'fp': 454, 'fn': 352}
Llama70B fixed 
 ----- ----- -----
{'f1': 0.4382227632379793, 'recall': 0.46272493573264784, 'precision': 0.4161849710982659, 'accuracy': 0.2805923616523772, 'tp': 360, 'tn': 0, 'fp': 505, 'fn': 418}
GPT-3 random 
 ----- ----- -----
{'f1': 0.572644376899696, 'recall': 0.576499388004896, 'precision': 0.5688405797101449, 'accuracy': 0.40119250425894376, 'tp': 471, 'tn': 0, 'fp': 357, 'fn': 346}
GPT-3 fixed 
 ----- ----- -----
{'f1': 0.5832414553472989, 'recall': 0.5977401129943503, 'precision': 0.5694294940796556, 'accuracy': 0.41167315175097274, 'tp': 529, 'tn': 0, 'fp': 400, 'fn': 356}


#### Aspect Category

In [60]:
def category_list_to_label(cat_list):
    """Encode aspect categories as a multi-hot vector.

    The result has one entry per item of constants.ASPECT_CATEGORIES, in that
    order: 1 if the category occurs in ``cat_list``, otherwise 0.
    """
    return [int(category in cat_list) for category in constants.ASPECT_CATEGORIES]

In [61]:
# Aspect-category agreement per model / few-shot condition, printed as
# "&"-separated fields (presumably for pasting into a LaTeX table).
for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        paired = dataset[llm][fs]

        # One multi-hot category vector per example.
        llm_annotations_aspect_categories = [
            category_list_to_label([tag["aspect_category"] for tag in example])
            for example in paired["llm_annotation"]
        ]
        human_annotations_aspect_categories = [
            category_list_to_label([tag["aspect_category"] for tag in example])
            for example in paired["human_annotation"]
        ]

        # Human labels are the reference (y_true), LLM labels the prediction.
        # With multi-label rows, accuracy_score counts a row as correct only
        # when all of its labels match.
        y_true = human_annotations_aspect_categories
        y_pred = llm_annotations_aspect_categories
        accuracy = accuracy_score(y_true, y_pred)
        f1_micro = f1_score(y_true, y_pred, average='micro')
        f1_macro = f1_score(y_true, y_pred, average='macro')

        print(llm, "&", fs, "&", accuracy, "&", f1_micro, "&", f1_macro)

Llama70B & random & 0.708029197080292 & 0.8110795454545454 & 0.8037365117505718
Llama70B & fixed & 0.6498194945848376 & 0.7824377457404981 & 0.7734307720212131
GPT-3 & random & 0.912751677852349 & 0.9550706033376123 & 0.9555349753741205
GPT-3 & fixed & 0.9243697478991597 & 0.9649851632047478 & 0.9642089920810204


#### Aspect Category (performance for each Aspect Category)

In [62]:
def category_list_to_label(cat_list):
    """Return a 0/1 vector over constants.ASPECT_CATEGORIES marking which occur in ``cat_list``."""
    # NOTE(review): this repeats the category_list_to_label definition from the
    # "Aspect Category" cell; one of the two could be dropped.
    label = []
    for known_category in constants.ASPECT_CATEGORIES:
        if known_category in cat_list:
            label.append(1)
        else:
            label.append(0)
    return label


# Precision / recall / accuracy for each single aspect category, per model and
# few-shot condition. The LLM label is the prediction, the human label the reference.
for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        paired = dataset[llm][fs]
        llm_annotations_aspect_categories = [
            category_list_to_label([tag["aspect_category"] for tag in example])
            for example in paired["llm_annotation"]
        ]
        human_annotations_aspect_categories = [
            category_list_to_label([tag["aspect_category"] for tag in example])
            for example in paired["human_annotation"]
        ]

        for idx, aspect_category in enumerate(constants.ASPECT_CATEGORIES):
            # Column idx of the multi-hot rows: the 0/1 labels for this category.
            llm_column = [example[idx] for example in llm_annotations_aspect_categories]
            human_column = [example[idx] for example in human_annotations_aspect_categories]

            # (llm label, human label) per example.
            outcomes = list(zip(llm_column, human_column))
            tp = outcomes.count((1, 1))
            fp = outcomes.count((1, 0))
            fn = outcomes.count((0, 1))

            # Both ratios fall back to 0 when their denominator is empty.
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            accuracy = accuracy_score(human_column, llm_column)
            # Number of examples the human annotators labelled with this category.
            n_samples_in_class = human_column.count(1)

            print(llm, "&", fs, "&", aspect_category, "&", precision,
                  "&", recall, "&", accuracy, "&", n_samples_in_class)

Llama70B & random & GENERAL-IMPRESSION & 0.46938775510204084 & 0.8734177215189873 & 0.8394160583941606 & 79
Llama70B & random & FOOD & 0.8540145985401459 & 0.6724137931034483 & 0.8594890510948905 & 174
Llama70B & random & SERVICE & 0.912751677852349 & 0.8292682926829268 & 0.9251824817518248 & 164
Llama70B & random & AMBIENCE & 0.86 & 0.8958333333333334 & 0.9343065693430657 & 144
Llama70B & random & PRICE & 0.851063829787234 & 0.975609756097561 & 0.9562043795620438 & 123
Llama70B & fixed & GENERAL-IMPRESSION & 0.46710526315789475 & 0.7634408602150538 & 0.8140794223826715 & 93
Llama70B & fixed & FOOD & 0.8791946308724832 & 0.6752577319587629 & 0.8537906137184116 & 194
Llama70B & fixed & SERVICE & 0.845679012345679 & 0.8616352201257862 & 0.9151624548736462 & 159
Llama70B & fixed & AMBIENCE & 0.815068493150685 & 0.7828947368421053 & 0.8916967509025271 & 152
Llama70B & fixed & PRICE & 0.8323353293413174 & 0.9144736842105263 & 0.9259927797833934 & 152
GPT-3 & random & GENERAL-IMPRESSION & 0.

#### Aspect Category + Sentiment Polarity

In [64]:
def category_polarity_list_to_label(cat_pol_list):
    """Encode "<category>_<polarity>" strings as a multi-hot vector.

    The result has one entry per item of AC_POLARITY_COMBINATIONS, in that
    order: 1 if the combination occurs in ``cat_pol_list``, otherwise 0.
    """
    return [int(combination in cat_pol_list) for combination in AC_POLARITY_COMBINATIONS]

In [72]:
# Agreement on (aspect category, polarity) pairs per model / few-shot
# condition, printed as "&"-separated fields.
for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        paired = dataset[llm][fs]

        # One multi-hot vector over AC_POLARITY_COMBINATIONS per example.
        llm_annotations_ac_pol = [
            category_polarity_list_to_label(
                [tag["aspect_category"]+"_"+tag["aspect_polarity"] for tag in example])
            for example in paired["llm_annotation"]
        ]
        human_annotations_ac_pol = [
            category_polarity_list_to_label(
                [tag["aspect_category"]+"_"+tag["aspect_polarity"] for tag in example])
            for example in paired["human_annotation"]
        ]

        # Human labels are the reference (y_true), LLM labels the prediction.
        y_true = human_annotations_ac_pol
        y_pred = llm_annotations_ac_pol
        accuracy = accuracy_score(y_true, y_pred)
        f1_micro = f1_score(y_true, y_pred, average='micro')
        f1_macro = f1_score(y_true, y_pred, average='macro')

        print(llm, "&", fs, "&", accuracy, "&", f1_micro, "&", f1_macro)

Llama70B & random & 0.48722627737226276 & 0.5718213058419245 & 0.5413943549657524
Llama70B & fixed & 0.427797833935018 & 0.5363408521303258 & 0.4974647635692283
GPT-3 & random & 0.7986577181208053 & 0.8453865336658354 & 0.8318841090991446
GPT-3 & fixed & 0.7831932773109244 & 0.8631221719457014 & 0.8542210405117728


In [75]:
def category_polarity_list_to_label(cat_polarity_list):
    """Return a 0/1 vector over AC_POLARITY_COMBINATIONS marking which occur in ``cat_polarity_list``."""
    # NOTE(review): same logic as the category_polarity_list_to_label defined a
    # few cells earlier (only the parameter name differs); one could be dropped.
    label = []
    for known_combination in AC_POLARITY_COMBINATIONS:
        if known_combination in cat_polarity_list:
            label.append(1)
        else:
            label.append(0)
    return label

# Precision / recall / accuracy for each single (aspect category, polarity)
# combination, per model and few-shot condition. The LLM label is the
# prediction, the human label the reference.
for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        llm_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"] + "_" + tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"] + "_" + tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        # enumerate yields the column index directly; no list search per combination.
        for idx, ac_pol_combination in enumerate(AC_POLARITY_COMBINATIONS):
            # Column idx of the multi-hot rows: the 0/1 labels for this combination.
            llm_column = [example[idx] for example in llm_annotations_ac_pol]
            human_column = [example[idx] for example in human_annotations_ac_pol]

            # (llm label, human label) per example.
            outcomes = list(zip(llm_column, human_column))
            tp = outcomes.count((1, 1))
            fp = outcomes.count((1, 0))
            fn = outcomes.count((0, 1))

            # Both ratios fall back to 0 when their denominator is empty.
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            accuracy = accuracy_score(human_column, llm_column)
            # Number of examples the human annotators labelled with this combination.
            n_samples_in_class = human_column.count(1)

            # Split on the LAST underscore only: the polarity is the final
            # component, so a category name containing "_" no longer breaks
            # the two-value unpacking.
            aspect_category, sentiment_polarity = ac_pol_combination.rsplit("_", 1)

            print(llm, "&", fs, "&", aspect_category, "&", sentiment_polarity, "&", precision,
                  "&", recall, "&", accuracy, "&", n_samples_in_class)


Llama70B & random & GENERAL-IMPRESSION & POSITIVE & 0.5777777777777777 & 0.5777777777777777 & 0.9306569343065694 & 45
Llama70B & random & GENERAL-IMPRESSION & NEUTRAL & 0.1076923076923077 & 0.7 & 0.8886861313868614 & 10
Llama70B & random & GENERAL-IMPRESSION & NEGATIVE & 0.3333333333333333 & 0.6666666666666666 & 0.927007299270073 & 24
Llama70B & random & FOOD & POSITIVE & 0.8 & 0.43478260869565216 & 0.8868613138686131 & 92
Llama70B & random & FOOD & NEUTRAL & 0.42857142857142855 & 0.65625 & 0.9288321167883211 & 32
Llama70B & random & FOOD & NEGATIVE & 0.813953488372093 & 0.660377358490566 & 0.9525547445255474 & 53
Llama70B & random & SERVICE & POSITIVE & 0.859375 & 0.5238095238095238 & 0.8923357664233577 & 105
Llama70B & random & SERVICE & NEUTRAL & 0.18181818181818182 & 0.9090909090909091 & 0.916058394160584 & 11
Llama70B & random & SERVICE & NEGATIVE & 0.85 & 0.6938775510204082 & 0.9616788321167883 & 49
Llama70B & random & AMBIENCE & POSITIVE & 0.8461538461538461 & 0.5432098765432098

#### Aspect Term Detection

In [85]:
def calculate_tp_tn_fp_fn_aspect_term(pred, label):
    """Count matching aspect-term spans between a prediction and a reference.

    Args:
        pred: Iterable of mappings with 'start' and 'end' (predicted spans).
        label: Iterable of mappings with 'start' and 'end' (reference spans).

    Returns:
        Tuple (tp, tn, fp, fn). Spans are compared as "<start>_<end>" strings
        held in sets, so repeated spans on one side count once. tn is always 0.
    """
    def span_keys(spans):
        # One "<start>_<end>" key per distinct span.
        return {f"{span['start']}_{span['end']}" for span in spans}

    predicted = span_keys(pred)
    expected = span_keys(label)

    tp = len(predicted & expected)   # spans present on both sides
    fp = len(predicted - expected)   # predicted only
    fn = len(expected - predicted)   # reference only

    return tp, 0, fp, fn


In [92]:
# Aspect-term detection: span-level accuracy / F1 / precision / recall per
# model and few-shot condition, pooled over all examples.
for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        # Keep only tags that carry an explicit aspect term (aspect_term is not None).
        llm_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["human_annotation"]]

        # Sum the per-example counts; the LLM spans are the prediction, the
        # human spans the reference.
        tp_total = tn_total = fp_total = fn_total = 0
        for llm_terms, human_terms in zip(llm_annotations_aspect_terms, human_annotations_aspect_terms):
            tp, tn, fp, fn = calculate_tp_tn_fp_fn_aspect_term(llm_terms, human_terms)
            tp_total += tp
            tn_total += tn
            fp_total += fp
            fn_total += fn

        # Calculate metrics; every ratio falls back to 0 on an empty denominator.
        n_total = tp_total + tn_total + fp_total + fn_total
        accuracy = (tp_total + tn_total) / n_total if n_total > 0 else 0
        precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
        recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0

        # F1 gets the same zero-denominator guard as the other metrics, so a
        # condition without any spans reports 0 instead of raising ZeroDivisionError.
        f1_denominator = 2 * tp_total + fn_total + fp_total
        f1 = 2 * tp_total / f1_denominator if f1_denominator > 0 else 0

        print(llm, "&", fs, "&", accuracy, "&",
              f1, "&", precision, "&", recall)

Llama70B & random & 0.7027417027417028 & 0.8254237288135593 & 0.8650088809946714 & 0.7893030794165316
Llama70B & fixed & 0.7067183462532299 & 0.8281604844814534 & 0.882258064516129 & 0.7803138373751783
GPT-3 & random & 0.5872395833333334 & 0.7399507793273175 & 0.9356846473029046 & 0.6119402985074627
GPT-3 & fixed & 0.62004662004662 & 0.7654676258992805 & 0.918825561312608 & 0.655980271270037


#### End-2-End ABSA

In [103]:
def calculate_tp_tn_fp_fn_e2e(pred, label):
    """Count matching (span, polarity) items between a prediction and a reference.

    Args:
        pred: Iterable of mappings with 'start', 'end' and 'polarity' (prediction).
        label: Iterable of mappings with 'start', 'end' and 'polarity' (reference).

    Returns:
        Tuple (tp, tn, fp, fn). Items are compared as
        "<start>_<end>_<polarity>" strings held in sets, so repeated items on
        one side count once. tn is always 0.
    """
    predicted = set()
    for item in pred:
        predicted.add("{}_{}_{}".format(item['start'], item['end'], item['polarity']))

    expected = set()
    for item in label:
        expected.add("{}_{}_{}".format(item['start'], item['end'], item['polarity']))

    matched = predicted.intersection(expected)
    tp = len(matched)
    fp = len(predicted.difference(matched))
    fn = len(expected.difference(matched))

    return tp, 0, fp, fn

In [105]:
# End-to-end ABSA: an item counts as correct only when span and polarity both
# match. Accuracy / F1 / precision / recall per model and few-shot condition,
# pooled over all examples.
for llm in LLMS:
    for fs in FEW_SHOT_CONDITIONS:
        # Keep only tags that carry an explicit aspect term (aspect_term is not None).
        llm_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"], "polarity": tag["aspect_polarity"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"], "polarity": tag["aspect_polarity"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["human_annotation"]]

        # Sum the per-example counts; the LLM items are the prediction, the
        # human items the reference.
        tp_total = tn_total = fp_total = fn_total = 0
        for llm_terms, human_terms in zip(llm_annotations_aspect_terms, human_annotations_aspect_terms):
            tp, tn, fp, fn = calculate_tp_tn_fp_fn_e2e(llm_terms, human_terms)
            tp_total += tp
            tn_total += tn
            fp_total += fp
            fn_total += fn

        # Calculate metrics; every ratio falls back to 0 on an empty denominator.
        n_total = tp_total + tn_total + fp_total + fn_total
        accuracy = (tp_total + tn_total) / n_total if n_total > 0 else 0
        precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
        recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0

        # F1 gets the same zero-denominator guard as the other metrics, so a
        # condition without any items reports 0 instead of raising ZeroDivisionError.
        f1_denominator = 2 * tp_total + fn_total + fp_total
        f1 = 2 * tp_total / f1_denominator if f1_denominator > 0 else 0

        print(llm, "&", fs, "&", accuracy, "&",
              f1, "&", precision, "&", recall)

Llama70B & random & 0.4107142857142857 & 0.5822784810126582 & 0.6095406360424028 & 0.5573505654281099
Llama70B & fixed & 0.4088983050847458 & 0.5804511278195489 & 0.6166134185303515 & 0.5482954545454546
GPT-3 & random & 0.5141800246609125 & 0.6791530944625407 & 0.8651452282157677 & 0.5589812332439679
GPT-3 & fixed & 0.5318681318681319 & 0.6944045911047346 & 0.8359240069084629 & 0.5938650306748466
