# Notebook: Compare LLM and Human Annotations

## Packages

In [154]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import numpy as np
import json
import sys
import os

In [155]:
# Put the training-code directory on the import path; the two imports below are
# presumably resolved from there (TODO confirm where TASD and constants live).
sys.path.append(os.path.abspath('../07 train models/'))
from TASD.evaluation import calculate_metrics_for_examples
import constants

## Settings

In [156]:
# Model identifiers: used as directory names when loading the synthetic splits and
# matched against each human-annotated example's "model" field.
LLMS = ["Llama70B", "GPT-3"]
# Few-shot conditions compared for every model (also directory names of the splits).
FEW_SHOT_CONDITIONS = ["fixed", "random"]

In [157]:
# LaTeX labels printed in the result tables for each model identifier.
LLMS_ENCODED = {"GPT-3": "\\textbf{GPT-3.5-turbo}", "Llama70B": "\\textbf{Llama-2-70B}"}
# LaTeX labels printed in the result tables for each few-shot condition.
ENCODE_CONDITION = {"fixed": "\\textbf{LRS\\textsubscript{25}}",
                    "random": "\\textbf{LRS\\textsubscript{500}}"}

## Code

### Load Data

#### Human Annotations

In [158]:
# Load the human-annotated examples. The encoding is given explicitly so that the
# JSON file is decoded the same way on every platform.
with open("annotation_datasets/annotated_synth_dataset.json", "r", encoding="utf-8") as json_file:
    human_annotations = json.load(json_file)

#### LLM Annotations (Synthetic Data)

In [159]:
# Flat list of every LLM-annotated example across all models, few-shot conditions
# and splits.
llm_annotations = []

for llm in LLMS:
    # NOTE: iterated as "random" then "fixed" (not in FEW_SHOT_CONDITIONS order);
    # kept as-is so the order of llm_annotations does not change.
    for fs in ["random", "fixed"]:
        for split_id in range(6):  # split_0.json ... split_5.json
            split_path = f"../07 train models/synth/{llm}/{fs}/split_{split_id}.json"
            # Explicit encoding so the JSON is decoded identically on every platform.
            with open(split_path, "r", encoding="utf-8") as json_file:
                llm_annotations.extend(json.load(json_file))

In [160]:
def _tags_to_aspects(tags):
    """Convert the raw ``tags`` of one example into the aspect dicts used in this notebook.

    Args:
        tags: iterable of dicts with the keys "label", "polarity", "text", "start", "end".

    Returns:
        A list with one dict per tag holding "aspect_category", "aspect_polarity",
        "aspect_term", "start" and "end".
    """
    return [
        {
            "aspect_category": tag["label"],
            "aspect_polarity": tag["polarity"],
            # The literal text 'NULL' marks a tag without an aspect term; store it as None.
            "aspect_term": tag["text"] if tag["text"] != 'NULL' else None,
            "start": tag["start"],
            "end": tag["end"],
        }
        for tag in tags
    ]


# One tuple per LLM-annotated example: (aspects, id).
llm_annotations_aspects = [
    (_tags_to_aspects(example["tags"]), example["id"])
    for example in llm_annotations
]
# One tuple per human-annotated example: (aspects, id, model, few-shot condition).
# "few_shot_condtion" is spelled exactly like the key that is read from the data.
human_annotations_aspects = [
    (_tags_to_aspects(example["tags"]), example["id"], example["model"], example["few_shot_condtion"])
    for example in human_annotations
]

In [161]:
# Nested container filled in the next cell: dataset[llm][few_shot_condition] holds the
# aligned "human_annotation" and "llm_annotation" lists.
dataset = {}

In [162]:
def get_example_with_id(id, dataset):
    """Return element 0 of the first entry in ``dataset`` whose element 1 equals ``id``.

    Args:
        id: identifier to look up (the name shadows the builtin; kept so existing
            keyword callers keep working).
        dataset: iterable of tuples shaped like ``(payload, id, ...)``.

    Returns:
        The payload (first element) of the first matching tuple.

    Raises:
        IndexError: if no entry carries the requested id.
    """
    for example in dataset:
        if example[1] == id:
            # Stop at the first hit instead of materialising every match.
            return example[0]
    raise IndexError(f"no example with id {id!r} in dataset")

# Align human and LLM annotations per model and few-shot condition.
for llm in LLMS:
    dataset[llm] = {}
    for fs in FEW_SHOT_CONDITIONS:
        condition_entry = dataset[llm][fs] = {}

        # Human-annotated examples that belong to this model / few-shot condition.
        selected = [example for example in human_annotations_aspects
                    if example[2] == llm and example[3] == fs]
        human_annotations_aspects_ids = [example[1] for example in selected]
        human_annotations_samples = [example[0] for example in selected]

        # Fetch the LLM annotation of the same example, in the same order.
        llm_annotations_samples = [
            get_example_with_id(example_id, llm_annotations_aspects)
            for example_id in human_annotations_aspects_ids
        ]

        condition_entry["human_annotation"] = human_annotations_samples
        condition_entry["llm_annotation"] = llm_annotations_samples

### Analyse Quality

#### Aspect Term Detection

In [163]:
def calculate_tp_tn_fp_fn_aspect_term(pred, label):
    """Count exact span matches between predicted and labelled aspect terms.

    Args:
        pred: iterable of dicts with "start" and "end" (predicted spans).
        label: iterable of dicts with "start" and "end" (reference spans).

    Returns:
        ``(tp, tn, fp, fn)`` where spans are compared as unique "start_end" keys;
        ``tn`` is always 0.
    """
    def span_keys(spans):
        # Duplicate spans collapse into a single key.
        return {f"{span['start']}_{span['end']}" for span in spans}

    predicted = span_keys(pred)
    expected = span_keys(label)
    matched = predicted & expected

    return len(matched), 0, len(predicted - matched), len(expected - matched)


In [164]:
# Print one LaTeX table row (F1, accuracy, precision, recall) per model and
# few-shot condition for aspect term detection.
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        # Reduce every example to the spans of its explicit aspect terms
        # (tags whose aspect_term is None are dropped).
        llm_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["human_annotation"]]

        # Sum the per-example counts: the LLM annotation is passed as the prediction,
        # the human annotation as the label.
        tp_total = tn_total = fp_total = fn_total = 0
        for i in range(len(human_annotations_aspect_terms)):
            tp, tn, fp, fn = calculate_tp_tn_fp_fn_aspect_term(
                llm_annotations_aspect_terms[i], human_annotations_aspect_terms[i])
            tp_total += tp
            tn_total += tn
            fp_total += fp
            fn_total += fn

        # Calculate metrics
        # tn is always 0 for span matching, so accuracy equals tp / (tp + fp + fn).
        accuracy = (tp_total + tn_total) / (tp_total + tn_total + fp_total +
                                            fn_total) if (tp_total + tn_total + fp_total + fn_total) > 0 else 0
        precision = tp_total / \
            (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
        recall = tp_total / \
            (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0

        # Guarded like the other metrics so an empty condition yields 0 instead of
        # raising ZeroDivisionError.
        f1_denominator = 2 * tp_total + fn_total + fp_total
        f1 = 2 * tp_total / f1_denominator if f1_denominator > 0 else 0

        # The model label spans both condition rows, so it is printed on the first row only.
        llm_print = "\\multirow{2}{*}{" + LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""

        fs_print = ENCODE_CONDITION[fs]

        print(llm_print, "&", fs_print,
              "&", "{:.3f}".format(f1),
              "&", "{:.3f}".format(accuracy),
              "&", "{:.3f}".format(precision),
              "&", "{:.3f}".format(recall), "\\\\")
    # Escaped backslash: "\h" is an invalid escape sequence.
    print("\\hline")

\multirow{2}{*}{\textbf{Llama-2-70B}} & \textbf{LRS\textsubscript{25}} & 0.808 & 0.678 & 0.847 & 0.774 \\
 & \textbf{LRS\textsubscript{500}} & 0.807 & 0.676 & 0.819 & 0.795 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 0.747 & 0.596 & 0.907 & 0.635 \\
 & \textbf{LRS\textsubscript{500}} & 0.738 & 0.585 & 0.936 & 0.609 \\
\hline


#### Aspect Category

In [165]:
def category_list_to_label(cat_list):
    """Encode a list of aspect categories as a multi-hot vector.

    Args:
        cat_list: container of aspect category names present in one example.

    Returns:
        A list with one entry per item of ``constants.ASPECT_CATEGORIES`` (same order):
        1 if that category is in ``cat_list``, otherwise 0.
    """
    return [int(cat in cat_list) for cat in constants.ASPECT_CATEGORIES]

In [166]:
# Print one LaTeX table row (micro F1, macro F1, accuracy) per model and few-shot
# condition for aspect category detection.
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        # Multi-hot category vector per example, for the LLM and the human annotation.
        llm_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        # Human annotation is passed as y_true, LLM annotation as y_pred. On multi-label
        # input accuracy_score is the exact-match (subset) accuracy per example.
        accuracy = accuracy_score(
            human_annotations_aspect_categories, llm_annotations_aspect_categories)
        # zero_division=0 matches the category+polarity evaluation: a category without
        # any positive sample scores 0 without emitting a warning.
        f1_micro = f1_score(human_annotations_aspect_categories,
                            llm_annotations_aspect_categories, average='micro', zero_division=0)
        f1_macro = f1_score(human_annotations_aspect_categories,
                            llm_annotations_aspect_categories, average='macro', zero_division=0)

        # The model label spans both condition rows, so it is printed on the first row only.
        llm_print = "\\multirow{2}{*}{" + \
            LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""

        print(llm_print, "&", ENCODE_CONDITION[fs],
              "&", "{:.3f}".format(f1_micro),
              "&", "{:.3f}".format(f1_macro),
              "&", "{:.3f}".format(accuracy), "\\\\")
    print("\\hline")

\multirow{2}{*}{\textbf{Llama-2-70B}} & \textbf{LRS\textsubscript{25}} & 0.743 & 0.734 & 0.593 \\
 & \textbf{LRS\textsubscript{500}} & 0.786 & 0.780 & 0.672 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 0.959 & 0.958 & 0.918 \\
 & \textbf{LRS\textsubscript{500}} & 0.951 & 0.951 & 0.907 \\
\hline


#### Aspect Category (performance for each Aspect Category)

In [167]:
def category_list_to_label(cat_list):
    """Encode a list of aspect categories as a multi-hot vector.

    NOTE(review): same definition as in the "Aspect Category" section; this cell
    re-defines it.

    Args:
        cat_list: container of aspect category names present in one example.

    Returns:
        A list with one entry per item of ``constants.ASPECT_CATEGORIES`` (same order):
        1 if that category is in ``cat_list``, otherwise 0.
    """
    return [int(cat in cat_list) for cat in constants.ASPECT_CATEGORIES]


# Print one LaTeX table row (F1, accuracy, precision, recall) per model, few-shot
# condition and aspect category.
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        # Multi-hot category vector per example, for the LLM and the human annotation.
        llm_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        for ac_idx, aspect_category in enumerate(constants.ASPECT_CATEGORIES):
            # The label vectors follow the order of constants.ASPECT_CATEGORIES, so the
            # enumerate index is the column of this category.
            idx = ac_idx

            # Binary column of this category: LLM = prediction, human = label.
            llm_column = [example[idx] for example in llm_annotations_aspect_categories]
            human_column = [example[idx] for example in human_annotations_aspect_categories]

            tp = sum(p == 1 and h == 1 for p, h in zip(llm_column, human_column))
            fp = sum(p == 1 and h == 0 for p, h in zip(llm_column, human_column))
            fn = sum(p == 0 and h == 1 for p, h in zip(llm_column, human_column))

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            accuracy = accuracy_score(human_column, llm_column)

            # Guarded like precision/recall: a category that neither annotation uses
            # scores 0 instead of raising ZeroDivisionError.
            f1_denominator = 2 * tp + fn + fp
            f1 = 2 * tp / f1_denominator if f1_denominator > 0 else 0

            # Number of examples the human annotation assigns to this category
            # (not part of the printed row).
            n_samples_in_class = sum(value == 1 for value in human_column)

            # Model and condition labels span several rows; print them only on the
            # first row of their group.
            llm_print = "\\multirow{10}{*}{" + \
                LLMS_ENCODED[llm] + "}" if fs_idx == 0 and ac_idx == 0 else ""
            fs_print = "\\multirow{5}{*}{" + \
                ENCODE_CONDITION[fs] + "}" if ac_idx == 0 else ""

            print(llm_print, "&", fs_print,
                  "&", "\\texttt{"+aspect_category+"}",
                  "&", "{:.3f}".format(f1),
                  "&", "{:.3f}".format(accuracy),
                  "&", "{:.3f}".format(precision),
                  "&", "{:.3f}".format(recall), "\\\\")

        # Escaped backslash before "cline": "\c" is an invalid escape sequence.
        print("\\arrayrulecolor{gray}\\cline{2-7}\\arrayrulecolor{black}")

    print("\\hline")

\multirow{10}{*}{\textbf{Llama-2-70B}} & \multirow{5}{*}{\textbf{LRS\textsubscript{25}}} & \texttt{GENERAL-IMPRESSION} & 0.511 & 0.780 & 0.390 & 0.742 \\
 &  & \texttt{FOOD} & 0.740 & 0.847 & 0.824 & 0.672 \\
 &  & \texttt{SERVICE} & 0.819 & 0.897 & 0.782 & 0.859 \\
 &  & \texttt{AMBIENCE} & 0.769 & 0.877 & 0.794 & 0.745 \\
 &  & \texttt{PRICE} & 0.832 & 0.910 & 0.761 & 0.918 \\
\arrayrulecolor{gray}\cline{2-7}\arrayrulecolor{black}
 & \multirow{5}{*}{\textbf{LRS\textsubscript{500}}} & \texttt{GENERAL-IMPRESSION} & 0.596 & 0.835 & 0.453 & 0.869 \\
 &  & \texttt{FOOD} & 0.734 & 0.852 & 0.804 & 0.676 \\
 &  & \texttt{SERVICE} & 0.853 & 0.920 & 0.858 & 0.848 \\
 &  & \texttt{AMBIENCE} & 0.847 & 0.923 & 0.825 & 0.870 \\
 &  & \texttt{PRICE} & 0.871 & 0.938 & 0.796 & 0.962 \\
\arrayrulecolor{gray}\cline{2-7}\arrayrulecolor{black}
\hline
\multirow{10}{*}{\textbf{GPT-3.5-turbo}} & \multirow{5}{*}{\textbf{LRS\textsubscript{25}}} & \texttt{GENERAL-IMPRESSION} & 0.932 & 0.962 & 0.893 & 0.975 \\


#### Aspect Category + Sentiment Polarity

In [168]:
# Every "<category>_<polarity>" label, grouped by category: each aspect category is
# followed by its four polarities in the order listed below.
AC_POLARITY_COMBINATIONS = [
    "_".join((cat, polarity))
    for cat in constants.ASPECT_CATEGORIES
    for polarity in ("POSITIVE", "NEGATIVE", "NEUTRAL", "CONFLICT")
]

In [169]:
def category_polarity_list_to_label(cat_pol_list):
    """Encode "<category>_<polarity>" labels as a multi-hot vector.

    Args:
        cat_pol_list: container of "<category>_<polarity>" strings of one example.

    Returns:
        A list with one entry per item of ``AC_POLARITY_COMBINATIONS`` (same order):
        1 if that combination is in ``cat_pol_list``, otherwise 0.
    """
    return [int(ac_pol in cat_pol_list) for ac_pol in AC_POLARITY_COMBINATIONS]

In [170]:
# Print one LaTeX table row (micro F1, macro F1, accuracy) per model and few-shot
# condition for the joint aspect category + sentiment polarity labels.
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        annotations = dataset[llm][fs]

        # Multi-hot "<category>_<polarity>" vector per example.
        llm_annotations_ac_pol = [
            category_polarity_list_to_label(
                [tag["aspect_category"]+"_"+tag["aspect_polarity"] for tag in example])
            for example in annotations["llm_annotation"]
        ]
        human_annotations_ac_pol = [
            category_polarity_list_to_label(
                [tag["aspect_category"]+"_"+tag["aspect_polarity"] for tag in example])
            for example in annotations["human_annotation"]
        ]

        # Human annotation is passed as y_true, LLM annotation as y_pred.
        accuracy = accuracy_score(human_annotations_ac_pol, llm_annotations_ac_pol)
        f1_micro = f1_score(human_annotations_ac_pol, llm_annotations_ac_pol,
                            average='micro', zero_division=0)
        f1_macro = f1_score(human_annotations_ac_pol, llm_annotations_ac_pol,
                            average='macro', zero_division=0)

        # The model label spans both condition rows, so only the first row carries it.
        if fs_idx == 0:
            llm_print = "\\multirow{2}{*}{" + LLMS_ENCODED[llm] + "}"
        else:
            llm_print = ""

        row = [llm_print,
               "&", ENCODE_CONDITION[fs],
               "&", f"{f1_micro:.3f}",
               "&", f"{f1_macro:.3f}",
               "&", f"{accuracy:.3f}", "\\\\"]
        print(*row)
    print("\\hline")

\multirow{2}{*}{\textbf{Llama-2-70B}} & \textbf{LRS\textsubscript{25}} & 0.508 & 0.356 & 0.388 \\
 & \textbf{LRS\textsubscript{500}} & 0.549 & 0.390 & 0.458 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 0.848 & 0.631 & 0.767 \\
 & \textbf{LRS\textsubscript{500}} & 0.839 & 0.621 & 0.788 \\
\hline


In [171]:
def category_polarity_list_to_label(cat_polarity_list):
    """Encode "<category>_<polarity>" labels as a multi-hot vector.

    NOTE(review): re-defines the function from the previous cells with the same
    logic but a different parameter name.

    Args:
        cat_polarity_list: container of "<category>_<polarity>" strings of one example.

    Returns:
        A list with one entry per item of ``AC_POLARITY_COMBINATIONS`` (same order):
        1 if that combination is in ``cat_polarity_list``, otherwise 0.
    """
    return [int(cat_polarity in cat_polarity_list) for cat_polarity in AC_POLARITY_COMBINATIONS]


# Print one LaTeX table row (F1, accuracy, precision, recall) per model, few-shot
# condition, aspect category and sentiment polarity. CONFLICT rows are not printed.
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        # Multi-hot "<category>_<polarity>" vector per example.
        llm_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"] + "_" + tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"] + "_" + tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        for ac_pol_idx, ac_pol_combination in enumerate(AC_POLARITY_COMBINATIONS):
            # The label vectors follow the order of AC_POLARITY_COMBINATIONS, so the
            # enumerate index is the column of this combination.
            idx = ac_pol_idx

            # Binary column of this combination: LLM = prediction, human = label.
            llm_column = [example[idx] for example in llm_annotations_ac_pol]
            human_column = [example[idx] for example in human_annotations_ac_pol]

            tp = sum(p == 1 and h == 1 for p, h in zip(llm_column, human_column))
            fp = sum(p == 1 and h == 0 for p, h in zip(llm_column, human_column))
            fn = sum(p == 0 and h == 1 for p, h in zip(llm_column, human_column))

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            accuracy = accuracy_score(human_column, llm_column)

            # Explicit guard instead of a bare except: only the empty-class case maps to 0,
            # any other error still surfaces.
            f1_denominator = 2 * tp + fn + fp
            f1 = 2 * tp / f1_denominator if f1_denominator > 0 else 0

            # Number of examples the human annotation assigns to this combination
            # (not part of the printed row).
            n_samples_in_class = sum(value == 1 for value in human_column)

            # Split on the last underscore only, so a category name containing "_"
            # would still unpack into exactly two parts.
            aspect_category, sentiment_polarity = ac_pol_combination.rsplit("_", 1)

            # Multi-row labels are printed only on the first row of their group.
            llm_print = "\\multirow{30}{*}{" + \
                LLMS_ENCODED[llm] + \
                "}" if fs_idx == 0 and ac_pol_idx == 0 else ""
            fs_print = "\\multirow{15}{*}{" + \
                ENCODE_CONDITION[fs] + "}" if ac_pol_idx == 0 else ""
            # Each category owns 4 consecutive entries (3 printed + CONFLICT), so a new
            # category starts at every multiple of 4; "% 3" put the label on wrong rows.
            ac_print = "\\multirow{3}{*}{" + "\\texttt{" + \
                aspect_category+"}" + "}" if ac_pol_idx % 4 == 0 else ""

            if sentiment_polarity != "CONFLICT":
                print(llm_print,
                      "&", fs_print,
                      "&", ac_print,
                      "&", "\\texttt{"+sentiment_polarity+"}",
                      "&", "{:.3f}".format(f1),
                      "&", "{:.3f}".format(accuracy),
                      "&", "{:.3f}".format(precision),
                      "&", "{:.3f}".format(recall), "\\\\")

            # Entry 2 of every group of 4 (NEUTRAL) is the last printed row of a
            # category: draw the separator after it.
            if ac_pol_idx % 4 == 2:
                # Escaped backslash before "cline": "\c" is an invalid escape sequence.
                print(
                    "\\arrayrulecolor{gray}\\cline{3-8}\\arrayrulecolor{black}")

        print("\\cline{2-8}")
    print("\\hline")

\multirow{30}{*}{\textbf{Llama-2-70B}} & \multirow{15}{*}{\textbf{LRS\textsubscript{25}}} & \multirow{3}{*}{\texttt{GENERAL-IMPRESSION}} & \texttt{POSITIVE} & 0.452 & 0.887 & 0.418 & 0.491 \\
 &  &  & \texttt{NEGATIVE} & 0.345 & 0.905 & 0.242 & 0.600 \\
 &  &  & \texttt{NEUTRAL} & 0.187 & 0.898 & 0.108 & 0.700 \\
\arrayrulecolor{gray}\cline{3-8}\arrayrulecolor{black}
 &  &  & \texttt{POSITIVE} & 0.592 & 0.878 & 0.779 & 0.477 \\
 &  &  & \texttt{NEGATIVE} & 0.577 & 0.922 & 0.653 & 0.516 \\
 &  & \multirow{3}{*}{\texttt{FOOD}} & \texttt{NEUTRAL} & 0.415 & 0.920 & 0.315 & 0.607 \\
\arrayrulecolor{gray}\cline{3-8}\arrayrulecolor{black}
 &  &  & \texttt{POSITIVE} & 0.607 & 0.905 & 0.746 & 0.512 \\
 &  & \multirow{3}{*}{\texttt{SERVICE}} & \texttt{NEGATIVE} & 0.721 & 0.943 & 0.733 & 0.710 \\
 &  &  & \texttt{NEUTRAL} & 0.275 & 0.903 & 0.159 & 1.000 \\
\arrayrulecolor{gray}\cline{3-8}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{AMBIENCE}} & \texttt{POSITIVE} & 0.496 & 0.882 & 0.700 & 

#### Aspect Term + Sentiment Polarity

In [172]:
def calculate_tp_tn_fp_fn_e2e(pred, label):
    """Count exact (span, polarity) matches between predicted and labelled aspect terms.

    Args:
        pred: iterable of dicts with "start", "end" and "polarity" (predictions).
        label: iterable of dicts with "start", "end" and "polarity" (references).

    Returns:
        ``(tp, tn, fp, fn)`` where items are compared as unique "start_end_polarity"
        keys; ``tn`` is always 0.
    """
    def item_keys(items):
        # Duplicate items collapse into a single key.
        return {f"{item['start']}_{item['end']}_{item['polarity']}" for item in items}

    predicted = item_keys(pred)
    expected = item_keys(label)
    matched = predicted & expected

    return len(matched), 0, len(predicted - matched), len(expected - matched)

In [173]:
def calculate_tp_tn_fp_fn_e2e_total(pred_total, label_total):
    """Sum the per-example (tp, tn, fp, fn) counts over a whole set of examples.

    Args:
        pred_total: indexable sequence of per-example prediction lists.
        label_total: sequence of per-example label lists; its length drives the loop.

    Returns:
        ``(tp_total, tn_total, fp_total, fn_total)`` accumulated with
        ``calculate_tp_tn_fp_fn_e2e`` over every index of ``label_total``.
    """
    totals = [0, 0, 0, 0]
    for position, expected in enumerate(label_total):
        tp, tn, fp, fn = calculate_tp_tn_fp_fn_e2e(pred_total[position], expected)
        for slot, count in enumerate((tp, tn, fp, fn)):
            totals[slot] += count
    return tuple(totals)

In [174]:
def f1_macro_e2e(llm_annotations_aspect_terms, human_annotations_aspect_terms):
    """Macro-averaged F1 over the polarity classes for (span, polarity) items.

    For each of POSITIVE, NEGATIVE, NEUTRAL and CONFLICT, both annotation sets are
    restricted to the tags of that polarity, an F1 score is computed from the summed
    counts, and the four scores are averaged with equal weight.

    Args:
        llm_annotations_aspect_terms: per-example lists of dicts with "start", "end"
            and "polarity"; passed as the predictions.
        human_annotations_aspect_terms: per-example lists of the same shape; passed
            as the labels.

    Returns:
        The mean of the four per-polarity F1 scores. A polarity without any tag in
        either annotation set contributes 0.
    """
    f1_scores = []
    for pol in ["POSITIVE", "NEGATIVE", "NEUTRAL", "CONFLICT"]:
        llm_annotations_class = [[tag for tag in example if tag["polarity"] == pol]
                                 for example in llm_annotations_aspect_terms]
        human_annotations_class = [[tag for tag in example if tag["polarity"] == pol]
                                   for example in human_annotations_aspect_terms]

        tp_total, _, fp_total, fn_total = calculate_tp_tn_fp_fn_e2e_total(
            llm_annotations_class, human_annotations_class)

        # Guard the division: a class that is absent from both sides would otherwise
        # raise ZeroDivisionError.
        denominator = 2 * tp_total + fn_total + fp_total
        f1_scores.append(2 * tp_total / denominator if denominator > 0 else 0)
    return np.mean(f1_scores)

In [175]:
# Print one LaTeX table row (micro F1, macro F1, accuracy) per model and few-shot
# condition for the joint aspect term + sentiment polarity evaluation.
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        # Reduce every example to (span, polarity) items of its explicit aspect terms
        # (tags whose aspect_term is None are dropped).
        llm_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"], "polarity": tag["aspect_polarity"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"], "polarity": tag["aspect_polarity"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["human_annotation"]]

        # LLM annotation is passed as the prediction, human annotation as the label.
        tp_total, tn_total, fp_total, fn_total = calculate_tp_tn_fp_fn_e2e_total(
            llm_annotations_aspect_terms, human_annotations_aspect_terms)

        # Calculate metrics
        accuracy = (tp_total + tn_total) / (tp_total + tn_total + fp_total +
                                            fn_total) if (tp_total + tn_total + fp_total + fn_total) > 0 else 0
        # precision and recall are computed but not part of the printed row.
        precision = tp_total / \
            (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
        recall = tp_total / \
            (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0

        # Guarded like the other metrics so an empty condition yields 0 instead of
        # raising ZeroDivisionError.
        f1_denominator = 2 * tp_total + fn_total + fp_total
        f1_micro = 2 * tp_total / f1_denominator if f1_denominator > 0 else 0

        f1_macro = f1_macro_e2e(
            llm_annotations_aspect_terms, human_annotations_aspect_terms)

        # The model label spans both condition rows, so it is printed on the first row only.
        llm_print = "\\multirow{2}{*}{" + \
            LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""

        print(llm_print, "&", ENCODE_CONDITION[fs],
              "&", "{:.3f}".format(f1_micro),
              "&", "{:.3f}".format(f1_macro),
              "&", "{:.3f}".format(accuracy), "\\\\")
    # Escaped backslash: "\h" is an invalid escape sequence.
    print("\\hline")

\multirow{2}{*}{\textbf{Llama-2-70B}} & \textbf{LRS\textsubscript{25}} & 0.557 & 0.392 & 0.386 \\
 & \textbf{LRS\textsubscript{500}} & 0.564 & 0.405 & 0.393 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 0.681 & 0.507 & 0.516 \\
 & \textbf{LRS\textsubscript{500}} & 0.679 & 0.506 & 0.513 \\
\hline


#### Aspect Term + Aspect Category + Sentiment Polarity

In [176]:
# Print one LaTeX table row (F1, accuracy, precision, recall) per model and few-shot
# condition for full triplets, using the metrics returned by
# calculate_metrics_for_examples.
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        annotations = dataset[llm][fs]
        # Human annotation is passed first, LLM annotation second
        # (presumably label then prediction - TODO confirm against TASD.evaluation).
        metrics_triplets = calculate_metrics_for_examples(
            annotations["human_annotation"], annotations["llm_annotation"])

        # The model label spans both condition rows, so only the first row carries it.
        if fs_idx == 0:
            llm_print = "\\multirow{2}{*}{" + LLMS_ENCODED[llm] + "}"
        else:
            llm_print = ""

        row = [llm_print, "&", ENCODE_CONDITION[fs]]
        for metric_name in ("f1", "accuracy", "precision", "recall"):
            row += ["&", "{:.3f}".format(metrics_triplets[metric_name])]
        row.append("\\\\")
        print(*row)

    print("\\hline")

\multirow{2}{*}{\textbf{Llama-2-70B}} & \textbf{LRS\textsubscript{25}} & 0.408 & 0.257 & 0.375 & 0.448 \\
 & \textbf{LRS\textsubscript{500}} & 0.448 & 0.289 & 0.409 & 0.496 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 0.566 & 0.395 & 0.552 & 0.581 \\
 & \textbf{LRS\textsubscript{500}} & 0.569 & 0.397 & 0.562 & 0.575 \\
\hline
