# Notebook: Compare LLM and Human Annotations

## Packages

In [24]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import numpy as np
import json
import sys
import os

In [25]:
sys.path.append(os.path.abspath('../07 train models/'))
from TASD.evaluation import calculate_metrics_for_examples
import constants

## Settings

In [26]:
LLMS = ["Llama3_70B", "GPT-3"]
FEW_SHOT_CONDITIONS = ["fixed", "random"]

In [27]:
LLMS_ENCODED = {"GPT-3": "\\textbf{GPT-3.5-turbo}", "Llama3_70B": "\\textbf{Llama-3-70B}"}
ENCODE_CONDITION = {"fixed": "\\textbf{LRS\\textsubscript{25}}",
                    "random": "\\textbf{LRS\\textsubscript{500}}"}

## Code

### Load Data

#### Human Annotations

In [28]:
with open(f"annotation_datasets/annotated_synth_dataset.json", 'r') as json_file:
    human_annotations = json.load(json_file)    

#### Load Synthetic 

In [29]:
llm_annotations = []

for llm in LLMS:
    for fs in ["random", "fixed"]:
       for split_id in range(6):
           with open(f"../07 train models/synth/{llm}/{fs}/split_{split_id}.json", 'r') as json_file:
              synthetic_data_split = json.load(json_file)
              for example in  synthetic_data_split:
                  llm_annotations.append(example)    

In [30]:
llm_annotations_aspects = [([{"aspect_category": tag["label"], "aspect_polarity": tag["polarity"],
                              "aspect_term": tag["text"] if tag["text"] != 'NULL' else None, "start": tag["start"], "end": tag["end"]} for tag in example["tags"]], example["id"]) for example in llm_annotations]
human_annotations_aspects = [([{"aspect_category": tag["label"], "aspect_polarity": tag["polarity"], "aspect_term": tag["text"]
                                if tag["text"] != 'NULL' else None, "start": tag["start"], "end": tag["end"]} for tag in example["tags"]], example["id"], example["model"], example["few_shot_condtion"]) for example in human_annotations]

In [31]:
dataset = {}

In [32]:
def get_example_with_id(id, dataset):
    return [example for example in dataset if example[1] == id][0][0]

for llm in LLMS:
    dataset[llm] = {}
    for fs in FEW_SHOT_CONDITIONS:
        dataset[llm][fs] = {}
        human_annotations_aspects_ids = [example[1] for example in human_annotations_aspects if example[2] == llm and example[3] == fs]
        human_annotations_samples = [example[0] for example in human_annotations_aspects if example[2] == llm and example[3] == fs]
        llm_annotations_samples = [get_example_with_id(id, llm_annotations_aspects) for id in human_annotations_aspects_ids]

        dataset[llm][fs]["human_annotation"] = human_annotations_samples
        dataset[llm][fs]["llm_annotation"] = llm_annotations_samples

### Analyse Quality

#### Aspect Term Detection

In [33]:
def calculate_tp_tn_fp_fn_aspect_term(pred, label):
    pred_set = set(
        f"{range['start']}_{range['end']}" for range in pred)
    label_set = set(
        f"{range['start']}_{range['end']}" for range in label)

    tp_set = pred_set & label_set
    tp = len(tp_set)

    fp_set = pred_set - tp_set
    fp = len(fp_set)

    fn_set = label_set - tp_set
    fn = len(fn_set)

    return tp, 0, fp, fn


In [34]:
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        llm_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["human_annotation"]]

        tp_total = tn_total = fp_total = fn_total = 0
        for i in range(len(human_annotations_aspect_terms)):
            tp, tn, fp, fn = calculate_tp_tn_fp_fn_aspect_term(
                llm_annotations_aspect_terms[i], human_annotations_aspect_terms[i])
            tp_total += tp
            tn_total += tn
            fp_total += fp
            fn_total += fn

        # Calculate metrics
        accuracy = (tp_total + tn_total) / (tp_total + tn_total + fp_total +
                                            fn_total) if (tp_total + tn_total + fp_total + fn_total) > 0 else 0
        precision = tp_total / \
            (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
        recall = tp_total / \
            (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0

        f1 = 2 * tp_total / (2 * tp_total + fn_total + fp_total)

        llm_print = "\\multirow{2}{*}{" + \
            LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""

        fs_print = ENCODE_CONDITION[fs]

        print(llm_print, "&", fs_print,
              "&", "{:.2f}".format(f1*100),
              "&", "{:.2f}".format(precision*100),
              "&", "{:.2f}".format(recall*100), "\\\\")
    print("\hline")

\multirow{2}{*}{\textbf{Llama-3-70B}} & \textbf{LRS\textsubscript{25}} & 84.12 & 89.02 & 79.74 \\
 & \textbf{LRS\textsubscript{500}} & 81.52 & 88.65 & 75.46 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 75.11 & 91.44 & 63.72 \\
 & \textbf{LRS\textsubscript{500}} & 72.61 & 92.44 & 59.78 \\
\hline


#### Aspect Category

In [53]:
def category_list_to_label(cat_list):
    return [1 if cat in cat_list else 0 for cat in constants.ASPECT_CATEGORIES]

In [55]:
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        llm_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        accuracy = accuracy_score(
            human_annotations_aspect_categories, llm_annotations_aspect_categories)
        f1_micro = f1_score(human_annotations_aspect_categories,
                            llm_annotations_aspect_categories, average='micro')
        f1_macro = f1_score(human_annotations_aspect_categories,
                            llm_annotations_aspect_categories, average='macro')

        llm_print = "\\multirow{2}{*}{" + \
            LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""

        print(llm_print, "&", ENCODE_CONDITION[fs],
              "&", "{:.2f}".format(f1_micro*100),
              "&", "{:.2f}".format(f1_macro*100), "\\\\")
    print("\\hline")

\multirow{2}{*}{\textbf{Llama-3-70B}} & \textbf{LRS\textsubscript{25}} & 85.90 & 85.26 \\
 & \textbf{LRS\textsubscript{500}} & 85.86 & 85.53 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 95.56 & 95.42 \\
 & \textbf{LRS\textsubscript{500}} & 95.60 & 95.61 \\
\hline


#### Aspect Category (performance for each Aspect Category)

In [37]:
def category_list_to_label(cat_list):
    return [1 if cat in cat_list else 0 for cat in constants.ASPECT_CATEGORIES]


for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        llm_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_categories = [category_list_to_label(
            [tag["aspect_category"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        for ac_idx, aspect_category in enumerate(constants.ASPECT_CATEGORIES):
            idx = constants.ASPECT_CATEGORIES.index(aspect_category)

            tp = sum((llm_annotations_aspect_categories[i][idx] == 1) and (
                human_annotations_aspect_categories[i][idx] == 1) for i in range(len(llm_annotations_aspect_categories)))
            fp = sum((llm_annotations_aspect_categories[i][idx] == 1) and (
                human_annotations_aspect_categories[i][idx] == 0) for i in range(len(llm_annotations_aspect_categories)))
            fn = sum((llm_annotations_aspect_categories[i][idx] == 0) and (
                human_annotations_aspect_categories[i][idx] == 1) for i in range(len(llm_annotations_aspect_categories)))

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            accuracy = accuracy_score([example[idx] for example in human_annotations_aspect_categories], [
                                      example[idx] for example in llm_annotations_aspect_categories])

            f1 = 2 * tp / (2 * tp + fn + fp)

            n_samples_in_class = sum(
                example[idx] == 1 for example in human_annotations_aspect_categories)

            llm_print = "\\multirow{10}{*}{" + \
                LLMS_ENCODED[llm] + "}" if fs_idx == 0 and ac_idx == 0 else ""
            fs_print = "\\multirow{5}{*}{" + \
                ENCODE_CONDITION[fs] + "}" if ac_idx == 0 else ""

            print(llm_print, "&", fs_print,
                  "&", "\\texttt{"+aspect_category+"}",
                  "&", "{:.2f}".format(f1*100),
                  "&", "{:.2f}".format(accuracy*100),
                  "&", "{:.2f}".format(precision*100),
                  "&", "{:.2f}".format(recall*100), "\\\\")

        print("\\arrayrulecolor{gray}\cline{2-7}\\arrayrulecolor{black}")

    print("\\hline")

\multirow{10}{*}{\textbf{Llama-3-70B}} & \multirow{5}{*}{\textbf{LRS\textsubscript{25}}} & \texttt{GENERAL-IMPRESSION} & 68.99 & 85.17 & 53.80 & 96.12 \\
 &  & \texttt{FOOD} & 85.25 & 91.00 & 91.76 & 79.59 \\
 &  & \texttt{SERVICE} & 92.26 & 95.50 & 94.71 & 89.94 \\
 &  & \texttt{AMBIENCE} & 86.16 & 92.67 & 90.13 & 82.53 \\
 &  & \texttt{PRICE} & 93.63 & 96.17 & 92.86 & 94.41 \\
\arrayrulecolor{gray}\cline{2-7}\arrayrulecolor{black}
 & \multirow{5}{*}{\textbf{LRS\textsubscript{500}}} & \texttt{GENERAL-IMPRESSION} & 75.27 & 88.50 & 63.64 & 92.11 \\
 &  & \texttt{FOOD} & 83.08 & 90.83 & 88.82 & 78.03 \\
 &  & \texttt{SERVICE} & 91.07 & 95.00 & 95.62 & 86.93 \\
 &  & \texttt{AMBIENCE} & 86.21 & 93.33 & 88.65 & 83.89 \\
 &  & \texttt{PRICE} & 92.02 & 95.67 & 91.46 & 92.59 \\
\arrayrulecolor{gray}\cline{2-7}\arrayrulecolor{black}
\hline
\multirow{10}{*}{\textbf{GPT-3.5-turbo}} & \multirow{5}{*}{\textbf{LRS\textsubscript{25}}} & \texttt{GENERAL-IMPRESSION} & 92.07 & 95.67 & 87.28 & 97.42 \\


#### Aspect Category + Sentiment Polarity

In [38]:
AC_POLARITY_COMBINATIONS = [cat+"_"+polarity for cat in constants.ASPECT_CATEGORIES for polarity in ["POSITIVE", "NEGATIVE", "NEUTRAL", "CONFLICT"]]

In [39]:
def category_polarity_list_to_label(cat_pol_list):
    return [1 if ac_pol in cat_pol_list else 0 for ac_pol in AC_POLARITY_COMBINATIONS]

In [59]:
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        llm_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"]+"_"+tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"]+"_"+tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        accuracy = accuracy_score(
            human_annotations_ac_pol, llm_annotations_ac_pol)

        f1_micro = f1_score(human_annotations_ac_pol,
                            llm_annotations_ac_pol, average='micro', zero_division=0)

        f1_macro_dedicated = []
        for i, aspect_category_sentiment in enumerate(constants.ASPECT_CATEGORY_POLARITIES):
            class_labels = [label[i] for label in human_annotations_ac_pol]
            class_predictions = [prediction[i] for prediction in llm_annotations_ac_pol]

            f1 = f1_score(class_labels, class_predictions, zero_division=0)

            if all(el == 0 for el in class_labels):
                pass
            else:
                f1_macro_dedicated.append(f1)
        f1_macro = np.mean(f1_macro_dedicated)

        llm_print = "\\multirow{2}{*}{" + \
            LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""

        print(llm_print,
              "&", ENCODE_CONDITION[fs],
              "&", "{:.2f}".format(f1_micro*100),
              "&", "{:.2f}".format(f1_macro*100), "\\\\")
    print("\\hline")

\multirow{2}{*}{\textbf{Llama-3-70B}} & \textbf{LRS\textsubscript{25}} & 58.98 & 49.72 \\
 & \textbf{LRS\textsubscript{500}} & 61.68 & 48.67 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 84.37 & 71.46 \\
 & \textbf{LRS\textsubscript{500}} & 83.66 & 77.60 \\
\hline


In [41]:
def category_polarity_list_to_label(cat_polarity_list):
    return [1 if cat_polarity in cat_polarity_list else 0 for cat_polarity in AC_POLARITY_COMBINATIONS]


for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        llm_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"] + "_" + tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_ac_pol = [category_polarity_list_to_label(
            [tag["aspect_category"] + "_" + tag["aspect_polarity"] for tag in example]) for example in dataset[llm][fs]["human_annotation"]]

        for ac_pol_idx, ac_pol_combination in enumerate(AC_POLARITY_COMBINATIONS):
            idx = AC_POLARITY_COMBINATIONS.index(ac_pol_combination)

            tp = sum((llm_annotations_ac_pol[i][idx] == 1) and (
                human_annotations_ac_pol[i][idx] == 1) for i in range(len(llm_annotations_ac_pol)))
            fp = sum((llm_annotations_ac_pol[i][idx] == 1) and (
                human_annotations_ac_pol[i][idx] == 0) for i in range(len(llm_annotations_ac_pol)))
            fn = sum((llm_annotations_ac_pol[i][idx] == 0) and (
                human_annotations_ac_pol[i][idx] == 1) for i in range(len(llm_annotations_ac_pol)))

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            accuracy = accuracy_score([example[idx] for example in human_annotations_ac_pol], [
                                      example[idx] for example in llm_annotations_ac_pol])
            try:
                f1 = 2 * tp / (2 * tp + fn + fp)
            except:
                f1 = 0

            n_samples_in_class = sum(
                example[idx] == 1 for example in human_annotations_ac_pol)

            aspect_category, sentiment_polarity = ac_pol_combination.split("_")

            llm_print = "\\multirow{30}{*}{" + \
                LLMS_ENCODED[llm] + \
                "}" if fs_idx == 0 and ac_pol_idx == 0 else ""
            fs_print = "\\multirow{15}{*}{" + \
                ENCODE_CONDITION[fs] + "}" if ac_pol_idx == 0 else ""
            ac_print = "\\multirow{3}{*}{" + "\\texttt{" + \
                aspect_category+"}" + "}" if ac_pol_idx % 3 == 0 else ""

            if sentiment_polarity != "CONFLICT":
                print(llm_print,
                      "&", fs_print,
                      "&", ac_print,
                      "&", "\\texttt{"+sentiment_polarity+"}",
                      "&", "{:.2f}".format(f1*100),
                      "&", "{:.2f}".format(accuracy*100),
                      "&", "{:.2f}".format(precision*100),
                      "&", "{:.2f}".format(recall*100), "\\\\")

            if ac_pol_idx % 4 == 2:
                print(
                    "\\arrayrulecolor{gray}\cline{3-8}\\arrayrulecolor{black}")

        print("\\cline{2-8}")
    print("\\hline")

\multirow{30}{*}{\textbf{Llama-3-70B}} & \multirow{15}{*}{\textbf{LRS\textsubscript{25}}} & \multirow{3}{*}{\texttt{GENERAL-IMPRESSION}} & \texttt{POSITIVE} & 66.13 & 93.00 & 58.57 & 75.93 \\
 &  &  & \texttt{NEGATIVE} & 57.14 & 92.50 & 44.78 & 78.95 \\
 &  &  & \texttt{NEUTRAL} & 26.67 & 90.83 & 15.87 & 83.33 \\
\arrayrulecolor{gray}\cline{3-8}\arrayrulecolor{black}
 &  &  & \texttt{POSITIVE} & 67.01 & 89.33 & 89.04 & 53.72 \\
 &  &  & \texttt{NEGATIVE} & 69.81 & 94.67 & 72.55 & 67.27 \\
 &  & \multirow{3}{*}{\texttt{FOOD}} & \texttt{NEUTRAL} & 43.18 & 91.67 & 31.67 & 67.86 \\
\arrayrulecolor{gray}\cline{3-8}\arrayrulecolor{black}
 &  &  & \texttt{POSITIVE} & 63.95 & 89.67 & 94.83 & 48.25 \\
 &  & \multirow{3}{*}{\texttt{SERVICE}} & \texttt{NEGATIVE} & 83.17 & 97.17 & 85.71 & 80.77 \\
 &  &  & \texttt{NEUTRAL} & 23.08 & 90.00 & 13.04 & 100.00 \\
\arrayrulecolor{gray}\cline{3-8}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{AMBIENCE}} & \texttt{POSITIVE} & 59.21 & 89.67 & 86.54 &

#### Aspect Term + Sentiment Polarity

In [47]:
def calculate_tp_tn_fp_fn_e2e(pred, label):
    pred_set = set(
        f"{range['start']}_{range['end']}_{range['polarity']}" for range in pred)
    label_set = set(
        f"{range['start']}_{range['end']}_{range['polarity']}" for range in label)

    tp_set = pred_set & label_set
    tp = len(tp_set)

    fp_set = pred_set - tp_set
    fp = len(fp_set)

    fn_set = label_set - tp_set
    fn = len(fn_set)

    return tp, 0, fp, fn

In [48]:
def calculate_tp_tn_fp_fn_e2e_total(pred_total, label_total):
    tp_total = tn_total = fp_total = fn_total = 0
    for i in range(len(label_total)):
        tp, tn, fp, fn = calculate_tp_tn_fp_fn_e2e(
            pred_total[i], label_total[i])
        tp_total += tp
        tn_total += tn
        fp_total += fp
        fn_total += fn
    return tp_total, tn_total, fp_total, fn_total

In [49]:
def f1_macro_e2e(llm_annotations_aspect_terms, human_annotations_aspect_terms):
    f1_scores = []
    for pol in ["POSITIVE", "NEGATIVE", "NEUTRAL", "CONFLICT"]:
        llm_annotations_class = [[tag for tag in example if tag["polarity"] == pol]
                                 for example in llm_annotations_aspect_terms]
        human_annotations_class = [[tag for tag in example if tag["polarity"] == pol]
                                   for example in human_annotations_aspect_terms]
        
        tp_total, tn_total, fp_total, fn_total = calculate_tp_tn_fp_fn_e2e_total(llm_annotations_class, human_annotations_class)
        f1_scores.append(2 * tp_total / (2 * tp_total + fn_total + fp_total))
    return np.mean(f1_scores)

In [52]:
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        llm_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"], "polarity": tag["aspect_polarity"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["llm_annotation"]]
        human_annotations_aspect_terms = [
            [{"start": tag["start"], "end": tag["end"], "polarity": tag["aspect_polarity"]} for tag in example if tag["aspect_term"] is not None] for example in dataset[llm][fs]["human_annotation"]]

        tp_total, tn_total, fp_total, fn_total = calculate_tp_tn_fp_fn_e2e_total(
            llm_annotations_aspect_terms, human_annotations_aspect_terms)

        # Calculate metrics
        accuracy = (tp_total + tn_total) / (tp_total + tn_total + fp_total +
                                            fn_total) if (tp_total + tn_total + fp_total + fn_total) > 0 else 0
        precision = tp_total / \
            (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
        recall = tp_total / \
            (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0

        f1_micro = 2 * tp_total / (2 * tp_total + fn_total + fp_total)

        f1_macro = f1_macro_e2e(
            llm_annotations_aspect_terms, human_annotations_aspect_terms)

        llm_print = "\\multirow{2}{*}{" + \
            LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""

        print(llm_print, "&", ENCODE_CONDITION[fs],
              "&", "{:.2f}".format(f1_micro*100),
              "&", "{:.2f}".format(f1_macro*100), "\\\\")
    print("\hline")

\multirow{2}{*}{\textbf{Llama-3-70B}} & \textbf{LRS\textsubscript{25}} & 55.73 & 39.55 \\
 & \textbf{LRS\textsubscript{500}} & 56.05 & 40.11 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 67.23 & 50.28 \\
 & \textbf{LRS\textsubscript{500}} & 66.12 & 49.38 \\
\hline


#### Aspect Term + Aspect Category + Sentiment Polarity

In [73]:
def compute_f1_macro(preds, labels):

    f1_scores = []

    for comb in [(ac, pol) for ac in constants.ASPECT_CATEGORIES for pol in constants.POLARITIES]:
        aspect_category = comb[0]
        polarity = comb[1]

        pred_tuples_ac_pol = [[asp for asp in example if asp["aspect_category"] ==
                               aspect_category and asp["aspect_polarity"] == polarity] for example in preds]
        labels_tuples_ac_pol = [[asp for asp in example if asp["aspect_category"] ==
                                 aspect_category and asp["aspect_polarity"] == polarity] for example in labels]

        n_examples = sum([len(asp) for asp in labels_tuples_ac_pol])

        ac_pol_metrics = calculate_metrics_for_examples(
            labels_tuples_ac_pol, pred_tuples_ac_pol)
        
        if n_examples > 0:
             f1_scores.append(ac_pol_metrics["f1"])

    return np.mean(f1_scores)

In [74]:
for llm_idx, llm in enumerate(LLMS):
    for fs_idx, fs in enumerate(FEW_SHOT_CONDITIONS):
        metrics_triplets = calculate_metrics_for_examples(
            dataset[llm][fs]["human_annotation"], dataset[llm][fs]["llm_annotation"])

        f1_macro = compute_f1_macro(dataset[llm][fs]["llm_annotation"], dataset[llm][fs]["human_annotation"])

        llm_print = "\\multirow{2}{*}{" + \
            LLMS_ENCODED[llm] + "}" if fs_idx == 0 else ""
        print(llm_print,
              "&", ENCODE_CONDITION[fs],
              "&", "{:.2f}".format(metrics_triplets["f1"] * 100),
              "&", "{:.2f}".format(f1_macro * 100), "\\\\")

    print("\\hline")

\multirow{2}{*}{\textbf{Llama-3-70B}} & \textbf{LRS\textsubscript{25}} & 46.71 & 43.94 \\
 & \textbf{LRS\textsubscript{500}} & 47.98 & 45.02 \\
\hline
\multirow{2}{*}{\textbf{GPT-3.5-turbo}} & \textbf{LRS\textsubscript{25}} & 55.92 & 55.30 \\
 & \textbf{LRS\textsubscript{500}} & 55.38 & 54.71 \\
\hline
