# Notebook: Analyse Language


## Packages


In [1]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from spacy.lang.de.stop_words import STOP_WORDS
from collections import Counter
import Levenshtein
import numpy as np
import random
import string
import spacy
import nltk
import json

## Constants


In [14]:
# Aspect categories in the order used for the per-category table rows.
ASPECT_CATEGORIES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]
# Data-source and few-shot-condition keys; they also appear as directory
# names in the paths the synthetic splits are loaded from.
LLMS = ["Llama70B", "GPT-3"]
FS_CONDITIONS = ["fixed", "random"]
# Tag field that the stratification labels are read from.
CRITERIA_RS = "tag_with_polarity"
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
MENTIONING_TYPE = ["implicit", "explicit"]
# Every "<ASPECT>-<POLARITY>" label. The aspect order here differs from
# ASPECT_CATEGORIES and determines the column order of the one-hot vectors.
COMBINATIONS = [
    "-".join((aspect, polarity))
    for aspect in ("SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE")
    for polarity in POLARITIES
]
RANDOM_STATE = 43

In [3]:
# LaTeX cell text printed for each data source and few-shot condition.
LLMS_ENCODED = {
    "GPT-3": r"\textbf{GPT-3.5-turbo}",
    "Llama70B": r"\textbf{Llama-2-70B}",
}
ENCODE_CONDITION = {
    "fixed": r"\textbf{LRS\textsubscript{25}}",
    "random": r"\textbf{LRS\textsubscript{500}}",
}

## Settings


In [4]:
# spaCy pipeline applied to every example text when the datasets are loaded.
nlp = spacy.load("de_core_news_lg")
# Fetch the NLTK "punkt" resource.
nltk.download('punkt')
# Seed Python's `random` module, which the random-sampling helper draws from.
random.seed(RANDOM_STATE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Code


### Helper


In [27]:
def count_unique_sentences(sentences):
    """Return how many distinct items `sentences` contains."""
    return len(set(sentences))


def count_unique_tokens(tokens):
    """Return the number of distinct `.text` values among `tokens`."""
    distinct_texts = {tok.text for tok in tokens}
    return len(distinct_texts)


def count_unique_lemmas(tokens):
    """Return the number of distinct `.lemma_` values among `tokens`."""
    distinct_lemmas = {tok.lemma_ for tok in tokens}
    return len(distinct_lemmas)


def get_avg_unique_words_in_k_words(tokens, n_selection=100, n_repetitions=1000):
    """Estimate the mean number of distinct items in a random draw from `tokens`.

    Repeats `n_repetitions` times: draw `n_selection` positions without
    replacement (using the global `random` state) and count the distinct
    items found at those positions. The mean of those counts is returned.

    `random.sample` raises ValueError when `tokens` holds fewer than
    `n_selection` items.
    """
    n_tokens = len(tokens)
    distinct_counts = []
    for _ in range(n_repetitions):
        drawn_positions = random.sample(range(n_tokens), n_selection)
        distinct_counts.append(len({tokens[pos] for pos in drawn_positions}))
    return np.mean(distinct_counts)


def average_word_level_levenshtein_distance(docs, norm=False):
    """Average token-level Levenshtein distance over all unordered pairs of docs.

    Args:
        docs: sequence of dicts; each ``doc["tokenized_text"]`` is iterated and
            the ``.text`` attribute of every item is used as one token.
        norm: when True, each pair's distance is divided by the token count of
            the longer of the two documents.

    Returns:
        The mean pairwise distance, or 0 when there are fewer than two docs.
    """
    tokenized_texts = [
        [token.text for token in doc["tokenized_text"]] for doc in docs]

    total_distance = 0
    pair_count = 0

    for i, tokens1 in enumerate(tokenized_texts):
        for j in range(i + 1, len(tokenized_texts)):
            tokens2 = tokenized_texts[j]

            distance = Levenshtein.distance(tokens1, tokens2)
            if norm:
                max_tokens = max(len(tokens1), len(tokens2))
                # Two empty documents would otherwise divide by zero; their
                # normalized distance is taken to be 0.
                distance = distance / max_tokens if max_tokens else 0.0
            total_distance += distance
            pair_count += 1

    return total_distance / pair_count if pair_count > 0 else 0

def round_number(num, decimal_places):
    """Format `num` as a fixed-point string with `decimal_places` decimals."""
    return format(num, f".{decimal_places}f")

In [74]:

def add_thousand_dots(n_sample):
    """Insert thousands separators (",") into a number for table output.

    Handling by input type:
      * ``str``: the part before the first "." is grouped with commas and the
        decimal part is kept verbatim ("1234.50" -> "1,234.50").
      * ``int`` / ``np.integer`` (but not ``bool``): grouped with commas
        (1000 -> "1,000").
      * ``np.float64``: rounded to one decimal place, then grouped.
      * anything else: returned unchanged.

    Raises:
        ValueError: if the integer part of a string is not a valid integer.
    """
    if isinstance(n_sample, str):
        # Split at the first "." only; `dot` is "" when there is no decimal part.
        integer_part, dot, decimal_part = n_sample.partition('.')
        formatted_integer_part = "{:,}".format(int(integer_part))
        # int("-0") is 0, so the sign of values in (-1, 0) would be dropped.
        if formatted_integer_part == "0" and integer_part.strip().startswith("-"):
            formatted_integer_part = "-0"
        return f"{formatted_integer_part}{dot}{decimal_part}"

    if isinstance(n_sample, (int, np.integer)) and not isinstance(n_sample, bool):
        # Plain integers (e.g. a sample size) get the same grouping as strings.
        return "{:,}".format(int(n_sample))

    if isinstance(n_sample, np.float64):
        return "{:,}".format(round(n_sample, 1))

    # All other inputs are passed through untouched.
    return n_sample


### Load Datasets


In [7]:
def _load_tokenized_split(path):
    """Read one JSON split file and store `nlp(text)` on every example."""
    with open(path, 'r', encoding='utf-8') as json_file:
        examples = json.load(json_file)
    for example in examples:
        example["tokenized_text"] = nlp(example["text"])
    return examples


# dataset["synth"][llm][condition] and dataset["real"] are lists of six splits.
# The synthetic splits are read first, then the real ones.
dataset = {
    "synth": {
        llm: {
            prompting: [
                _load_tokenized_split(
                    f"../07 train models/synth/{llm}/{prompting}/split_{split}.json")
                for split in range(6)
            ]
            for prompting in FS_CONDITIONS
        }
        for llm in LLMS
    },
    "real": [
        _load_tokenized_split(f"../07 train models/real/split_{split}.json")
        for split in range(6)
    ],
}

In [8]:
def get_one_hot(subset):
    """Build one multi-hot label vector per example of `subset`.

    Position i of an example's vector is 1 when COMBINATIONS[i] equals the
    CRITERIA_RS field of at least one of the example's tags, otherwise 0.
    Returns a list of NumPy arrays in the same order as `subset`.
    """
    encoded = []
    for position in range(len(subset)):
        present = {tag[CRITERIA_RS] for tag in subset[position]["tags"]}
        encoded.append(
            np.array([int(label in present) for label in COMBINATIONS]))
    return encoded

In [9]:
# Replace every synthetic iteration by three label-stratified folds of 500
# examples each. MultilabelStratifiedKFold does not necessarily produce folds
# of equal size, so the split is retried with an incremented random_state
# until all three folds contain exactly 500 examples.
N_FOLDS = 3
FOLD_SIZE = 500

for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for iteration in range(6):
            if few_shot_condition == "random":
                subset = dataset["synth"][llm][few_shot_condition][iteration]
            else:
                # The "fixed" condition skips the first 475 examples of a split.
                subset = dataset["synth"][llm][few_shot_condition][iteration][475:]

            # The retry loop can only terminate when the examples divide into
            # N_FOLDS folds of FOLD_SIZE. Fail loudly instead of looping forever;
            # this also catches a second run of this cell, where the entry has
            # already been replaced by its folds.
            if len(subset) != N_FOLDS * FOLD_SIZE:
                raise ValueError(
                    f"Expected {N_FOLDS * FOLD_SIZE} examples for "
                    f"{llm}/{few_shot_condition}/iteration {iteration}, "
                    f"got {len(subset)}")

            # The labels do not change between retries, so encode them once.
            labels_one_hot = get_one_hot(subset)

            restart_idx = 0
            while True:
                mskf = MultilabelStratifiedKFold(
                    n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE+restart_idx)
                # Only the test indices are needed: they partition `subset`.
                section = [[subset[i] for i in test_index]
                           for _, test_index in mskf.split(subset, labels_one_hot)]

                if all(len(fold) == FOLD_SIZE for fold in section):
                    break

                restart_idx += 1

            dataset["synth"][llm][few_shot_condition][iteration] = section

In [11]:
# For each of the six iterations, collect three consecutive real splits,
# wrapping around after the last one (iteration i uses splits i, i+1, i+2 mod 6).
real_examples = [
    [dataset["real"][(start + offset) % 6] for offset in range(3)]
    for start in range(6)
]
dataset["real"] = real_examples

### Document Analysis


#### Synthetic Data


In [75]:
# Document-level statistics of the synthetic data, printed as LaTeX table rows.
# For every LLM / few-shot condition / sample size, the numbers of unique
# sentences, tokens and lemmas are averaged over the six iterations.
for idx_llm, llm in enumerate(LLMS):
    for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
        for idx_sample, n_sample in enumerate([500, 1000, 1500]):
            iterations_n_unique_tokens = []
            iterations_n_lemmas = []
            iterations_avg_unique_sentences = []
            for it in range(6):
                # Concatenate the first n_sample / 500 folds of this iteration.
                samples = [item for k in range(
                    int(n_sample / 500)) for item in dataset["synth"][llm][few_shot_condition][it][k]]
                # Flatten the tokens once and reuse them for both counts.
                all_tokens = [
                    token for example in samples for token in example["tokenized_text"]]

                iterations_n_unique_tokens.append(
                    count_unique_tokens(all_tokens))
                iterations_n_lemmas.append(count_unique_lemmas(all_tokens))
                iterations_avg_unique_sentences.append(count_unique_sentences(
                    [example["text"] for example in samples]))

            # \multirow cells are emitted only on the first row of their group.
            # Backslashes are escaped explicitly: "\m" and "\c" are invalid
            # escape sequences.
            llm_print = "\\multirow{6}{*}{" + \
                LLMS_ENCODED[llm] + \
                "}" if idx_sample == 0 and idx_fsc == 0 else ""
            fs_condition_print = "\\multirow{3}{*}{" + \
                ENCODE_CONDITION[few_shot_condition] + \
                "}" if idx_sample == 0 else ""

            print(llm_print, "&", fs_condition_print, "&", add_thousand_dots(str(n_sample)), "&",
                  add_thousand_dots(
                      round_number(np.mean(iterations_avg_unique_sentences), 1)), "&",
                  add_thousand_dots(
                      round_number(np.mean(iterations_n_unique_tokens), 1)), "&",
                  add_thousand_dots(
                      round_number(np.mean(iterations_n_lemmas), 1)), "\\\\")
        # Thin gray rule between the two conditions of an LLM, full rule after it.
        if idx_fsc == 0:
            print("\\arrayrulecolor{gray}\\cline{2-6}\\arrayrulecolor{black}")
        else:
            print("\\hline")

\multirow{6}{*}{\textbf{Llama-2-70B}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 480.3 & 711.8 & 546.5 \\
 &  & 1,000 & 933.2 & 1,048.5 & 802.5 \\
 &  & 1,500 & 1,383.2 & 1,300.3 & 994.3 \\
\arrayrulecolor{gray}\cline{2-6}\arrayrulecolor{black}
 & \multirow{3}{*}{\textbf{LRS\textsubscript{500}}} & 500 & 486.2 & 753.8 & 582.7 \\
 &  & 1,000 & 948.2 & 1,107.5 & 848.7 \\
 &  & 1,500 & 1,397.2 & 1,380.3 & 1,054.7 \\
\hline
\multirow{6}{*}{\textbf{GPT-3.5-turbo}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 308.5 & 295.7 & 216.2 \\
 &  & 1,000 & 553.3 & 377.3 & 275.0 \\
 &  & 1,500 & 777.8 & 439.8 & 319.2 \\
\arrayrulecolor{gray}\cline{2-6}\arrayrulecolor{black}
 & \multirow{3}{*}{\textbf{LRS\textsubscript{500}}} & 500 & 318.0 & 293.8 & 216.5 \\
 &  & 1,000 & 560.5 & 387.3 & 280.5 \\
 &  & 1,500 & 784.3 & 453.8 & 326.8 \\
\hline


In [36]:
# for idx_llm, llm in enumerate(LLMS):
#     for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
#         for idx_sample, n_sample in enumerate([500, 1000, 1500]):
#             iterations_avg_unique_words_in_k_words = []
#             iterations_avg_levenshtein_distance = []
#             iterations_avg_levenshtein_distance_norm = []
#             for it in range(6):
#                 samples = [item for k in range(
#                     int(n_sample / 500)) for item in dataset["synth"][llm][few_shot_condition][it][k]]
#                 n_unique_words_in_k_words = get_avg_unique_words_in_k_words(
#                     [token.text for example in samples for token in example["tokenized_text"]])
#                 avg_levenshtein_distance = average_word_level_levenshtein_distance(
#                     samples)
#                 avg_levenshtein_distance_norm = average_word_level_levenshtein_distance(
#                     samples, norm=True)

#                 iterations_avg_unique_words_in_k_words.append(
#                     n_unique_words_in_k_words)
#                 iterations_avg_levenshtein_distance.append(
#                     avg_levenshtein_distance)
#                 iterations_avg_levenshtein_distance_norm.append(
#                     avg_levenshtein_distance_norm)
                
#             llm_print = "\multirow{6}{*}{" + \
#                 LLMS_ENCODED[llm] + \
#                 "}" if idx_sample == 0 and idx_fsc == 0 else ""
#             fs_condition_print = "\multirow{3}{*}{" + \
#                 ENCODE_CONDITION[few_shot_condition] + \
#                 "}" if idx_sample == 0 else ""

#             print(llm_print, "&", fs_condition_print, "&", add_thousand_dots(n_sample), "&",
#                   add_thousand_dots(round_number(np.mean(iterations_avg_unique_words_in_k_words), 2)), "&",
#                   add_thousand_dots(round_number(np.mean(iterations_avg_levenshtein_distance), 2)), "&",
#                   add_thousand_dots(round_number(np.mean(iterations_avg_levenshtein_distance_norm), 2)))
        
#         if idx_fsc == 0:
#             print("\\arrayrulecolor{gray}\cline{2-6}\\arrayrulecolor{black}")
#         else:
#             print("\\hline")

#### Real Data


In [82]:
# Document-level statistics of the real data, printed as LaTeX table rows.
# For every sample size, the numbers of unique sentences, tokens and lemmas
# are averaged over the six iterations.
for idx_sample, n_sample in enumerate([500, 1000, 1500]):
    iterations_n_unique_tokens = []
    iterations_n_lemmas = []
    iterations_avg_unique_sentences = []

    for it in range(6):
        # Concatenate the first n_sample / 500 splits of this iteration.
        samples = [item for k in range(
            int(n_sample / 500)) for item in dataset["real"][it][k]]
        # Flatten the tokens once and reuse them for both counts.
        all_tokens = [
            token for example in samples for token in example["tokenized_text"]]

        iterations_n_unique_tokens.append(count_unique_tokens(all_tokens))
        iterations_n_lemmas.append(count_unique_lemmas(all_tokens))
        iterations_avg_unique_sentences.append(count_unique_sentences(
            [example["text"] for example in samples]))

    # \multirow cells are emitted only on the first of the three rows.
    # Backslashes are escaped explicitly: "\m" is an invalid escape sequence.
    data_source_print = "\\multirow{3}{*}{\\textbf{Real Examples}}" if idx_sample == 0 else ""

    fs_condition_print = "\\multirow{3}{*}{-}" if idx_sample == 0 else ""

    # The sample size is passed as a string so that it receives a thousands
    # separator like the other cells ("1,000" instead of "1000").
    print(data_source_print, "&", fs_condition_print, "&", add_thousand_dots(str(n_sample)), "&",
          add_thousand_dots(round_number(np.mean(iterations_avg_unique_sentences), 1)), "&",
          add_thousand_dots(round_number(np.mean(iterations_n_unique_tokens), 1)), "&",
          add_thousand_dots(round_number(np.mean(iterations_n_lemmas), 1)), "\\\\")
print("\\hline")

\multirow{3}{*}{\textbf{Real Examples}} & \multirow{3}{*}{-} & 500 & 497.0 & 1,918.2 & 1,493.0 \\
 &  & 1000 & 989.7 & 3,061.0 & 2,349.2 \\
 &  & 1500 & 1,480.7 & 3,995.5 & 3,037.8 \\
\hline


In [38]:
# for idx_sample, n_sample in enumerate([500, 1000, 1500]):
#     iterations_avg_unique_words_in_k_words = []
#     iterations_avg_levenshtein_distance = []
#     iterations_avg_levenshtein_distance_norm = []
#     for it in range(6):
#         samples = [item for k in range(
#             int(n_sample / 500)) for item in dataset["real"][it][k]]

#         n_unique_words_in_k_words = get_avg_unique_words_in_k_words(
#             [token.text for example in samples for token in example["tokenized_text"]])
#         avg_levenshtein_distance = average_word_level_levenshtein_distance(
#             samples)
#         avg_levenshtein_distance_norm = average_word_level_levenshtein_distance(
#             samples, norm=True)

#         iterations_avg_unique_words_in_k_words.append(
#             n_unique_words_in_k_words)
#         iterations_avg_levenshtein_distance.append(
#             avg_levenshtein_distance)
#         iterations_avg_levenshtein_distance_norm.append(
#             avg_levenshtein_distance_norm)
        
#     data_source_print = "\multirow{3}{*}{\\textbf{Real Examples}}" if idx_sample == 0 else ""
#     fs_condition_print = "\multirow{3}{*}{-}" if idx_sample == 0 else ""

#     print(data_source_print, "&", fs_condition_print, "&", add_thousand_dots(n_sample), "&",
#           add_thousand_dots(
#               round_number(np.mean(iterations_avg_unique_words_in_k_words), 2)), "&",
#           add_thousand_dots(
#               round_number(np.mean(iterations_avg_levenshtein_distance), 2)), "&",
#           add_thousand_dots(round_number(np.mean(iterations_avg_levenshtein_distance_norm), 2)), "\\\\")
# print("\\hline")

### Aspect Term Analysis

#### Synthetic Data

In [None]:
# Aspect-term statistics of the synthetic data, printed as LaTeX table rows.
# For every LLM / few-shot condition / sample size, the values are averaged
# over the six iterations.
for idx_llm, llm in enumerate(LLMS):
    for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
        for idx_sample, n_sample in enumerate([500, 1000, 1500]):
            tags_from_splits_count = []
            tags_from_splits_count_implicit = []
            tags_from_splits_count_explicit = []
            count_unique_aspect_terms_in_split = []
            count_unique_aspect_terms_in_100_aspect_terms = []
            count_unique_aspect_terms_in_200_aspect_terms = []
            count_unique_aspect_terms_in_300_aspect_terms = []

            for it in range(6):
                # All tags of the first n_sample / 500 folds of this iteration.
                tags = [tag for k in range(
                    int(n_sample / 500)) for example in dataset["synth"][llm][few_shot_condition][it][k] for tag in example["tags"]]
                tags_explicit = [tag["text"]
                                 for tag in tags if tag["type"] == "label-explicit"]
                tags_from_splits_count.append(len(tags))
                tags_from_splits_count_explicit.append(len(tags_explicit))
                tags_from_splits_count_implicit.append(
                    len([tag for tag in tags if tag["type"] == "label-implicit"]))

                # Average number of distinct terms among k randomly drawn
                # explicit aspect terms (k = 100, 200, 300).
                count_unique_aspect_terms_in_100_aspect_terms.append(
                    get_avg_unique_words_in_k_words(tags_explicit, n_selection=100))
                count_unique_aspect_terms_in_200_aspect_terms.append(
                    get_avg_unique_words_in_k_words(tags_explicit, n_selection=200))
                count_unique_aspect_terms_in_300_aspect_terms.append(
                    get_avg_unique_words_in_k_words(tags_explicit, n_selection=300))

                count_unique_aspect_terms_in_split.append(
                    len(set(tags_explicit)))

            # Share of implicit tags: ratio of the means, plus the standard
            # deviation of the per-iteration percentages.
            implicit_share = np.mean(
                tags_from_splits_count_implicit) / np.mean(tags_from_splits_count) * 100
            implicit_share_sd = np.std([a / b * 100 for a, b in zip(
                tags_from_splits_count_implicit, tags_from_splits_count)])

            # \multirow cells are emitted only on the first row of their group.
            # Backslashes are escaped explicitly: "\m" and "\c" are invalid
            # escape sequences.
            llm_print = "\\multirow{6}{*}{" + \
                LLMS_ENCODED[llm] + \
                "}" if idx_sample == 0 and idx_fsc == 0 else ""
            fs_condition_print = "\\multirow{3}{*}{" + \
                ENCODE_CONDITION[few_shot_condition] + \
                "}" if idx_sample == 0 else ""

            # The sample size is passed as a string so that it receives a
            # thousands separator ("1,000" instead of "1000").
            print(llm_print, "&", fs_condition_print,
                  "&", add_thousand_dots(str(n_sample)),  # n samples
                  "&", add_thousand_dots(
                      round_number(np.mean(tags_from_splits_count), 2)),  # n aspects
                  "&", add_thousand_dots(
                      round_number(implicit_share, 1)) + " \\%",  # % implicit
                  "\\textit{(SD = " + add_thousand_dots(
                      round_number(implicit_share_sd, 2)) + ")}",
                  "&", add_thousand_dots(
                      round_number(np.mean(tags_from_splits_count_explicit), 2)),  # n explicit aspects
                  "&", add_thousand_dots(
                      round_number(np.mean(count_unique_aspect_terms_in_split), 2)),  # n unique
                  "&", add_thousand_dots(
                      round_number(np.mean(count_unique_aspect_terms_in_100_aspect_terms), 1)),
                  "&", add_thousand_dots(
                      round_number(np.mean(count_unique_aspect_terms_in_200_aspect_terms), 1)),
                  "&", add_thousand_dots(
                      round_number(np.mean(count_unique_aspect_terms_in_300_aspect_terms), 1)),
                  "\\\\")

        # Thin gray rule between the two conditions of an LLM, full rule after it.
        if idx_fsc == 0:
            print("\\arrayrulecolor{gray}\\cline{2-10}\\arrayrulecolor{black}")
        else:
            print("\\hline")

\multirow{6}{*}{\textbf{Llama-2-70B}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 795.6 & 27.2 \% \textit{(SD = 4.25)} & 578.8 & 104.8 & 34.1 & 52.6 & 68.4 \\
 &  & 1,000 & 1,591.4 & 27.8 \% \textit{(SD = 4.24)} & 1,149.4 & 170.0 & 34.0 & 52.5 & 68.5 \\
 &  & 1,500 & 2,388.0 & 28.0 \% \textit{(SD = 4.22)} & 1,720.4 & 220.0 & 33.4 & 51.5 & 67.0 \\
\arrayrulecolor{gray}\cline{2-10}\arrayrulecolor{black}
 & \multirow{3}{*}{\textbf{LRS\textsubscript{500}}} & 500 & 704.6 & 26.6 \% \textit{(SD = 1.99)} & 517.2 & 116.8 & 38.0 & 60.7 & 80.2 \\
 &  & 1,000 & 1,407.8 & 26.3 \% \textit{(SD = 1.77)} & 1,037.4 & 194.4 & 38.9 & 62.3 & 82.3 \\
 &  & 1,500 & 2,109.6 & 26.5 \% \textit{(SD = 1.14)} & 1,550.0 & 256.6 & 38.8 & 62.1 & 82.1 \\
\hline
\multirow{6}{*}{\textbf{GPT-3.5-turbo}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 795.6 & 35.1 \% \textit{(SD = 9.18)} & 516.2 & 38.8 & 17.9 & 24.7 & 30.0 \\
 &  & 1,000 & 1,591.4 & 35.1 \% \textit{(SD = 9.28)} & 1,032.4 & 50.6 & 1

#### Real Data

In [None]:
# Aspect-term statistics of the real data, printed as LaTeX table rows.
# For every sample size, the values are averaged over the six iterations.
for idx_sample, n_sample in enumerate([500, 1000, 1500]):
    tags_from_splits_count = []
    tags_from_splits_count_implicit = []
    tags_from_splits_count_explicit = []
    count_unique_aspect_terms_in_split = []
    count_unique_aspect_terms_in_100_aspect_terms = []
    count_unique_aspect_terms_in_200_aspect_terms = []
    count_unique_aspect_terms_in_300_aspect_terms = []

    for it in range(6):
        # All tags of the first n_sample / 500 splits of this iteration.
        tags = [tag for k in range(int(n_sample / 500))
                for example in dataset["real"][it][k] for tag in example["tags"]]
        tags_explicit = [tag["text"]
                         for tag in tags if tag["type"] == "label-explicit"]
        tags_from_splits_count.append(len(tags))
        tags_from_splits_count_explicit.append(len(tags_explicit))
        tags_from_splits_count_implicit.append(
            len([tag for tag in tags if tag["type"] == "label-implicit"]))

        # Average number of distinct terms among k randomly drawn explicit
        # aspect terms (k = 100, 200, 300).
        count_unique_aspect_terms_in_100_aspect_terms.append(
            get_avg_unique_words_in_k_words(tags_explicit, n_selection=100))
        count_unique_aspect_terms_in_200_aspect_terms.append(
            get_avg_unique_words_in_k_words(tags_explicit, n_selection=200))
        count_unique_aspect_terms_in_300_aspect_terms.append(
            get_avg_unique_words_in_k_words(tags_explicit, n_selection=300))

        count_unique_aspect_terms_in_split.append(len(set(tags_explicit)))

    # Share of implicit tags: ratio of the means, plus the standard deviation
    # of the per-iteration percentages.
    implicit_share = np.mean(
        tags_from_splits_count_implicit) / np.mean(tags_from_splits_count) * 100
    implicit_share_sd = np.std([a / b * 100 for a, b in zip(
        tags_from_splits_count_implicit, tags_from_splits_count)])

    # \multirow cells are emitted only on the first of the three rows.
    # Backslashes are escaped explicitly: "\m" is an invalid escape sequence.
    data_source_print = "\\multirow{3}{*}{\\textbf{Real Examples}}" if idx_sample == 0 else ""
    fs_condition_print = "\\multirow{3}{*}{-}" if idx_sample == 0 else ""

    # The sample size is passed as a string so that it receives a thousands
    # separator ("1,000" instead of "1000").
    print(data_source_print, "&", fs_condition_print,
          "&", add_thousand_dots(str(n_sample)),  # n samples
          "&", add_thousand_dots(
              round_number(np.mean(tags_from_splits_count), 2)),  # n aspects
          "&", add_thousand_dots(
              round_number(implicit_share, 1)) + " \\%",  # % implicit
          "\\textit{(SD = " + add_thousand_dots(
              round_number(implicit_share_sd, 2)) + ")}",
          "&", add_thousand_dots(
              round_number(np.mean(tags_from_splits_count_explicit), 2)),  # n aspects explicit
          "&", add_thousand_dots(
              round_number(np.mean(count_unique_aspect_terms_in_split), 2)),  # n unique
          "&", add_thousand_dots(
              round_number(np.mean(count_unique_aspect_terms_in_100_aspect_terms), 1)),
          "&", add_thousand_dots(
              round_number(np.mean(count_unique_aspect_terms_in_200_aspect_terms), 1)),
          "&", add_thousand_dots(
              round_number(np.mean(count_unique_aspect_terms_in_300_aspect_terms), 1)), "\\\\")
print("\\hline")

\multirow{3}{*}{\textbf{Real Examples}} & \multirow{3}{*}{-} & 500 & 703.2 & 26.5 \% \textit{(SD = 1.06)} & 517.2 & 256.4 & 68.3 & 120.2 & 166.5 \\
 &  & 1,000 & 1,403.8 & 26.6 \% \textit{(SD = 0.56)} & 1,029.8 & 438.4 & 68.3 & 120.4 & 166.9 \\
 &  & 1,500 & 2,107.2 & 26.8 \% \textit{(SD = 0.19)} & 1,542.4 & 595.6 & 68.1 & 120.0 & 166.2 \\
\hline


### Aspect Term Analysis (With Aspect Category)

#### Synthetic Examples

In [None]:
# Aspect-term statistics of the synthetic data per aspect category, printed as
# LaTeX table rows. For every LLM / few-shot condition / aspect category /
# sample size, the values are averaged over the six iterations.
for idx_llm, llm in enumerate(LLMS):
    for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
        for idx_ac, aspect_category in enumerate(ASPECT_CATEGORIES):
            for idx_sample, n_sample in enumerate([500, 1000, 1500]):
                tags_from_splits_count = []
                tags_from_splits_count_implicit = []
                tags_from_splits_count_explicit = []
                count_unique_aspect_terms_in_split = []
                count_unique_aspect_terms_in_k_aspect_terms = []
                for it in range(6):
                    # Tags of this category in the first n_sample / 500 folds.
                    tags = [tag for k in range(
                        int(n_sample / 500)) for example in dataset["synth"][llm][few_shot_condition][it][k] for tag in example["tags"] if tag["label"] == aspect_category]
                    tags_explicit = [tag["text"]
                                     for tag in tags if tag["type"] == "label-explicit"]
                    tags_from_splits_count.append(len(tags))
                    tags_from_splits_count_explicit.append(len(tags_explicit))
                    tags_from_splits_count_implicit.append(
                        len([tag for tag in tags if tag["type"] == "label-implicit"]))

                    # Average number of distinct terms among 10 randomly drawn
                    # explicit aspect terms of this category.
                    count_unique_aspect_terms_in_k_aspect_terms.append(
                        get_avg_unique_words_in_k_words(tags_explicit, n_selection=10))

                    count_unique_aspect_terms_in_split.append(
                        len(set(tags_explicit)))

                # Share of implicit tags: ratio of the means, plus the standard
                # deviation of the per-iteration percentages.
                implicit_share = np.mean(
                    tags_from_splits_count_implicit) / np.mean(tags_from_splits_count) * 100
                implicit_share_sd = np.std([a / b * 100 for a, b in zip(
                    tags_from_splits_count_implicit, tags_from_splits_count)])

                # \multirow cells are emitted only on the first row of their
                # group. LLMS_ENCODED already wraps the name in \textbf, so it
                # is not wrapped a second time here. Backslashes are escaped
                # explicitly: "\m" and "\c" are invalid escape sequences.
                llm_print = "\\multirow{30}{*}{" + \
                    LLMS_ENCODED[llm] + \
                    "}" if idx_sample == 0 and idx_fsc == 0 and idx_ac == 0 else ""
                fs_condition_print = "\\multirow{15}{*}{" + \
                    ENCODE_CONDITION[few_shot_condition] + \
                    "}" if idx_sample == 0 and idx_ac == 0 else ""
                ac_print = "\\multirow{3}{*}{\\texttt{" + aspect_category + "}}" if idx_sample == 0 else ""

                # The sample size is passed as a string so that it receives a
                # thousands separator ("1,000" instead of "1000").
                print(llm_print, "&", fs_condition_print, "&", ac_print,
                      "&", add_thousand_dots(str(n_sample)),
                      "&", add_thousand_dots(
                          round_number(np.mean(tags_from_splits_count), 2)),
                      "&", add_thousand_dots(
                          round_number(implicit_share, 1)) + " \\%",  # % implicit
                      "\\textit{(SD = " + add_thousand_dots(
                          round_number(implicit_share_sd, 2)) + ")}",
                      "&", add_thousand_dots(
                          round_number(np.mean(tags_from_splits_count_explicit), 2)),
                      "&", add_thousand_dots(round_number(
                          np.mean(count_unique_aspect_terms_in_split), 2)),
                      "&", add_thousand_dots(round_number(np.mean(count_unique_aspect_terms_in_k_aspect_terms), 2)), "\\\\")
            # Rule after each category block: full rule after the last category
            # of the second condition, partial rule after the last category of
            # the first condition, thin gray rule between categories.
            if idx_fsc == 1 and idx_ac == 4:
                print("\\hline")
            elif idx_ac == 4:
                print("\\cline{2-9}")
            else:
                print("\\arrayrulecolor{gray}\\cline{3-9}\\arrayrulecolor{black}")

\multirow{30}{*}{\textbf{\textbf{Llama-2-70B}}} & \multirow{15}{*}{\textbf{LRS\textsubscript{25}}} & \multirow{3}{*}{\texttt{GENERAL-IMPRESSION}} & 500 & 159.4 & 44.4 \% \textit{(SD = 4.79)} & 88.6 & 34.6 & 7.54 \\
 &  &  & 1,000 & 319.8 & 45.5 \% \textit{(SD = 5.05)} & 174.4 & 57.0 & 7.49 \\
 &  &  & 1,500 & 479.0 & 44.0 \% \textit{(SD = 6.19)} & 268.4 & 78.4 & 7.51 \\
\arrayrulecolor{gray}\cline{3-9}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{FOOD}} & 500 & 158.6 & 21.7 \% \textit{(SD = 5.21)} & 124.2 & 33.6 & 5.29 \\
 &  &  & 1,000 & 318.0 & 22.8 \% \textit{(SD = 5.75)} & 245.4 & 56.0 & 5.17 \\
 &  &  & 1,500 & 478.0 & 23.5 \% \textit{(SD = 5.24)} & 365.6 & 71.0 & 5.17 \\
\arrayrulecolor{gray}\cline{3-9}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{SERVICE}} & 500 & 160.0 & 18.1 \% \textit{(SD = 3.8)} & 131.0 & 18.4 & 4.58 \\
 &  &  & 1,000 & 318.4 & 18.9 \% \textit{(SD = 4.68)} & 258.2 & 26.4 & 4.55 \\
 &  &  & 1,500 & 477.0 & 19.5 \% \textit{(SD = 4.03)} & 384.2 & 

#### Real Examples

In [None]:
# Aspect-term statistics of the real data per aspect category, printed as
# LaTeX table rows. For every aspect category / sample size, the values are
# averaged over the six iterations.
for idx_ac, aspect_category in enumerate(ASPECT_CATEGORIES):
    for idx_sample, n_sample in enumerate([500, 1000, 1500]):
        tags_from_splits_count = []
        tags_from_splits_count_implicit = []
        tags_from_splits_count_explicit = []
        count_unique_aspect_terms_in_split = []
        count_unique_aspect_terms_in_k_aspect_terms = []
        for it in range(6):
            # Tags of this category in the first n_sample / 500 splits.
            tags = [tag for k in range(int(n_sample / 500))
                    for example in dataset["real"][it][k] for tag in example["tags"] if tag["label"] == aspect_category]
            tags_explicit = [tag["text"]
                             for tag in tags if tag["type"] == "label-explicit"]
            tags_from_splits_count.append(len(tags))
            tags_from_splits_count_explicit.append(len(tags_explicit))
            tags_from_splits_count_implicit.append(
                len([tag for tag in tags if tag["type"] == "label-implicit"]))

            # Average number of distinct terms among 10 randomly drawn
            # explicit aspect terms of this category.
            count_unique_aspect_terms_in_k_aspect_terms.append(
                get_avg_unique_words_in_k_words(tags_explicit, n_selection=10))

            count_unique_aspect_terms_in_split.append(len(set(tags_explicit)))

        # Share of implicit tags: ratio of the means, plus the standard
        # deviation of the per-iteration percentages.
        implicit_share = np.mean(
            tags_from_splits_count_implicit) / np.mean(tags_from_splits_count) * 100
        implicit_share_sd = np.std([a / b * 100 for a, b in zip(
            tags_from_splits_count_implicit, tags_from_splits_count)])

        # \multirow cells are emitted only on the first row of their group.
        # Backslashes are escaped explicitly: "\m" and "\c" are invalid
        # escape sequences.
        data_source_print = "\\multirow{15}{*}{\\textbf{Real Examples}}" if idx_sample == 0 and idx_ac == 0 else ""
        fs_condition_print = "\\multirow{15}{*}{-}" if idx_sample == 0 and idx_ac == 0 else ""
        ac_print = "\\multirow{3}{*}{\\texttt{" + aspect_category + "}}" if idx_sample == 0 else ""

        # The sample size is passed as a string so that it receives a
        # thousands separator ("1,000" instead of "1000").
        print(data_source_print, "&", fs_condition_print, "&", ac_print,
              "&", add_thousand_dots(str(n_sample)),
              "&", add_thousand_dots(
                  round_number(np.mean(tags_from_splits_count), 2)),
              "&", add_thousand_dots(
                  round_number(implicit_share, 1)) + " \\%",  # % implicit
              "\\textit{(SD = " + add_thousand_dots(
                  round_number(implicit_share_sd, 2)) + ")}",
              "&", add_thousand_dots(
                  round_number(np.mean(tags_from_splits_count_explicit), 2)),
              "&", add_thousand_dots(round_number(
                  np.mean(count_unique_aspect_terms_in_split), 2)),
              "&", add_thousand_dots(round_number(np.mean(count_unique_aspect_terms_in_k_aspect_terms), 2)), "\\\\")
    # Thin gray rule after every category block.
    print("\\arrayrulecolor{gray}\\cline{3-9}\\arrayrulecolor{black}")
print("\\hline")

\multirow{15}{*}{\textbf{Real Examples}} & \multirow{15}{*}{-} & \multirow{3}{*}{\texttt{GENERAL-IMPRESSION}} & 500 & 124.6 & 77.7 \% \textit{(SD = 2.23)} & 27.8 & 14.0 & 6.9 \\
 &  &  & 1,000 & 249.2 & 77.8 \% \textit{(SD = 1.03)} & 55.2 & 24.0 & 7.06 \\
 &  &  & 1,500 & 376.8 & 78.2 \% \textit{(SD = 0.8)} & 82.0 & 32.4 & 7.05 \\
\arrayrulecolor{gray}\cline{3-9}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{FOOD}} & 500 & 281.2 & 11.0 \% \textit{(SD = 0.97)} & 250.4 & 144.8 & 8.7 \\
 &  &  & 1,000 & 561.2 & 11.2 \% \textit{(SD = 0.86)} & 498.6 & 249.6 & 8.7 \\
 &  &  & 1,500 & 839.4 & 11.3 \% \textit{(SD = 0.58)} & 744.8 & 339.8 & 8.68 \\
\arrayrulecolor{gray}\cline{3-9}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{SERVICE}} & 500 & 174.8 & 22.7 \% \textit{(SD = 2.62)} & 135.2 & 51.4 & 7.15 \\
 &  &  & 1,000 & 349.8 & 22.8 \% \textit{(SD = 0.96)} & 270.2 & 87.2 & 7.15 \\
 &  &  & 1,500 & 525.8 & 22.6 \% \textit{(SD = 0.56)} & 407.0 & 116.8 & 7.11 \\
\arrayrulecolor{gray}\