# Notebook: Analyse Language


## Packages


In [1]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from spacy.lang.de.stop_words import STOP_WORDS
from collections import Counter
import Levenshtein
import numpy as np
import random
import string
import spacy
import nltk
import json

## Constants


In [2]:
# Aspect categories that a tag's "label" field is compared against.
ASPECT_CATEGORIES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]
# LLM identifiers; used as dictionary keys and as directory names when loading.
LLMS = ["Llama70B", "GPT-3"]
# Few-shot conditions; used as dictionary keys and as directory names when loading.
FS_CONDITIONS = ["fixed", "random"]
# Tag field read by `get_one_hot` to build the stratification labels.
CRITERIA_RS = "tag_with_polarity"
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
MENTIONING_TYPE = ["implicit", "explicit"]
# Every "<ASPECT>-<POLARITY>" label. The aspect order here intentionally stays
# as written (it differs from ASPECT_CATEGORIES) because it fixes the column
# order of the one-hot vectors.
COMBINATIONS = [
    "-".join((aspect, polarity))
    for aspect in ("SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE")
    for polarity in POLARITIES
]
# Seed for Python's RNG and base seed for the stratified splitter.
RANDOM_STATE = 43

In [3]:
# LaTeX display names for the LLM keys (raw strings keep the backslashes literal).
LLMS_ENCODED = {
    "GPT-3": r"\textbf{GPT-3.5-turbo}",
    "Llama70B": r"\textbf{Llama-2-70B}",
}
# LaTeX display names for the few-shot conditions.
ENCODE_CONDITION = {
    "fixed": r"\textbf{LRS\textsubscript{25}}",
    "random": r"\textbf{LRS\textsubscript{500}}",
}

## Settings


In [4]:
# Load the large German spaCy pipeline; it is applied to every example text
# when the datasets are loaded.
nlp = spacy.load("de_core_news_lg")
# Download NLTK's "punkt" package (reported as already up-to-date in the
# output below when it is present locally).
nltk.download('punkt')
# Seed Python's global RNG, which `random.sample` in the helper functions
# draws from. Results therefore depend on the order in which cells consume it.
random.seed(RANDOM_STATE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Code


### Helper


In [5]:
def count_unique_sentences(sentences):
    """Return the number of distinct items in `sentences`."""
    return len({sentence for sentence in sentences})


def count_unique_tokens(tokens):
    """Return the number of distinct `.text` values among `tokens`."""
    distinct_texts = {tok.text for tok in tokens}
    return len(distinct_texts)


def count_unique_lemmas(tokens):
    """Return the number of distinct `.lemma_` values among `tokens`."""
    distinct_lemmas = {tok.lemma_ for tok in tokens}
    return len(distinct_lemmas)


def get_avg_unique_words_in_k_words(tokens, n_selection=100, n_repetitions=1000):
    """Average number of distinct items in random draws of `n_selection` items.

    Draws `n_selection` positions of `tokens` without replacement (using the
    module-level `random` generator), counts the distinct items at those
    positions, repeats this `n_repetitions` times and returns the mean count.

    Raises:
        ValueError: from `random.sample` when `n_selection` exceeds
            `len(tokens)`.
    """
    positions = range(len(tokens))
    unique_counts = [
        len({tokens[pos] for pos in random.sample(positions, n_selection)})
        for _ in range(n_repetitions)
    ]
    return np.mean(unique_counts)


def average_word_level_levenshtein_distance(docs, norm=False):
    """Mean word-level Levenshtein distance over all unordered document pairs.

    Args:
        docs: sequence of dicts whose "tokenized_text" entry yields objects
            with a `.text` attribute; the distance is computed on the lists
            of these texts.
        norm: when True, each pair's distance is divided by the token count
            of the longer of the two documents.

    Returns:
        The average distance over all pairs, or 0 when there are fewer than
        two documents.
    """
    tokenized_texts = [
        [token.text for token in doc["tokenized_text"]] for doc in docs]

    total_distance = 0
    pair_count = 0

    for i, tokens1 in enumerate(tokenized_texts):
        for j in range(i + 1, len(tokenized_texts)):
            tokens2 = tokenized_texts[j]

            distance = Levenshtein.distance(tokens1, tokens2)
            if norm:
                max_tokens = max(len(tokens1), len(tokens2))
                # Two empty documents are identical; without this guard the
                # normalisation would divide by zero.
                distance = distance / max_tokens if max_tokens else 0.0
            total_distance += distance
            pair_count += 1

    return total_distance / pair_count if pair_count > 0 else 0

In [6]:
def round_number(num, decimal_places):
    """Return `num` as a fixed-point string with `decimal_places` decimals."""
    first_pass = f"{num:.{decimal_places}f}"
    # The formatted text is parsed back into a float and formatted a second time.
    return f"{float(first_pass):.{decimal_places}f}"

def add_thousand_dots(n_sample):
    """Format a number with "," as thousands separator for table output.

    Handling by input type:
      * str: the part before the first "." is parsed as an integer and
        formatted with separators; the decimal part is kept verbatim.
      * int / NumPy integer: formatted with separators. These were previously
        returned unchanged, which printed "1000" next to "1,000" in the tables.
      * float (including `np.float64`): rounded to one decimal and formatted.
      * anything else (including bool): returned unchanged.

    Raises:
        ValueError: if a string's integer part cannot be parsed by `int`.
    """
    if isinstance(n_sample, str):
        integer_part, dot, decimal_part = n_sample.partition('.')
        # int("-0") is 0 and would lose the sign of values such as "-0.5",
        # so the sign is carried separately.
        sign = "-" if integer_part.strip().startswith("-") else ""
        formatted_integer_part = "{:,}".format(abs(int(integer_part)))
        return f"{sign}{formatted_integer_part}{dot}{decimal_part}"

    if isinstance(n_sample, bool):
        # bool is a subclass of int; leave it untouched.
        return n_sample
    if isinstance(n_sample, (int, np.integer)):
        return "{:,}".format(int(n_sample))
    if isinstance(n_sample, float):
        return "{:,}".format(round(n_sample, 1))

    return n_sample


### Load Datasets


In [7]:
# All corpora live in one dict: synthetic data is keyed by LLM and few-shot
# condition, real data is a flat list of splits.
dataset = {"synth": {}, "real": []}

# Synthetic examples: six JSON splits per (LLM, few-shot condition) pair.
for llm in LLMS:
    dataset["synth"][llm] = {}
    for prompting in FS_CONDITIONS:
        loaded_splits = dataset["synth"][llm].setdefault(prompting, [])
        for split in range(6):
            split_path = f"../07 train models/synth/{llm}/{prompting}/split_{split}.json"
            with open(split_path, mode='r', encoding='utf-8') as json_file:
                split_data = json.load(json_file)
            # Attach the spaCy analysis of the text to every example.
            for example in split_data:
                example["tokenized_text"] = nlp(example["text"])
            loaded_splits.append(split_data)

# Real examples: six JSON splits.
for split in range(6):
    split_path = f"../07 train models/real/split_{split}.json"
    with open(split_path, mode='r', encoding='utf-8') as json_file:
        split_data = json.load(json_file)
    for example in split_data:
        example["tokenized_text"] = nlp(example["text"])
    dataset["real"].append(split_data)

In [8]:
def get_one_hot(subset):
    """Encode each example's tag labels as a 0/1 vector over COMBINATIONS.

    For every example in `subset`, the `CRITERIA_RS` field of its tags is
    collected, and position i of the resulting NumPy array is 1 when
    COMBINATIONS[i] occurs among them, else 0.

    Returns:
        A list with one array per example, in the order of `subset`.
    """
    encoded_examples = []
    for idx in range(len(subset)):
        present_labels = {tag[CRITERIA_RS] for tag in subset[idx]["tags"]}
        encoded_examples.append(
            np.array([1 if label in present_labels else 0 for label in COMBINATIONS]))
    return encoded_examples

In [9]:
# Replace every synthetic iteration by three label-stratified folds of exactly
# 500 examples each.
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for iteration in range(6):
            if few_shot_condition == "random":
                subset = dataset["synth"][llm][few_shot_condition][iteration]
            else:
                # NOTE(review): the first 475 entries are skipped for the
                # "fixed" condition; the reason is not visible here, confirm.
                subset = dataset["synth"][llm][few_shot_condition][iteration][475:]

            # Three folds of 500 can only exist for exactly 1500 examples;
            # any other size would make the retry loop below spin forever.
            if len(subset) != 1500:
                raise ValueError(
                    f"Expected 1500 examples for {llm}/{few_shot_condition}/"
                    f"split {iteration}, got {len(subset)}")

            # The labels do not change between retries, so encode them once.
            labels_one_hot = get_one_hot(subset)

            found_3_split = False
            restart_idx = 0
            # The stratifier does not guarantee equal fold sizes, so retry
            # with a shifted seed until all three folds hold 500 examples.
            while not found_3_split:
                mskf = MultilabelStratifiedKFold(
                    n_splits=3, shuffle=True, random_state=RANDOM_STATE+restart_idx)
                section = []
                for train_index, test_index in mskf.split(subset, labels_one_hot):
                    split_500 = [subset[i] for i in test_index]
                    section.append(split_500)

                found_3_split = all(len(fold) == 500 for fold in section)

                restart_idx += 1

            dataset["synth"][llm][few_shot_condition][iteration] = section

In [10]:
# For each start index i, group the three consecutive real splits
# i, i+1, i+2 (wrapping around after the sixth split).
real_examples = []
for i in range(6):
    window = []
    for k in range(3):
        t = (i + k) % 6
        window.append(dataset["real"][t])
    real_examples.append(window)
dataset["real"] = real_examples

### Document Analysis


#### Synthetic Data


In [11]:
# Print the LaTeX rows of the document-level statistics for the synthetic data:
# unique sentences, unique tokens and unique lemmas, averaged over 6 iterations.
for idx_llm, llm in enumerate(LLMS):
    for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
        for idx_sample, n_sample in enumerate([500, 1000, 1500]):
            iterations_n_unique_tokens = []
            iterations_n_lemmas = []
            iterations_avg_unique_sentences = []
            for it in range(6):
                # Concatenate the first n_sample / 500 folds of this iteration.
                samples = [item for k in range(
                    int(n_sample / 500)) for item in dataset["synth"][llm][few_shot_condition][it][k]]
                # Flatten the tokens once; both counters read the same list.
                tokens = [
                    token for example in samples for token in example["tokenized_text"]]
                n_unique_tokens = count_unique_tokens(tokens)
                n_unique_lemmas = count_unique_lemmas(tokens)
                n_unique_sentences = count_unique_sentences(
                    [example["text"] for example in samples])

                iterations_n_unique_tokens.append(n_unique_tokens)
                iterations_n_lemmas.append(n_unique_lemmas)
                iterations_avg_unique_sentences.append(n_unique_sentences)

            # The multirow labels are only emitted on the first row of their group.
            # Backslashes are escaped explicitly: "\m" and "\c" are invalid
            # escape sequences and trigger a warning on current Python versions.
            llm_print = "\\multirow{6}{*}{" + \
                LLMS_ENCODED[llm] + \
                "}" if idx_sample == 0 and idx_fsc == 0 else ""
            fs_condition_print = "\\multirow{3}{*}{" + \
                ENCODE_CONDITION[few_shot_condition] + \
                "}" if idx_sample == 0 else ""

            print(llm_print, "&", fs_condition_print, "&", add_thousand_dots(str(n_sample)), "&",
                  add_thousand_dots(
                      round_number(np.mean(iterations_avg_unique_sentences), 1)), "&",
                  add_thousand_dots(
                      round_number(np.mean(iterations_n_unique_tokens), 1)), "&",
                  # Formatted through round_number like the other columns.
                  add_thousand_dots(
                      round_number(np.mean(iterations_n_lemmas), 1)), "\\\\")
        if idx_fsc == 0:
            print("\\arrayrulecolor{gray}\\cline{2-6}\\arrayrulecolor{black}")
        else:
            print("\\hline")

\multirow{6}{*}{\textbf{Llama-2-70B}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 480.3 & 711.8 & 546.5 \\
 &  & 1,000 & 933.2 & 1,048.5 & 802.5 \\
 &  & 1,500 & 1,383.2 & 1,300.3 & 994.3 \\
\arrayrulecolor{gray}\cline{2-6}\arrayrulecolor{black}
 & \multirow{3}{*}{\textbf{LRS\textsubscript{500}}} & 500 & 486.2 & 753.8 & 582.7 \\
 &  & 1,000 & 948.2 & 1,107.5 & 848.7 \\
 &  & 1,500 & 1,397.2 & 1,380.3 & 1,054.7 \\
\hline
\multirow{6}{*}{\textbf{GPT-3.5-turbo}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 308.5 & 295.7 & 216.2 \\
 &  & 1,000 & 553.3 & 377.3 & 275.0 \\
 &  & 1,500 & 777.8 & 439.8 & 319.2 \\
\arrayrulecolor{gray}\cline{2-6}\arrayrulecolor{black}
 & \multirow{3}{*}{\textbf{LRS\textsubscript{500}}} & 500 & 318.0 & 293.8 & 216.5 \\
 &  & 1,000 & 560.5 & 387.3 & 280.5 \\
 &  & 1,500 & 784.3 & 453.8 & 326.8 \\
\hline


In [12]:
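# NOTE(review): this whole cell is disabled (commented out). Before re-enabling it,
# note that its print call lacks the trailing "\\\\" that the other table cells emit.
# for idx_llm, llm in enumerate(LLMS):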
#     for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
#         for idx_sample, n_sample in enumerate([500, 1000, 1500]):
#             iterations_avg_unique_words_in_k_words = []
#             iterations_avg_levenshtein_distance = []
#             iterations_avg_levenshtein_distance_norm = []
#             for it in range(6):
#                 samples = [item for k in range(
#                     int(n_sample / 500)) for item in dataset["synth"][llm][few_shot_condition][it][k]]
#                 n_unique_words_in_k_words = get_avg_unique_words_in_k_words(
#                     [token.text for example in samples for token in example["tokenized_text"]])
#                 avg_levenshtein_distance = average_word_level_levenshtein_distance(
#                     samples)
#                 avg_levenshtein_distance_norm = average_word_level_levenshtein_distance(
#                     samples, norm=True)

#                 iterations_avg_unique_words_in_k_words.append(
#                     n_unique_words_in_k_words)
#                 iterations_avg_levenshtein_distance.append(
#                     avg_levenshtein_distance)
#                 iterations_avg_levenshtein_distance_norm.append(
#                     avg_levenshtein_distance_norm)
                
#             llm_print = "\multirow{6}{*}{" + \
#                 LLMS_ENCODED[llm] + \
#                 "}" if idx_sample == 0 and idx_fsc == 0 else ""
#             fs_condition_print = "\multirow{3}{*}{" + \
#                 ENCODE_CONDITION[few_shot_condition] + \
#                 "}" if idx_sample == 0 else ""

#             print(llm_print, "&", fs_condition_print, "&", add_thousand_dots(n_sample), "&",
#                   add_thousand_dots(round_number(np.mean(iterations_avg_unique_words_in_k_words), 2)), "&",
#                   add_thousand_dots(round_number(np.mean(iterations_avg_levenshtein_distance), 2)), "&",
#                   add_thousand_dots(round_number(np.mean(iterations_avg_levenshtein_distance_norm), 2)))
        
#         if idx_fsc == 0:
#             print("\\arrayrulecolor{gray}\cline{2-6}\\arrayrulecolor{black}")
#         else:
#             print("\\hline")

#### Real Data


In [13]:
# Print the LaTeX rows of the document-level statistics for the real data:
# unique sentences, unique tokens and unique lemmas, averaged over 6 iterations.
for idx_sample, n_sample in enumerate([500, 1000, 1500]):
    iterations_n_unique_tokens = []
    iterations_n_lemmas = []
    iterations_avg_unique_sentences = []

    for it in range(6):
        # Concatenate the first n_sample / 500 splits of this iteration.
        samples = [item for k in range(
            int(n_sample / 500)) for item in dataset["real"][it][k]]
        # Flatten the tokens once; both counters read the same list.
        tokens = [
            token for example in samples for token in example["tokenized_text"]]
        n_unique_tokens = count_unique_tokens(tokens)
        n_unique_lemmas = count_unique_lemmas(tokens)
        n_unique_sentences = count_unique_sentences(
            [example["text"] for example in samples])

        iterations_n_unique_tokens.append(n_unique_tokens)
        iterations_n_lemmas.append(n_unique_lemmas)
        iterations_avg_unique_sentences.append(n_unique_sentences)

    # Backslashes are escaped explicitly: "\m" is an invalid escape sequence
    # and triggers a warning on current Python versions.
    data_source_print = "\\multirow{3}{*}{\\textbf{Real Examples}}" if idx_sample == 0 else ""

    fs_condition_print = "\\multirow{3}{*}{-}" if idx_sample == 0 else ""

    # n_sample is passed as a string so that it gets a thousands separator
    # ("1,000"), matching the rows of the synthetic-data table.
    print(data_source_print, "&", fs_condition_print, "&", add_thousand_dots(str(n_sample)), "&",
          add_thousand_dots(round_number(np.mean(iterations_avg_unique_sentences), 1)), "&",
          add_thousand_dots(round_number(np.mean(iterations_n_unique_tokens), 1)), "&",
          add_thousand_dots(round_number(np.mean(iterations_n_lemmas), 1)), "\\\\")
print("\\hline")

\multirow{3}{*}{\textbf{Real Examples}} & \multirow{3}{*}{-} & 500 & 497.0 & 1,918.2 & 1,493.0 \\
 &  & 1000 & 989.7 & 3,061.0 & 2,349.2 \\
 &  & 1500 & 1,480.7 & 3,995.5 & 3,037.8 \\
\hline


In [14]:
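# NOTE(review): this whole cell is disabled (commented out); it is the real-data
# counterpart of the disabled synthetic-data cell in the previous section.
# for idx_sample, n_sample in enumerate([500, 1000, 1500]):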
#     iterations_avg_unique_words_in_k_words = []
#     iterations_avg_levenshtein_distance = []
#     iterations_avg_levenshtein_distance_norm = []
#     for it in range(6):
#         samples = [item for k in range(
#             int(n_sample / 500)) for item in dataset["real"][it][k]]

#         n_unique_words_in_k_words = get_avg_unique_words_in_k_words(
#             [token.text for example in samples for token in example["tokenized_text"]])
#         avg_levenshtein_distance = average_word_level_levenshtein_distance(
#             samples)
#         avg_levenshtein_distance_norm = average_word_level_levenshtein_distance(
#             samples, norm=True)

#         iterations_avg_unique_words_in_k_words.append(
#             n_unique_words_in_k_words)
#         iterations_avg_levenshtein_distance.append(
#             avg_levenshtein_distance)
#         iterations_avg_levenshtein_distance_norm.append(
#             avg_levenshtein_distance_norm)
        
#     data_source_print = "\multirow{3}{*}{\\textbf{Real Examples}}" if idx_sample == 0 else ""
#     fs_condition_print = "\multirow{3}{*}{-}" if idx_sample == 0 else ""

#     print(data_source_print, "&", fs_condition_print, "&", add_thousand_dots(n_sample), "&",
#           add_thousand_dots(
#               round_number(np.mean(iterations_avg_unique_words_in_k_words), 2)), "&",
#           add_thousand_dots(
#               round_number(np.mean(iterations_avg_levenshtein_distance), 2)), "&",
#           add_thousand_dots(round_number(np.mean(iterations_avg_levenshtein_distance_norm), 2)), "\\\\")
# print("\\hline")

### Aspect Term Analysis

#### Synthetic Data

In [15]:
def print_k_unique_at(unique_counts, max_count):
    """Format the mean of `unique_counts` for a table cell.

    Returns "-" as soon as any entry is None; otherwise the mean, rounded to
    one decimal and with thousands separators. `max_count` is accepted but
    not used.
    """
    if None not in unique_counts:
        return add_thousand_dots(round_number(np.mean(unique_counts), 1))
    return "-"


In [16]:
# Print the LaTeX rows of the aspect-term statistics for the synthetic data.
for idx_llm, llm in enumerate(LLMS):
    for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
        for idx_sample, n_sample in enumerate([500, 1000, 1500]):
            # One entry per iteration; averaged when the row is printed.
            tags_from_splits_count = []
            tags_from_splits_count_implicit = []
            tags_from_splits_count_explicit = []
            count_unique_aspect_terms_in_split = []
            count_unique_aspect_terms_in_100_aspect_terms = []
            count_unique_aspect_terms_in_200_aspect_terms = []
            count_unique_aspect_terms_in_500_aspect_terms = []
            count_unique_aspect_terms_in_1000_aspect_terms = []

            for it in range(6):
                # All tags of the first n_sample / 500 folds of this iteration.
                tags = [tag for k in range(
                    int(n_sample / 500)) for example in dataset["synth"][llm][few_shot_condition][it][k] for tag in example["tags"]]
                # Texts of the explicitly mentioned aspect terms.
                tags_explicit = [tag["text"]
                                 for tag in tags if tag["type"] == "label-explicit"]
                tags_from_splits_count.append(len(tags))
                # Same filter as tags_explicit, so its length is the count.
                tags_from_splits_count_explicit.append(len(tags_explicit))
                tags_from_splits_count_implicit.append(
                    len([tag for tag in tags if tag["type"] == "label-implicit"]))

                unique_tags = len(set(tags_explicit))

                # Calculate number of unique aspect terms in k aspect terms.
                # The k=100 and k=200 draws are unguarded and raise ValueError
                # when fewer explicit aspect terms are available.
                count_unique_aspect_terms_in_100_aspect_terms.append(
                    get_avg_unique_words_in_k_words(tags_explicit, n_selection=100))
                count_unique_aspect_terms_in_200_aspect_terms.append(
                    get_avg_unique_words_in_k_words(tags_explicit, n_selection=200))
                # None marks "not enough aspect terms"; the column then shows "-".
                if len(tags_explicit) >= 500:
                    count_unique_aspect_terms_in_500_aspect_terms.append(
                        get_avg_unique_words_in_k_words(tags_explicit, n_selection=500))
                else:
                    count_unique_aspect_terms_in_500_aspect_terms.append(None)

                if len(tags_explicit) >= 1000:
                    count_unique_aspect_terms_in_1000_aspect_terms.append(
                        get_avg_unique_words_in_k_words(tags_explicit, n_selection=1000))
                else:
                    count_unique_aspect_terms_in_1000_aspect_terms.append(None)

                count_unique_aspect_terms_in_split.append(unique_tags)

            # Backslashes are escaped explicitly: "\m" and "\c" are invalid
            # escape sequences and trigger a warning on current Python versions.
            llm_print = "\\multirow{6}{*}{" + \
                LLMS_ENCODED[llm] + \
                "}" if idx_sample == 0 and idx_fsc == 0 else ""
            fs_condition_print = "\\multirow{3}{*}{" + \
                ENCODE_CONDITION[few_shot_condition] + \
                "}" if idx_sample == 0 else ""

            print(llm_print, "&", fs_condition_print,
                  # n samples; passed as a string so 1000 prints as "1,000"
                  "&", add_thousand_dots(str(n_sample)),
                  "&", add_thousand_dots(
                      round_number(np.mean(tags_from_splits_count), 2)),  # n aspects
                  "&", add_thousand_dots(
                      round_number(np.mean(tags_from_splits_count_implicit) / np.mean(tags_from_splits_count) * 100, 1)) + " \\%",  # % implicit
                  "\\textit{(SD = " + add_thousand_dots(round_number(np.std([a / b * 100 for a, b in zip(
                      tags_from_splits_count_implicit, tags_from_splits_count)]), 2)) + ")}",
                  "&", add_thousand_dots(
                      round_number(np.mean(tags_from_splits_count_explicit), 2)),  # n aspects explicit
                  "&", add_thousand_dots(
                      round_number(np.mean(count_unique_aspect_terms_in_split), 2)),  # n unique
                  "&", add_thousand_dots(
                      round_number(np.mean(count_unique_aspect_terms_in_100_aspect_terms), 1)),
                  "&", add_thousand_dots(
                      round_number(np.mean(count_unique_aspect_terms_in_200_aspect_terms), 1)),
                  "&",
                  print_k_unique_at(
                      count_unique_aspect_terms_in_500_aspect_terms, 500),
                  "&",
                  print_k_unique_at(
                      count_unique_aspect_terms_in_1000_aspect_terms, 1000),
                  "\\\\")

        if idx_fsc == 0:
            print("\\arrayrulecolor{gray}\\cline{2-11}\\arrayrulecolor{black}")
        else:
            print("\\hline")

\multirow{6}{*}{\textbf{Llama-2-70B}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 797.00 & 28.5 \% \textit{(SD = 4.73)} & 570.17 & 107.67 & 34.7 & 54.0 & 98.9 & - \\
 &  & 1000 & 1,591.67 & 29.1 \% \textit{(SD = 4.93)} & 1,127.83 & 176.17 & 34.9 & 54.4 & 100.7 & 162.7 \\
 &  & 1500 & 2,388.00 & 29.3 \% \textit{(SD = 4.92)} & 1,687.67 & 229.33 & 34.5 & 53.5 & 98.7 & 159.8 \\
\arrayrulecolor{gray}\cline{2-11}\arrayrulecolor{black}
 & \multirow{3}{*}{\textbf{LRS\textsubscript{500}}} & 500 & 703.17 & 26.3 \% \textit{(SD = 1.94)} & 518.33 & 115.17 & 37.5 & 59.7 & - & - \\
 &  & 1000 & 1,406.00 & 26.2 \% \textit{(SD = 1.62)} & 1,037.33 & 193.00 & 38.6 & 61.7 & 116.1 & - \\
 &  & 1500 & 2,105.50 & 26.5 \% \textit{(SD = 1.05)} & 1,548.00 & 254.00 & 38.4 & 61.2 & 115.2 & 187.2 \\
\hline
\multirow{6}{*}{\textbf{GPT-3.5-turbo}} & \multirow{3}{*}{\textbf{LRS\textsubscript{25}}} & 500 & 797.00 & 36.4 \% \textit{(SD = 8.90)} & 506.50 & 40.50 & 18.6 & 25.8 & - & - \\
 &  & 1000 & 1,591.

#### Real Data

In [17]:
# Print the LaTeX rows of the aspect-term statistics for the real data.
for idx_sample, n_sample in enumerate([500, 1000, 1500]):
    # One entry per iteration; averaged when the row is printed.
    tags_from_splits_count = []
    tags_from_splits_count_implicit = []
    tags_from_splits_count_explicit = []
    count_unique_aspect_terms_in_split = []
    count_unique_aspect_terms_in_100_aspect_terms = []
    count_unique_aspect_terms_in_200_aspect_terms = []
    count_unique_aspect_terms_in_500_aspect_terms = []
    count_unique_aspect_terms_in_1000_aspect_terms = []

    for it in range(6):
        # All tags of the first n_sample / 500 splits of this iteration.
        tags = [tag for k in range(int(n_sample / 500))
                for example in dataset["real"][it][k] for tag in example["tags"]]
        # Texts of the explicitly mentioned aspect terms.
        tags_explicit = [tag["text"]
                         for tag in tags if tag["type"] == "label-explicit"]
        tags_from_splits_count.append(len(tags))
        # Same filter as tags_explicit, so its length is the count.
        tags_from_splits_count_explicit.append(len(tags_explicit))
        tags_from_splits_count_implicit.append(
            len([tag for tag in tags if tag["type"] == "label-implicit"]))

        unique_tags = len(set(tags_explicit))

        # Calculate number of unique aspect terms in k aspect terms.
        # The k=100 and k=200 draws are unguarded and raise ValueError
        # when fewer explicit aspect terms are available.
        count_unique_aspect_terms_in_100_aspect_terms.append(
            get_avg_unique_words_in_k_words(tags_explicit, n_selection=100))
        count_unique_aspect_terms_in_200_aspect_terms.append(
            get_avg_unique_words_in_k_words(tags_explicit, n_selection=200))
        # None marks "not enough aspect terms"; the column then shows "-".
        if len(tags_explicit) >= 500:
            count_unique_aspect_terms_in_500_aspect_terms.append(
                get_avg_unique_words_in_k_words(tags_explicit, n_selection=500))
        else:
            count_unique_aspect_terms_in_500_aspect_terms.append(None)

        if len(tags_explicit) >= 1000:
            count_unique_aspect_terms_in_1000_aspect_terms.append(
                get_avg_unique_words_in_k_words(tags_explicit, n_selection=1000))
        else:
            count_unique_aspect_terms_in_1000_aspect_terms.append(None)

        count_unique_aspect_terms_in_split.append(unique_tags)

    # Backslashes are escaped explicitly: "\m" is an invalid escape sequence
    # and triggers a warning on current Python versions.
    data_source_print = "\\multirow{3}{*}{\\textbf{Real Examples}}" if idx_sample == 0 else ""
    fs_condition_print = "\\multirow{3}{*}{-}" if idx_sample == 0 else ""

    print(data_source_print, "&", fs_condition_print,
          # n samples; passed as a string so 1000 prints as "1,000"
          "&", add_thousand_dots(str(n_sample)),
          "&", add_thousand_dots(
              round_number(np.mean(tags_from_splits_count), 2)),  # n aspects
          "&", add_thousand_dots(
              round_number(np.mean(tags_from_splits_count_implicit) / np.mean(tags_from_splits_count) * 100, 1)) + " \\%",  # % implicit
          "\\textit{(SD = " + add_thousand_dots(round_number(np.std([a / b * 100 for a, b in zip(
              tags_from_splits_count_implicit, tags_from_splits_count)]), 2)) + ")}",
          "&", add_thousand_dots(
              round_number(np.mean(tags_from_splits_count_explicit), 2)),  # n aspects explicit
          "&", add_thousand_dots(
              round_number(np.mean(count_unique_aspect_terms_in_split), 2)),  # n unique
          "&", add_thousand_dots(
              round_number(np.mean(count_unique_aspect_terms_in_100_aspect_terms), 1)),
          "&", add_thousand_dots(
              round_number(np.mean(count_unique_aspect_terms_in_200_aspect_terms), 1)),
          "&", print_k_unique_at(
              count_unique_aspect_terms_in_500_aspect_terms, 500),
          "&", print_k_unique_at(
              count_unique_aspect_terms_in_1000_aspect_terms, 1000), "\\\\")
print("\\hline")

\multirow{3}{*}{\textbf{Real Examples}} & \multirow{3}{*}{-} & 500 & 701.83 & 26.8 \% \textit{(SD = 1.30)} & 513.50 & 254.67 & 68.0 & 119.9 & - & - \\
 &  & 1000 & 1,403.67 & 26.8 \% \textit{(SD = 0.67)} & 1,027.00 & 437.00 & 68.0 & 119.8 & 249.0 & - \\
 &  & 1500 & 2,105.50 & 26.8 \% \textit{(SD = 0.19)} & 1,540.50 & 595.67 & 67.8 & 119.9 & 249.4 & 427.9 \\
\hline


### Aspect Term Analysis (With Aspect Category)

#### Synthetic Data

In [18]:
def print_k_unique_at(unique_counts, max_count):
    """Format the mean of per-iteration unique-term counts as a table cell.

    Args:
        unique_counts: Sequence with one entry per iteration. Callers append
            ``None`` for an iteration that had fewer aspect terms than the
            sub-sample size.
        max_count: Sub-sample size the counts refer to. It is not used in the
            computation and is only kept so existing call sites stay valid.

    Returns:
        ``"-"`` when ``unique_counts`` is empty or contains ``None`` (the mean
        would be undefined or not comparable across rows); otherwise the mean
        passed through ``round_number(..., 1)`` and ``add_thousand_dots``.
    """
    # np.mean([]) yields NaN plus a RuntimeWarning, so treat "no data" like
    # "incomplete data" and emit the placeholder instead.
    if len(unique_counts) == 0 or None in unique_counts:
        return "-"
    return add_thousand_dots(round_number(np.mean(unique_counts), 1))

In [20]:
# Emit the LaTeX table body for the synthetic data: one row per
# (LLM, few-shot condition, aspect category, sample size). Every row
# aggregates the 6 iterations stored under dataset["synth"].

# Sizes k of the random sub-samples used for the
# "unique terms among k explicit aspect terms" columns.
subsample_sizes = (10, 50, 100, 200, 300)

for idx_llm, llm in enumerate(LLMS):
    for idx_fsc, few_shot_condition in enumerate(FS_CONDITIONS):
        for idx_ac, aspect_category in enumerate(ASPECT_CATEGORIES):
            for idx_sample, n_sample in enumerate([500, 1000, 1500]):
                tags_from_splits_count = []
                tags_from_splits_count_implicit = []
                tags_from_splits_count_explicit = []
                count_unique_aspect_terms_in_split = []
                # k -> one entry per iteration; None when an iteration has
                # fewer than k explicit aspect terms.
                unique_in_k_aspect_terms = {k: [] for k in subsample_sizes}

                for it in range(6):
                    # Pool the first n_sample / 500 splits of this iteration
                    # (presumably 500 examples per split -- TODO confirm).
                    tags = [tag
                            for split_idx in range(int(n_sample / 500))
                            for example in dataset["synth"][llm][few_shot_condition][it][split_idx]
                            for tag in example["tags"]
                            if tag["label"] == aspect_category]
                    tags_explicit = [tag["text"] for tag in tags
                                     if tag["type"] == "label-explicit"]

                    tags_from_splits_count.append(len(tags))
                    tags_from_splits_count_explicit.append(len(tags_explicit))
                    tags_from_splits_count_implicit.append(
                        sum(tag["type"] == "label-implicit" for tag in tags))
                    count_unique_aspect_terms_in_split.append(
                        len(set(tags_explicit)))

                    # Keep the sizes in ascending order: the helper draws from
                    # the seeded `random` module, so the call order determines
                    # the reported numbers.
                    for k in subsample_sizes:
                        if len(tags_explicit) >= k:
                            unique_in_k_aspect_terms[k].append(
                                get_avg_unique_words_in_k_words(tags_explicit, n_selection=k))
                        else:
                            unique_in_k_aspect_terms[k].append(None)

                # \multirow labels are printed only on the first row of the
                # group they span; the spans 30 / 15 / 3 assume
                # 2 conditions x 5 categories x 3 sample sizes.
                # LLMS_ENCODED values already contain \textbf{...}, so the LLM
                # label ends up wrapped twice; kept to leave the output unchanged.
                llm_print = ("\\multirow{30}{*}{\\textbf{" + LLMS_ENCODED[llm] + "}}"
                             if idx_sample == 0 and idx_fsc == 0 and idx_ac == 0 else "")
                fs_condition_print = ("\\multirow{15}{*}{" + ENCODE_CONDITION[few_shot_condition] + "}"
                                      if idx_sample == 0 and idx_ac == 0 else "")
                ac_print = ("\\multirow{3}{*}{\\texttt{" + aspect_category + "}}"
                            if idx_sample == 0 else "")

                # Share of implicit aspects over all iterations, plus the
                # standard deviation of the per-iteration shares.
                pct_implicit = np.mean(tags_from_splits_count_implicit) / \
                    np.mean(tags_from_splits_count) * 100
                sd_pct_implicit = np.std([a / b * 100 for a, b in zip(
                    tags_from_splits_count_implicit, tags_from_splits_count)])
                implicit_cell = (add_thousand_dots(round_number(pct_implicit, 1)) + " \\% "
                                 + "\\textit{(SD = "
                                 + add_thousand_dots(round_number(sd_pct_implicit, 2)) + ")}")

                row_cells = [
                    llm_print,
                    fs_condition_print,
                    ac_print,
                    add_thousand_dots(n_sample),
                    add_thousand_dots(
                        round_number(np.mean(tags_from_splits_count), 2)),
                    implicit_cell,
                    add_thousand_dots(
                        round_number(np.mean(tags_from_splits_count_explicit), 2)),
                    add_thousand_dots(
                        round_number(np.mean(count_unique_aspect_terms_in_split), 2)),
                    *(print_k_unique_at(unique_in_k_aspect_terms[k], k)
                      for k in subsample_sizes),
                ]
                # Cells are joined with " & " and the row is closed with " \\".
                print(*row_cells, sep=" & ", end=" \\\\\n")

            # Separator after each aspect-category group: full rule after the
            # last category of the last condition, partial rule after the last
            # category of other conditions, gray rule between categories.
            if idx_fsc == 1 and idx_ac == 4:
                print("\\hline")
            elif idx_ac == 4:
                print("\\cline{2-13}")
            else:
                print(
                    "\\arrayrulecolor{gray}\\cline{3-13}\\arrayrulecolor{black}")

\multirow{30}{*}{\textbf{\textbf{Llama-2-70B}}} & \multirow{15}{*}{\textbf{LRS\textsubscript{25}}} & \multirow{3}{*}{\texttt{GENERAL-IMPRESSION}} & 500 & 159.67 & 45.9 \% \textit{(SD = 5.51)} & 86.33 & 33.83 & 7.4 & 23.4 & - & - & - \\
 &  &  & 1000 & 319.83 & 46.9 \% \textit{(SD = 5.62)} & 169.83 & 56.00 & 7.5 & 23.8 & 38.7 & - & - \\
 &  &  & 1500 & 479.00 & 45.5 \% \textit{(SD = 6.58)} & 261.17 & 77.00 & 7.5 & 23.6 & 38.5 & 63.7 & - \\
\arrayrulecolor{gray}\cline{3-13}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{FOOD}} & 500 & 158.67 & 22.0 \% \textit{(SD = 4.79)} & 123.83 & 34.33 & 5.4 & 17.8 & 29.4 & - & - \\
 &  &  & 1000 & 318.50 & 24.1 \% \textit{(SD = 5.98)} & 241.67 & 57.00 & 5.4 & 18.0 & 30.0 & 49.8 & - \\
 &  &  & 1500 & 478.00 & 25.0 \% \textit{(SD = 5.78)} & 358.67 & 72.00 & 5.4 & 17.5 & 29.0 & 47.8 & 63.6 \\
\arrayrulecolor{gray}\cline{3-13}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{SERVICE}} & 500 & 160.67 & 21.1 \% \textit{(SD = 7.31)} & 126.83 & 18.0

### Real Examples

In [22]:
# Emit the LaTeX table body for the real examples: one row per
# (aspect category, sample size). Every row aggregates the 6 iterations
# stored under dataset["real"].

# Sizes k of the random sub-samples used for the
# "unique terms among k explicit aspect terms" columns.
subsample_sizes = (10, 50, 100, 200, 300)

for idx_ac, aspect_category in enumerate(ASPECT_CATEGORIES):
    for idx_sample, n_sample in enumerate([500, 1000, 1500]):
        tags_from_splits_count = []
        tags_from_splits_count_implicit = []
        tags_from_splits_count_explicit = []
        count_unique_aspect_terms_in_split = []
        # k -> one entry per iteration; None when an iteration has fewer
        # than k explicit aspect terms.
        unique_in_k_aspect_terms = {k: [] for k in subsample_sizes}

        for it in range(6):
            # Pool the first n_sample / 500 splits of this iteration
            # (presumably 500 examples per split -- TODO confirm).
            tags = [tag
                    for split_idx in range(int(n_sample / 500))
                    for example in dataset["real"][it][split_idx]
                    for tag in example["tags"]
                    if tag["label"] == aspect_category]
            tags_explicit = [tag["text"] for tag in tags
                             if tag["type"] == "label-explicit"]

            tags_from_splits_count.append(len(tags))
            tags_from_splits_count_explicit.append(len(tags_explicit))
            tags_from_splits_count_implicit.append(
                sum(tag["type"] == "label-implicit" for tag in tags))
            count_unique_aspect_terms_in_split.append(len(set(tags_explicit)))

            # Keep the sizes in ascending order: the helper draws from the
            # seeded `random` module, so the call order determines the
            # reported numbers.
            for k in subsample_sizes:
                if len(tags_explicit) >= k:
                    unique_in_k_aspect_terms[k].append(
                        get_avg_unique_words_in_k_words(tags_explicit, n_selection=k))
                else:
                    unique_in_k_aspect_terms[k].append(None)

        # \multirow labels are printed only on the first row of the group they
        # span; the spans 15 / 3 assume 5 categories x 3 sample sizes.
        data_source_print = ("\\multirow{15}{*}{\\textbf{Real Examples}}"
                             if idx_sample == 0 and idx_ac == 0 else "")
        fs_condition_print = ("\\multirow{15}{*}{-}"
                              if idx_sample == 0 and idx_ac == 0 else "")
        ac_print = ("\\multirow{3}{*}{\\texttt{" + aspect_category + "}}"
                    if idx_sample == 0 else "")

        # Share of implicit aspects over all iterations, plus the standard
        # deviation of the per-iteration shares.
        pct_implicit = np.mean(tags_from_splits_count_implicit) / \
            np.mean(tags_from_splits_count) * 100
        sd_pct_implicit = np.std([a / b * 100 for a, b in zip(
            tags_from_splits_count_implicit, tags_from_splits_count)])
        implicit_cell = (add_thousand_dots(round_number(pct_implicit, 1)) + " \\% "
                         + "\\textit{(SD = "
                         + add_thousand_dots(round_number(sd_pct_implicit, 2)) + ")}")

        row_cells = [
            data_source_print,
            fs_condition_print,
            ac_print,
            add_thousand_dots(n_sample),
            add_thousand_dots(
                round_number(np.mean(tags_from_splits_count), 2)),
            implicit_cell,
            add_thousand_dots(
                round_number(np.mean(tags_from_splits_count_explicit), 2)),
            add_thousand_dots(
                round_number(np.mean(count_unique_aspect_terms_in_split), 2)),
            *(print_k_unique_at(unique_in_k_aspect_terms[k], k)
              for k in subsample_sizes),
        ]
        # Cells are joined with " & " and the row is closed with " \\".
        print(*row_cells, sep=" & ", end=" \\\\\n")

    # Gray partial rule after every category group. Note that the last group
    # therefore gets this rule immediately followed by the closing \hline.
    print("\\arrayrulecolor{gray}\\cline{3-13}\\arrayrulecolor{black}")
print("\\hline")

\multirow{15}{*}{\textbf{Real Examples}} & \multirow{15}{*}{-} & \multirow{3}{*}{\texttt{GENERAL-IMPRESSION}} & 500 & 126.17 & 78.5 \% \textit{(SD = 2.65)} & 27.17 & 13.83 & 6.9 & - & - & - & - \\
 &  &  & 1000 & 252.33 & 78.5 \% \textit{(SD = 1.61)} & 54.33 & 23.00 & 6.9 & 21.6 & - & - & - \\
 &  &  & 1500 & 378.50 & 78.5 \% \textit{(SD = 0.89)} & 81.50 & 31.33 & 6.9 & 21.7 & - & - & - \\
\arrayrulecolor{gray}\cline{3-13}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{FOOD}} & 500 & 279.33 & 11.2 \% \textit{(SD = 1.00)} & 248.17 & 142.50 & 8.7 & 36.6 & 66.8 & 119.1 & - \\
 &  &  & 1000 & 558.67 & 11.2 \% \textit{(SD = 0.79)} & 496.33 & 246.83 & 8.7 & 36.5 & 66.6 & 118.5 & 164.7 \\
 &  &  & 1500 & 838.00 & 11.2 \% \textit{(SD = 0.58)} & 744.50 & 338.83 & 8.6 & 36.5 & 66.6 & 118.4 & 164.5 \\
\arrayrulecolor{gray}\cline{3-13}\arrayrulecolor{black}
 &  & \multirow{3}{*}{\texttt{SERVICE}} & 500 & 175.50 & 22.3 \% \textit{(SD = 2.51)} & 136.33 & 51.83 & 7.1 & 23.6 & 40.7 & - & - \\
 & 