# Notebook: Analyse Language


## Packages


In [1]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from spacy.lang.de.stop_words import STOP_WORDS
from collections import Counter
import Levenshtein
import numpy as np
import random
import string
import spacy
import nltk
import json

## Constants


In [2]:
# Aspect categories, in the order used by the per-category report loops.
ASPECT_CATEGORIES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]
# Keys of the synthetic-data generators and of the few-shot conditions.
LLMS = ["GPT-3", "Llama70B"]
FS_CONDITIONS = ["fixed", "random"]
# Human-readable label for each few-shot condition.
PROMPTING_ENCODING = {
    condition: "25 " + condition + " examples" for condition in FS_CONDITIONS
}
# Number of stratified folds each synthetic split is divided into.
N_FOLDS = 3
# Tag field that is one-hot encoded for the stratified splitting.
CRITERIA_RS = "tag_with_polarity"
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
MENTIONING_TYPE = ["implicit", "explicit"]
# Every "<ASPECT>-<POLARITY>" label. The aspect order here intentionally stays
# as written (it differs from ASPECT_CATEGORIES) because it fixes the column
# order of the one-hot vectors built by get_one_hot.
COMBINATIONS = [
    aspect + "-" + polarity
    for aspect in ["SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE"]
    for polarity in POLARITIES
]
# Seed shared by Python's RNG and the stratified splitter.
RANDOM_STATE = 43

In [34]:
# Display names printed in the LaTeX tables for each key in LLMS.
LLMS_ENCODED = {
    "GPT-3": "GPT-3.5-turbo",
    "Llama70B": "Llama-2-70B",
}

## Settings


In [4]:
# Load the large German spaCy pipeline; it is applied to every example text
# when the datasets are loaded.
nlp = spacy.load("de_core_news_lg")
# Download NLTK's "punkt" tokenizer data.
# NOTE(review): no other use of nltk is visible in this notebook - confirm
# whether this download is still needed.
nltk.download('punkt')
# Seed Python's global RNG so the random.sample-based statistics are repeatable.
random.seed(RANDOM_STATE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Code


### Helper


In [35]:
def count_unique_sentences(sentences):
    """Return how many distinct items ``sentences`` contains.

    Items are compared by equality/hash, so two identical strings count once.
    """
    distinct = {sentence for sentence in sentences}
    return len(distinct)


def count_unique_tokens(tokens):
    """Return the number of distinct surface forms among ``tokens``.

    Each element must expose a ``text`` attribute; the comparison is
    case-sensitive because the raw ``text`` values are compared as-is.
    """
    surface_forms = set()
    for token in tokens:
        surface_forms.add(token.text)
    return len(surface_forms)


def count_unique_lemmas(tokens):
    """Return the number of distinct lemmas among ``tokens``.

    Each element must expose a ``lemma_`` attribute, which is what is compared.
    """
    lemmas = set()
    for token in tokens:
        lemmas.add(token.lemma_)
    return len(lemmas)


def get_avg_unique_words_in_k_words(tokens, n_selection=100, n_repetitions=10000):
    """Estimate the mean number of distinct items in a random draw of ``n_selection`` items.

    ``n_repetitions`` times, ``n_selection`` positions of ``tokens`` are drawn
    without replacement using Python's global ``random`` module (seed it for
    reproducible results) and the distinct items at those positions are counted.

    Args:
        tokens: Indexable, sized collection of hashable items (e.g. strings).
        n_selection: Number of items drawn per repetition.
        n_repetitions: Number of random draws to average over.

    Returns:
        The mean of the per-draw distinct-item counts, as computed by ``np.mean``.

    Raises:
        ValueError: If ``n_selection`` is negative or larger than ``len(tokens)``,
            because that many items cannot be drawn without replacement.
    """
    n_available = len(tokens)
    # Fail with an actionable message instead of random.sample's generic one.
    if not 0 <= n_selection <= n_available:
        raise ValueError(
            f"Cannot draw {n_selection} items without replacement from "
            f"{n_available} available items.")

    unique_counts = []
    for _ in range(n_repetitions):
        # Sample positions (not items) so any indexable collection is accepted.
        drawn_indices = random.sample(range(n_available), n_selection)
        unique_counts.append(len({tokens[index] for index in drawn_indices}))
    return np.mean(unique_counts)


def average_word_level_levenshtein_distance(docs, norm=False):
    """Average the Levenshtein distance over all unordered pairs of documents.

    The distance is computed on each document's list of token texts, so the
    compared units are whole tokens rather than characters.

    Args:
        docs: Sequence of dicts whose ``"tokenized_text"`` entry is an iterable
            of objects exposing a ``text`` attribute.
        norm: If True, each pair's distance is divided by the token count of
            the longer of the two documents.

    Returns:
        The mean pairwise distance, or 0 when there are fewer than two documents.
    """
    tokenized_texts = [
        [token.text for token in doc["tokenized_text"]] for doc in docs]

    total_distance = 0
    pair_count = 0

    for i in range(len(tokenized_texts)):
        tokens1 = tokenized_texts[i]
        for j in range(i + 1, len(tokenized_texts)):
            tokens2 = tokenized_texts[j]
            max_tokens = max(len(tokens1), len(tokens2))

            if max_tokens == 0:
                # Two empty documents are identical; this also avoids dividing
                # by zero when norm=True.
                distance = 0
            else:
                distance = Levenshtein.distance(tokens1, tokens2)
                if norm:
                    distance = distance / max_tokens
            total_distance += distance
            pair_count += 1

    return total_distance / pair_count if pair_count > 0 else 0

In [36]:
def add_thousand_dots(n_sample):
    """Format a number as a string with thousands separators.

    Despite the function name, the separator is a comma (e.g. 1000 -> "1,000").
    """
    return format(n_sample, ",")

### Load Datasets


In [12]:
def _load_tokenized_split(path):
    """Read one JSON split file and attach a spaCy parse to every example.

    Args:
        path: Path of a JSON file containing a list of example dicts, each
            with a "text" entry.

    Returns:
        The loaded list; every example gains a "tokenized_text" entry holding
        the result of running ``nlp`` on its "text".
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        split_data = json.load(json_file)
    for example in split_data:
        example["tokenized_text"] = nlp(example["text"])
    return split_data


dataset = {"synth": {}, "real": []}

# Load Synth: five split files per LLM and few-shot condition.
for llm in LLMS:
    dataset["synth"][llm] = {}
    for prompting in FS_CONDITIONS:
        dataset["synth"][llm][prompting] = [
            _load_tokenized_split(
                f"../07 train models/synth/{llm}/{prompting}/split_{split}.json")
            for split in range(5)
        ]

# Load Real: six split files.
dataset["real"] = [
    _load_tokenized_split(f"../07 train models/real/split_{split}.json")
    for split in range(6)
]

In [13]:
def get_one_hot(subset):
    """Build a multi-hot label vector for every example in ``subset``.

    For each example, the values stored under ``CRITERIA_RS`` in its "tags" are
    collected; the resulting vector has one entry per element of
    ``COMBINATIONS`` (same order) that is 1 if that label occurs in the example
    and 0 otherwise.

    Returns:
        A list with one NumPy array per example.
    """
    encoded_rows = []
    for example in subset:
        present_labels = {tag[CRITERIA_RS] for tag in example["tags"]}
        encoded_rows.append(np.array(
            [int(combination in present_labels) for combination in COMBINATIONS]))
    return encoded_rows

In [14]:
# Replace every synthetic split by N_FOLDS label-stratified folds of exactly
# 500 examples each. NOTE: this overwrites dataset["synth"] in place, so the
# cell must only be run once after loading.
fold_size = 500
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for iteration in range(5):
            if few_shot_condition == "random":
                subset = dataset["synth"][llm][few_shot_condition][iteration]
            else:
                # NOTE(review): the first 475 examples of "fixed" splits are
                # dropped - presumably to align their size with the "random"
                # splits; confirm against the data generation step.
                subset = dataset["synth"][llm][few_shot_condition][iteration][475:]

            # The test folds of a k-fold split partition the subset, so equal
            # folds of `fold_size` are only possible for this exact size.
            # Without this check the retry loop below would never terminate.
            if len(subset) != N_FOLDS * fold_size:
                raise ValueError(
                    f"Expected {N_FOLDS * fold_size} examples for {llm}/"
                    f"{few_shot_condition}/split {iteration}, got {len(subset)}.")

            # The labels do not change between retries, so encode them once.
            labels_one_hot = get_one_hot(subset)

            found_equal_folds = False
            restart_idx = 0
            while not found_equal_folds:
                # Retry with the next seed until every fold has `fold_size` items.
                mskf = MultilabelStratifiedKFold(
                    n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE+restart_idx)
                section = [
                    [subset[i] for i in test_index]
                    for _, test_index in mskf.split(subset, labels_one_hot)
                ]
                found_equal_folds = all(
                    len(fold) == fold_size for fold in section)
                restart_idx += 1

            dataset["synth"][llm][few_shot_condition][iteration] = section

In [15]:
# Build six rotations of the real data: rotation i consists of the three
# consecutive splits i, i+1, i+2, wrapping around after the sixth split.
# NOTE: dataset["real"] is overwritten, so this cell must only be run once.
real_examples = [
    [dataset["real"][(start + offset) % 6] for offset in range(3)]
    for start in range(6)
]
dataset["real"] = real_examples

### Document Analysis


#### Synthetic Data


In [45]:
# Manual check of the number formatter on a float input.
add_thousand_dots(2300.3)

'2,300.3'

In [37]:
# Document-level diversity of the synthetic data, printed as LaTeX table rows:
# unique sentences, unique tokens and unique lemmas, averaged over 5 iterations.
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for n_sample in [500, 1000, 1500]:
            # Each stored fold holds 500 examples, so n_sample maps to 1-3 folds.
            n_folds_used = int(n_sample / 500)
            sentence_counts = []
            token_counts = []
            lemma_counts = []
            for it in range(5):
                folds = dataset["synth"][llm][few_shot_condition][it]
                samples = [item for k in range(n_folds_used)
                           for item in folds[k]]
                all_tokens = [token for example in samples
                              for token in example["tokenized_text"]]

                token_counts.append(count_unique_tokens(all_tokens))
                lemma_counts.append(count_unique_lemmas(all_tokens))
                sentence_counts.append(count_unique_sentences(
                    [example["text"] for example in samples]))

            row = [
                "\\textbf{" + LLMS_ENCODED[llm] + "}",
                few_shot_condition,
                add_thousand_dots(n_sample),
                add_thousand_dots(round(np.mean(sentence_counts), 2)),
                add_thousand_dots(round(np.mean(token_counts), 2)),
                add_thousand_dots(round(np.mean(lemma_counts), 2)),
            ]
            print(" & ".join(row), "\\\\")
        print("\\hline")

\textbf{GPT-3.5-turbo} & fixed & 500 & 307.6 & 289.6 & 208.2 \\
\textbf{GPT-3.5-turbo} & fixed & 1,000 & 549.0 & 369.6 & 264.4 \\
\textbf{GPT-3.5-turbo} & fixed & 1,500 & 769.8 & 428.4 & 307.0 \\
\hline
\textbf{GPT-3.5-turbo} & random & 500 & 317.0 & 295.4 & 217.0 \\
\textbf{GPT-3.5-turbo} & random & 1,000 & 561.2 & 389.4 & 281.2 \\
\textbf{GPT-3.5-turbo} & random & 1,500 & 782.2 & 456.6 & 328.4 \\
\hline
\textbf{Llama-2-70B} & fixed & 500 & 480.0 & 694.4 & 536.6 \\
\textbf{Llama-2-70B} & fixed & 1,000 & 934.0 & 1023.8 & 787.0 \\
\textbf{Llama-2-70B} & fixed & 1,500 & 1383.2 & 1269.2 & 973.0 \\
\hline
\textbf{Llama-2-70B} & random & 500 & 485.4 & 751.8 & 580.2 \\
\textbf{Llama-2-70B} & random & 1,000 & 949.4 & 1103.0 & 846.8 \\
\textbf{Llama-2-70B} & random & 1,500 & 1400.0 & 1380.4 & 1054.0 \\
\hline


In [38]:
# Lexical diversity of the synthetic data, printed as LaTeX table rows:
# mean unique words per random draw of 100 tokens (helper default) and the
# raw / length-normalised pairwise word-level Levenshtein distance,
# averaged over 5 iterations.
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for n_sample in [500, 1000, 1500]:
            # Each stored fold holds 500 examples, so n_sample maps to 1-3 folds.
            n_folds_used = int(n_sample / 500)
            unique_words_per_draw = []
            levenshtein_raw = []
            levenshtein_norm = []
            for it in range(5):
                folds = dataset["synth"][llm][few_shot_condition][it]
                samples = [item for k in range(n_folds_used)
                           for item in folds[k]]
                token_texts = [token.text for example in samples
                               for token in example["tokenized_text"]]

                unique_words_per_draw.append(
                    get_avg_unique_words_in_k_words(token_texts))
                levenshtein_raw.append(
                    average_word_level_levenshtein_distance(samples))
                levenshtein_norm.append(
                    average_word_level_levenshtein_distance(samples, norm=True))

            row = [
                "\\textbf{" + LLMS_ENCODED[llm] + "}",
                few_shot_condition,
                add_thousand_dots(n_sample),
                add_thousand_dots(round(np.mean(unique_words_per_draw), 2)),
                add_thousand_dots(round(np.mean(levenshtein_raw), 2)),
                add_thousand_dots(round(np.mean(levenshtein_norm), 2)),
            ]
            # Terminate the row with "\\" like every other table in this
            # notebook; without it the printed rows are not valid LaTeX.
            print(" & ".join(row), "\\\\")
        print("\\hline")

\textbf{GPT-3.5-turbo} & fixed & 500 & 47.86 & 9.79 & 0.79
\textbf{GPT-3.5-turbo} & fixed & 1,000 & 47.84 & 9.79 & 0.79
\textbf{GPT-3.5-turbo} & fixed & 1,500 & 48.0 & 9.8 & 0.79
\hline
\textbf{GPT-3.5-turbo} & random & 500 & 48.09 & 8.81 & 0.77
\textbf{GPT-3.5-turbo} & random & 1,000 & 48.29 & 8.84 & 0.77
\textbf{GPT-3.5-turbo} & random & 1,500 & 48.26 & 8.85 & 0.77
\hline
\textbf{Llama-2-70B} & fixed & 500 & 59.16 & 11.08 & 0.85
\textbf{Llama-2-70B} & fixed & 1,000 & 59.71 & 11.09 & 0.85
\textbf{Llama-2-70B} & fixed & 1,500 & 59.62 & 11.06 & 0.85
\hline
\textbf{Llama-2-70B} & random & 500 & 61.61 & 11.06 & 0.86
\textbf{Llama-2-70B} & random & 1,000 & 61.36 & 11.04 & 0.86
\textbf{Llama-2-70B} & random & 1,500 & 61.22 & 10.98 & 0.86
\hline


#### Real Data


In [39]:
# Document-level diversity of the real data, printed as LaTeX table rows:
# unique sentences, unique tokens and unique lemmas.
# NOTE(review): only the first 5 of the 6 real-data rotations are used -
# presumably to match the 5 synthetic iterations; confirm.
for n_sample in [500, 1000, 1500]:
    # n_sample / 500 selects how many of a rotation's three splits are used.
    n_splits_used = int(n_sample / 500)
    sentence_counts = []
    token_counts = []
    lemma_counts = []

    for it in range(5):
        rotation = dataset["real"][it]
        samples = [item for k in range(n_splits_used) for item in rotation[k]]
        all_tokens = [token for example in samples
                      for token in example["tokenized_text"]]

        token_counts.append(count_unique_tokens(all_tokens))
        lemma_counts.append(count_unique_lemmas(all_tokens))
        sentence_counts.append(count_unique_sentences(
            [example["text"] for example in samples]))

    row = [
        "\\textbf{Real Examples}",
        "-",
        add_thousand_dots(n_sample),
        add_thousand_dots(round(np.mean(sentence_counts), 2)),
        add_thousand_dots(round(np.mean(token_counts), 2)),
        add_thousand_dots(round(np.mean(lemma_counts), 2)),
    ]
    print(" & ".join(row), "\\\\")
print("\\hline")

\textbf{Real Examples} & - & 500 & 496.4 & 1914.8 & 1492.8 \\
\textbf{Real Examples} & - & 1,000 & 988.8 & 3064.2 & 2352.4 \\
\textbf{Real Examples} & - & 1,500 & 1480.8 & 3998.6 & 3041.0 \\
\hline


In [40]:
# Lexical diversity of the real data, printed as LaTeX table rows:
# mean unique words per random draw of 100 tokens (helper default) and the
# raw / length-normalised pairwise word-level Levenshtein distance.
# NOTE(review): only the first 5 of the 6 real-data rotations are used -
# presumably to match the 5 synthetic iterations; confirm.
for n_sample in [500, 1000, 1500]:
    # n_sample / 500 selects how many of a rotation's three splits are used.
    n_splits_used = int(n_sample / 500)
    unique_words_per_draw = []
    levenshtein_raw = []
    levenshtein_norm = []
    for it in range(5):
        rotation = dataset["real"][it]
        samples = [item for k in range(n_splits_used) for item in rotation[k]]
        token_texts = [token.text for example in samples
                       for token in example["tokenized_text"]]

        unique_words_per_draw.append(
            get_avg_unique_words_in_k_words(token_texts))
        levenshtein_raw.append(
            average_word_level_levenshtein_distance(samples))
        levenshtein_norm.append(
            average_word_level_levenshtein_distance(samples, norm=True))

    row = [
        "\\textbf{Real Examples}",
        "-",
        add_thousand_dots(n_sample),
        add_thousand_dots(round(np.mean(unique_words_per_draw), 2)),
        add_thousand_dots(round(np.mean(levenshtein_raw), 2)),
        add_thousand_dots(round(np.mean(levenshtein_norm), 2)),
    ]
    print(" & ".join(row), "\\\\")
print("\\hline")

\textbf{Real Examples} & - & 500 & 78.21 & 16.37 & 0.93 \\
\textbf{Real Examples} & - & 1,000 & 78.28 & 16.42 & 0.93 \\
\textbf{Real Examples} & - & 1,500 & 78.32 & 16.39 & 0.93 \\
\hline


### Aspect Term Analysis

#### Synthetic Data

In [41]:
# Aspect-term statistics of the synthetic data, printed as LaTeX table rows:
# number of tags (all / implicit / explicit), unique explicit aspect terms,
# and mean unique terms per random draw of 100 explicit terms (helper default).
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for n_sample in [500, 1000, 1500]:
            # Each stored fold holds 500 examples, so n_sample maps to 1-3 folds.
            n_folds_used = int(n_sample / 500)
            total_counts = []
            implicit_counts = []
            explicit_counts = []
            unique_term_counts = []
            unique_terms_per_draw = []
            for it in range(5):
                folds = dataset["synth"][llm][few_shot_condition][it]
                tags = [tag for k in range(n_folds_used)
                        for example in folds[k]
                        for tag in example["tags"]]
                explicit_terms = [tag["text"] for tag in tags
                                  if tag["type"] == "label-explicit"]

                total_counts.append(len(tags))
                explicit_counts.append(len(explicit_terms))
                implicit_counts.append(
                    sum(1 for tag in tags if tag["type"] == "label-implicit"))

                # Average number of distinct terms in a random draw of terms.
                unique_terms_per_draw.append(
                    get_avg_unique_words_in_k_words(explicit_terms))
                unique_term_counts.append(len(set(explicit_terms)))

            row = [
                "\\textbf{" + LLMS_ENCODED[llm] + "}",
                few_shot_condition,
                add_thousand_dots(n_sample),
                add_thousand_dots(round(np.mean(total_counts), 2)),
                add_thousand_dots(round(np.mean(implicit_counts), 2)),
                add_thousand_dots(round(np.mean(explicit_counts), 2)),
                add_thousand_dots(round(np.mean(unique_term_counts), 2)),
                add_thousand_dots(round(np.mean(unique_terms_per_draw), 2)),
            ]
            print(" & ".join(row), "\\\\")
        print("\\hline")

\textbf{GPT-3.5-turbo} & fixed & 500 & 787.6 & 271.4 & 516.2 & 38.8 & 17.92 \\
\textbf{GPT-3.5-turbo} & fixed & 1,000 & 1576.0 & 543.6 & 1032.4 & 50.6 & 17.56 \\
\textbf{GPT-3.5-turbo} & fixed & 1,500 & 2365.4 & 815.0 & 1550.4 & 61.8 & 17.46 \\
\hline
\textbf{GPT-3.5-turbo} & random & 500 & 694.4 & 304.2 & 390.2 & 33.0 & 18.13 \\
\textbf{GPT-3.5-turbo} & random & 1,000 & 1387.8 & 606.8 & 781.0 & 43.8 & 17.56 \\
\textbf{GPT-3.5-turbo} & random & 1,500 & 2082.8 & 902.6 & 1180.2 & 54.2 & 17.6 \\
\hline
\textbf{Llama-2-70B} & fixed & 500 & 787.2 & 208.4 & 578.8 & 104.8 & 34.12 \\
\textbf{Llama-2-70B} & fixed & 1,000 & 1574.2 & 424.8 & 1149.4 & 170.0 & 33.92 \\
\textbf{Llama-2-70B} & fixed & 1,500 & 2360.0 & 639.6 & 1720.4 & 220.0 & 33.41 \\
\hline
\textbf{Llama-2-70B} & random & 500 & 695.2 & 178.0 & 517.2 & 116.8 & 37.99 \\
\textbf{Llama-2-70B} & random & 1,000 & 1390.8 & 353.4 & 1037.4 & 194.4 & 39.0 \\
\textbf{Llama-2-70B} & random & 1,500 & 2088.2 & 538.2 & 1550.0 & 256.6 & 38.88 \\
\hline

#### Real Data

In [42]:
# Aspect-term statistics of the real data, printed as LaTeX table rows:
# number of tags (all / implicit / explicit), unique explicit aspect terms,
# and mean unique terms per random draw of 100 explicit terms (helper default).
# NOTE(review): only the first 5 of the 6 real-data rotations are used -
# presumably to match the 5 synthetic iterations; confirm.
for n_sample in [500, 1000, 1500]:
    # n_sample / 500 selects how many of a rotation's three splits are used.
    n_splits_used = int(n_sample / 500)
    total_counts = []
    implicit_counts = []
    explicit_counts = []
    unique_term_counts = []
    unique_terms_per_draw = []
    for it in range(5):
        rotation = dataset["real"][it]
        tags = [tag for k in range(n_splits_used)
                for example in rotation[k]
                for tag in example["tags"]]
        explicit_terms = [tag["text"] for tag in tags
                          if tag["type"] == "label-explicit"]

        total_counts.append(len(tags))
        explicit_counts.append(len(explicit_terms))
        implicit_counts.append(
            sum(1 for tag in tags if tag["type"] == "label-implicit"))

        # Average number of distinct terms in a random draw of terms.
        unique_terms_per_draw.append(
            get_avg_unique_words_in_k_words(explicit_terms))
        unique_term_counts.append(len(set(explicit_terms)))

    row = [
        "\\textbf{Real Examples}",
        "-",
        add_thousand_dots(n_sample),
        add_thousand_dots(round(np.mean(total_counts), 2)),
        add_thousand_dots(round(np.mean(implicit_counts), 2)),
        add_thousand_dots(round(np.mean(explicit_counts), 2)),
        add_thousand_dots(round(np.mean(unique_term_counts), 2)),
        add_thousand_dots(round(np.mean(unique_terms_per_draw), 2)),
    ]
    print(" & ".join(row), "\\\\")

\textbf{Real Examples} & - & 500 & 703.2 & 186.0 & 517.2 & 256.4 & 68.24 \\
\textbf{Real Examples} & - & 1,000 & 1403.8 & 374.0 & 1029.8 & 438.4 & 68.29 \\
\textbf{Real Examples} & - & 1,500 & 2107.2 & 564.8 & 1542.4 & 595.6 & 68.16 \\


### Aspect Term Analysis (With Aspect Category)

#### Synthetic Examples

In [43]:
# Aspect-term statistics of the synthetic data per aspect category, printed as
# LaTeX table rows: number of tags (all / implicit / explicit), unique explicit
# aspect terms, and mean unique terms per random draw of 10 explicit terms.
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for aspect_category in ASPECT_CATEGORIES:
            for n_sample in [500, 1000, 1500]:
                # Each stored fold holds 500 examples, so n_sample maps to
                # 1-3 folds.
                n_folds_used = int(n_sample / 500)
                total_counts = []
                implicit_counts = []
                explicit_counts = []
                unique_term_counts = []
                unique_terms_per_draw = []
                for it in range(5):
                    folds = dataset["synth"][llm][few_shot_condition][it]
                    # Keep only the tags labelled with the current category.
                    tags = [tag for k in range(n_folds_used)
                            for example in folds[k]
                            for tag in example["tags"]
                            if tag["label"] == aspect_category]
                    explicit_terms = [tag["text"] for tag in tags
                                      if tag["type"] == "label-explicit"]

                    total_counts.append(len(tags))
                    explicit_counts.append(len(explicit_terms))
                    implicit_counts.append(
                        sum(1 for tag in tags if tag["type"] == "label-implicit"))

                    # Average number of distinct terms in a random draw of
                    # 10 aspect terms.
                    unique_terms_per_draw.append(
                        get_avg_unique_words_in_k_words(explicit_terms, n_selection=10))
                    unique_term_counts.append(len(set(explicit_terms)))

                row = [
                    "\\textbf{" + LLMS_ENCODED[llm] + "}",
                    few_shot_condition,
                    aspect_category,
                    add_thousand_dots(n_sample),
                    add_thousand_dots(round(np.mean(total_counts), 2)),
                    add_thousand_dots(round(np.mean(implicit_counts), 2)),
                    add_thousand_dots(round(np.mean(explicit_counts), 2)),
                    add_thousand_dots(round(np.mean(unique_term_counts), 2)),
                    add_thousand_dots(round(np.mean(unique_terms_per_draw), 2)),
                ]
                print(" & ".join(row), "\\\\")
        print("\\hline")

\textbf{GPT-3.5-turbo} & fixed & GENERAL-IMPRESSION & 500 & 155.4 & 111.4 & 44.0 & 12.0 & 5.16 \\
\textbf{GPT-3.5-turbo} & fixed & GENERAL-IMPRESSION & 1,000 & 311.6 & 225.4 & 86.2 & 17.2 & 5.17 \\
\textbf{GPT-3.5-turbo} & fixed & GENERAL-IMPRESSION & 1,500 & 467.6 & 341.0 & 126.6 & 20.4 & 5.21 \\
\textbf{GPT-3.5-turbo} & fixed & FOOD & 500 & 158.2 & 38.4 & 119.8 & 12.2 & 2.61 \\
\textbf{GPT-3.5-turbo} & fixed & FOOD & 1,000 & 317.0 & 76.0 & 241.0 & 16.2 & 2.46 \\
\textbf{GPT-3.5-turbo} & fixed & FOOD & 1,500 & 476.2 & 110.6 & 365.6 & 20.4 & 2.45 \\
\textbf{GPT-3.5-turbo} & fixed & SERVICE & 500 & 158.8 & 32.0 & 126.8 & 4.6 & 2.01 \\
\textbf{GPT-3.5-turbo} & fixed & SERVICE & 1,000 & 316.8 & 61.8 & 255.0 & 6.2 & 2.2 \\
\textbf{GPT-3.5-turbo} & fixed & SERVICE & 1,500 & 474.8 & 90.0 & 384.8 & 7.8 & 2.18 \\
\textbf{GPT-3.5-turbo} & fixed & AMBIENCE & 500 & 158.6 & 35.2 & 123.4 & 10.2 & 3.48 \\
\textbf{GPT-3.5-turbo} & fixed & AMBIENCE & 1,000 & 315.6 & 72.4 & 243.2 & 12.8 & 3.35 \\
\text

#### Real Examples

In [44]:
# Aspect-term statistics of the real data per aspect category, printed as
# LaTeX table rows: number of tags (all / implicit / explicit), unique explicit
# aspect terms, and mean unique terms per random draw of 10 explicit terms.
# NOTE(review): only the first 5 of the 6 real-data rotations are used -
# presumably to match the 5 synthetic iterations; confirm.
for aspect_category in ASPECT_CATEGORIES:
    for n_sample in [500, 1000, 1500]:
        # n_sample / 500 selects how many of a rotation's three splits are used.
        n_splits_used = int(n_sample / 500)
        total_counts = []
        implicit_counts = []
        explicit_counts = []
        unique_term_counts = []
        unique_terms_per_draw = []
        for it in range(5):
            rotation = dataset["real"][it]
            # Keep only the tags labelled with the current category.
            tags = [tag for k in range(n_splits_used)
                    for example in rotation[k]
                    for tag in example["tags"]
                    if tag["label"] == aspect_category]
            explicit_terms = [tag["text"] for tag in tags
                              if tag["type"] == "label-explicit"]

            total_counts.append(len(tags))
            explicit_counts.append(len(explicit_terms))
            implicit_counts.append(
                sum(1 for tag in tags if tag["type"] == "label-implicit"))

            # Average number of distinct terms in a random draw of 10 aspect terms.
            unique_terms_per_draw.append(
                get_avg_unique_words_in_k_words(explicit_terms, n_selection=10))
            unique_term_counts.append(len(set(explicit_terms)))

        row = [
            "\\textbf{Real Examples}",
            aspect_category,
            add_thousand_dots(n_sample),
            add_thousand_dots(round(np.mean(total_counts), 2)),
            add_thousand_dots(round(np.mean(implicit_counts), 2)),
            add_thousand_dots(round(np.mean(explicit_counts), 2)),
            add_thousand_dots(round(np.mean(unique_term_counts), 2)),
            add_thousand_dots(round(np.mean(unique_terms_per_draw), 2)),
        ]
        print(" & ".join(row), "\\\\")

\textbf{Real Examples} & GENERAL-IMPRESSION & 500 & 124.6 & 96.8 & 27.8 & 14.0 & 6.91 \\
\textbf{Real Examples} & GENERAL-IMPRESSION & 1,000 & 249.2 & 194.0 & 55.2 & 24.0 & 7.07 \\
\textbf{Real Examples} & GENERAL-IMPRESSION & 1,500 & 376.8 & 294.8 & 82.0 & 32.4 & 7.04 \\
\textbf{Real Examples} & FOOD & 500 & 281.2 & 30.8 & 250.4 & 144.8 & 8.71 \\
\textbf{Real Examples} & FOOD & 1,000 & 561.2 & 62.6 & 498.6 & 249.6 & 8.69 \\
\textbf{Real Examples} & FOOD & 1,500 & 839.4 & 94.6 & 744.8 & 339.8 & 8.69 \\
\textbf{Real Examples} & SERVICE & 500 & 174.8 & 39.6 & 135.2 & 51.4 & 7.17 \\
\textbf{Real Examples} & SERVICE & 1,000 & 349.8 & 79.6 & 270.2 & 87.2 & 7.14 \\
\textbf{Real Examples} & SERVICE & 1,500 & 525.8 & 118.8 & 407.0 & 116.8 & 7.13 \\
\textbf{Real Examples} & AMBIENCE & 500 & 80.0 & 12.0 & 68.0 & 38.0 & 7.78 \\
\textbf{Real Examples} & AMBIENCE & 1,000 & 159.0 & 23.4 & 135.6 & 66.4 & 7.82 \\
\textbf{Real Examples} & AMBIENCE & 1,500 & 238.8 & 34.4 & 204.4 & 93.2 & 7.81 \\
\textbf