# Notebook: Analyse Language

## Packages

In [43]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from spacy.lang.de.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from collections import Counter
import Levenshtein
import numpy as np
import random
import string
import spacy
import nltk
import json

## Constants

In [29]:
# Aspect categories. COMBINATIONS below deliberately spells out its own
# aspect order, so this list does not determine the one-hot column order.
ASPECT_CATEGORIES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]

# LLM identifiers; used as dictionary keys and as directory names when the
# synthetic splits are loaded.
LLMS = ["GPT-3", "Llama70B"]

# Few-shot conditions; used as dictionary keys and as directory names.
FS_CONDITIONS = ["fixed", "random"]

# Human-readable label for each few-shot condition.
PROMPTING_ENCODING = {
    "fixed": "25 fixed examples",
    "random": "25 random examples",
}

# Number of folds requested from MultilabelStratifiedKFold.
N_FOLDS = 3

# Key read from every tag dict when building the stratification labels.
CRITERIA_RS = "tag_with_polarity"

POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
MENTIONING_TYPE = ["implicit", "explicit"]

# Every "<ASPECT>-<POLARITY>" label; the order of this list is the column
# order of the one-hot vectors produced by get_one_hot.
COMBINATIONS = [
    f"{aspect}-{polarity}"
    for aspect in ["SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE"]
    for polarity in POLARITIES
]

# Base seed for `random` and for the stratified k-fold splitter.
RANDOM_STATE = 43

## Settings

In [44]:
# Load the spaCy pipeline "de_core_news_lg"; `nlp` is used throughout the
# notebook to tokenise and lemmatise texts.
nlp = spacy.load("de_core_news_lg")
# Make sure the NLTK "punkt" resource is available locally
# (word_tokenize is called in count_tokens).
nltk.download('punkt')
# Seed Python's `random` module, which get_avg_unique_words_in_k_words
# draws its samples from. NOTE: only `random` is seeded here.
random.seed(RANDOM_STATE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Code

### Helper

In [83]:
def count_tokens(texts):
    """Count the NLTK word tokens of every text.

    Args:
        texts: Iterable of strings.

    Returns:
        list[int]: One token count per text, in input order.
    """
    return [len(word_tokenize(text)) for text in texts]


def count_unique_tokens(tokens):
    """Return how many distinct surface forms occur among ``tokens``.

    Args:
        tokens: Iterable of objects exposing a ``.text`` attribute
            (spaCy tokens in this notebook).

    Returns:
        int: Number of distinct ``.text`` values (case-sensitive).
    """
    return len({tok.text for tok in tokens})


def count_unique_lemmas(tokens):
    """Return how many distinct lemmas occur among ``tokens``.

    Args:
        tokens: Iterable of objects exposing a ``.lemma_`` attribute
            (spaCy tokens in this notebook).

    Returns:
        int: Number of distinct ``.lemma_`` values.
    """
    return len({tok.lemma_ for tok in tokens})


def remove_stopwords_and_punctuation(text):
    """Reduce ``text`` to the space-joined lemmas of its content words.

    The text is parsed with the module-level ``nlp`` pipeline. A token is
    dropped when its lower-cased form is in ``STOP_WORDS``, when its text
    is a substring of ``string.punctuation``, or when it is not purely
    alphabetic.

    Args:
        text: Raw input string.

    Returns:
        str: Lemmas of the surviving tokens joined by single spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        surface = token.text
        if surface.lower() in STOP_WORDS:
            continue
        if surface in string.punctuation:
            continue
        if not surface.isalpha():
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


def count_top_n_lemmas(texts, n):
    """Return the ``n`` most frequent lemmas across ``texts`` as one string.

    Each text is first cleaned with ``remove_stopwords_and_punctuation``;
    the cleaned string (already a sequence of lemmas) is parsed again with
    ``nlp`` and the ``lemma_`` of every resulting token is counted.

    Args:
        texts: Iterable of raw strings.
        n: Number of lemmas to keep (applied as the slice ``[:n]``).

    Returns:
        str: Lemmas ordered by descending frequency, joined by ``", "``.
        Lemmas with equal counts keep their first-seen order.
    """
    lemma_frequencies = Counter()
    for text in texts:
        cleaned_doc = nlp(remove_stopwords_and_punctuation(text))
        lemma_frequencies.update(token.lemma_ for token in cleaned_doc)

    # sorted() is stable, so ties stay in insertion (first-seen) order.
    ranked = sorted(lemma_frequencies, key=lemma_frequencies.get, reverse=True)
    return ', '.join(ranked[:n])


def get_avg_unique_words_in_k_words(tokens, n_selection=100, n_repetitions=100000):
    """Estimate the mean number of distinct words in a random sample of tokens.

    Repeatedly draws ``n_selection`` token positions without replacement
    (via ``random.sample``) and counts the distinct ``.text`` values at
    those positions.

    Args:
        tokens: Sequence of objects exposing ``.text``.
        n_selection: Number of tokens drawn per repetition.
        n_repetitions: Number of random draws to average over.

    Returns:
        The ``np.mean`` of the per-draw distinct-word counts.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``tokens`` has
            fewer than ``n_selection`` elements.
    """
    # Surface forms are compared here, not lemmas.
    words = [token.text for token in tokens]
    position_pool = range(len(words))
    unique_counts = [
        len({words[pos] for pos in random.sample(position_pool, n_selection)})
        for _ in range(n_repetitions)
    ]
    return np.mean(unique_counts)


def average_word_level_levenshtein_distance(docs, norm=False):
    """Average the word-level Levenshtein distance over all document pairs.

    Every document is turned into its list of token texts, and
    ``Levenshtein.distance`` is computed on those lists for each unordered
    pair of documents.

    Args:
        docs: Sequence of dicts whose ``"tokenized_text"`` entry is an
            iterable of objects exposing ``.text``.
        norm: When true, each pair's distance is divided by the token
            count of the longer document of the pair.

    Returns:
        The mean pairwise distance, or ``0`` when there are fewer than two
        documents.
    """
    token_lists = [
        [token.text for token in doc["tokenized_text"]] for doc in docs]

    total_distance = 0
    pair_count = 0

    for i in range(len(token_lists)):
        first = token_lists[i]
        for j in range(i + 1, len(token_lists)):
            second = token_lists[j]

            distance = Levenshtein.distance(first, second)
            if norm:
                longest = max(len(first), len(second))
                # Two empty documents are identical: define their normalised
                # distance as 0 instead of dividing by zero.
                distance = distance / longest if longest else 0.0
            total_distance += distance
            pair_count += 1

    return total_distance / pair_count if pair_count > 0 else 0

### Load Datasets

In [31]:
def _load_split(path):
    """Read one JSON split file and attach a spaCy parse to every example.

    Args:
        path: Path of the JSON file; it must contain a list of dicts that
            each have a ``"text"`` entry.

    Returns:
        list[dict]: The loaded examples, each extended with a
        ``"tokenized_text"`` entry holding ``nlp(example["text"])``.
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        examples = json.load(json_file)
    for example in examples:
        example["tokenized_text"] = nlp(example["text"])
    return examples


dataset = {"synth": {}, "real": []}

# Synthetic data: dataset["synth"][llm][condition] is a list holding the
# examples of split_0 ... split_4, in that order.
for llm in LLMS:
    dataset["synth"][llm] = {
        prompting: [
            _load_split(
                f"../07 train models/synth/{llm}/{prompting}/split_{split}.json")
            for split in range(5)
        ]
        for prompting in FS_CONDITIONS
    }

# Real data: dataset["real"] is a list holding split_0 ... split_4.
dataset["real"].extend(
    _load_split(f"../07 train models/real/split_{split}.json")
    for split in range(5)
)

In [32]:
def get_one_hot(subset):
    """Build one multi-hot label vector per example.

    Args:
        subset: Sequence of example dicts; each has a ``"tags"`` list whose
            items are dicts containing the key ``CRITERIA_RS``.

    Returns:
        list[np.ndarray]: For every example, an array with one entry per
        label in ``COMBINATIONS`` (same order): 1 if any tag of the
        example carries that label, else 0.
    """
    encoded = []
    for example in subset:
        present = {tag[CRITERIA_RS] for tag in example["tags"]}
        encoded.append(
            np.array([1 if combo in present else 0 for combo in COMBINATIONS]))
    return encoded

In [33]:
# Re-split every synthetic split into N_FOLDS label-stratified folds.
# NOTE: this cell overwrites dataset["synth"][llm][condition][iteration]
# in place, so it must run exactly once after the loading cell.
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for iteration in range(5):
            print(llm, few_shot_condition, iteration)
            loaded_split = dataset["synth"][llm][few_shot_condition][iteration]
            # For every condition other than "random" the first 475 examples
            # are skipped. NOTE(review): the reason is not visible in this
            # notebook - TODO confirm.
            subset = loaded_split if few_shot_condition == "random" else loaded_split[475:]

            # Retry with a shifted seed until all three folds hold exactly
            # 500 examples.
            seed_offset = 0
            while True:
                mskf = MultilabelStratifiedKFold(
                    n_splits=N_FOLDS, shuffle=True,
                    random_state=RANDOM_STATE + seed_offset)
                section = [
                    [subset[i] for i in test_index]
                    for _, test_index in mskf.split(subset, get_one_hot(subset))
                ]
                seed_offset += 1
                if all(len(section[fold]) == 500 for fold in range(3)):
                    break

            dataset["synth"][llm][few_shot_condition][iteration] = section

GPT-3 fixed 0
GPT-3 fixed 1
GPT-3 fixed 2
GPT-3 fixed 3
GPT-3 fixed 4
GPT-3 random 0
GPT-3 random 1
GPT-3 random 2
GPT-3 random 3
GPT-3 random 4
Llama70B fixed 0
Llama70B fixed 1
Llama70B fixed 2
Llama70B fixed 3
Llama70B fixed 4
Llama70B random 0
Llama70B random 1
Llama70B random 2
Llama70B random 3
Llama70B random 4


In [34]:
# For every start index 0..4, group three consecutive real splits, wrapping
# around after split 4 (e.g. start 4 -> splits 4, 0, 1).
# NOTE: this replaces dataset["real"] in place, so run the cell only once.
real_examples = [
    [dataset["real"][(start + offset) % 5] for offset in [0, 1, 2]]
    for start in [0, 1, 2, 3, 4]
]
dataset["real"] = real_examples

### Text Analysis

In [84]:
# Print one LaTeX table row per (LLM, few-shot condition, sample size).
# Columns: unique tokens, unique lemmas, mean unique words per random
# 100-token sample, mean pairwise Levenshtein distance (raw / normalised).
# Every value is averaged over the 5 iterations.
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for n_sample in [500, 1000, 1500]:
            metrics = {
                "unique_tokens": [],
                "unique_lemmas": [],
                "unique_words_in_k_words": [],
                "levenshtein": [],
                "levenshtein_norm": [],
            }
            n_folds_used = int(n_sample / 500)
            for it in range(5):
                folds = dataset["synth"][llm][few_shot_condition][it]
                # Concatenate the first n_folds_used folds of 500 examples.
                samples = [item for k in range(n_folds_used)
                           for item in folds[k]]
                all_tokens = [token for example in samples
                              for token in example["tokenized_text"]]

                metrics["unique_tokens"].append(
                    count_unique_tokens(all_tokens))
                metrics["unique_lemmas"].append(
                    count_unique_lemmas(all_tokens))
                metrics["unique_words_in_k_words"].append(
                    get_avg_unique_words_in_k_words(all_tokens))
                metrics["levenshtein"].append(
                    average_word_level_levenshtein_distance(samples))
                metrics["levenshtein_norm"].append(
                    average_word_level_levenshtein_distance(samples, norm=True))

            row = [llm, few_shot_condition, n_sample]
            row += [round(np.mean(values), 2) for values in metrics.values()]
            print(" & ".join(str(cell) for cell in row))
        print("\\hline")

GPT-3 & fixed & 500 & 289.6 & 208.2 & 47.87 & 9.79 & 0.79
GPT-3 & fixed & 1000 & 369.6 & 264.4 & 47.84 & 9.79 & 0.79
GPT-3 & fixed & 1500 & 428.4 & 307.0 & 48.0 & 9.8 & 0.79
\hline
GPT-3 & random & 500 & 295.4 & 217.0 & 48.11 & 8.81 & 0.77
GPT-3 & random & 1000 & 389.4 & 281.2 & 48.32 & 8.84 & 0.77
GPT-3 & random & 1500 & 456.6 & 328.4 & 48.28 & 8.85 & 0.77
\hline
Llama70B & fixed & 500 & 694.4 & 536.6 & 59.18 & 11.08 & 0.85
Llama70B & fixed & 1000 & 1023.8 & 787.0 & 59.69 & 11.09 & 0.85
Llama70B & fixed & 1500 & 1269.2 & 973.0 & 59.62 & 11.06 & 0.85
\hline
Llama70B & random & 500 & 751.8 & 580.2 & 61.61 & 11.06 & 0.86
Llama70B & random & 1000 & 1103.0 & 846.8 & 61.35 & 11.04 & 0.86
Llama70B & random & 1500 & 1380.4 & 1054.0 & 61.23 & 10.98 & 0.86
\hline


In [85]:

# Print one LaTeX table row per sample size for the real data.
# Columns: unique tokens, unique lemmas, mean unique words per random
# 100-token sample, mean pairwise Levenshtein distance (raw / normalised).
# Every value is averaged over the 5 iterations.
for n_sample in [500, 1000, 1500]:
    metrics = {
        "unique_tokens": [],
        "unique_lemmas": [],
        "unique_words_in_k_words": [],
        "levenshtein": [],
        "levenshtein_norm": [],
    }
    n_splits_used = int(n_sample / 500)
    for it in range(5):
        splits = dataset["real"][it]
        # Concatenate the first n_splits_used splits of this iteration.
        samples = [item for k in range(n_splits_used) for item in splits[k]]
        all_tokens = [token for example in samples
                      for token in example["tokenized_text"]]

        metrics["unique_tokens"].append(count_unique_tokens(all_tokens))
        metrics["unique_lemmas"].append(count_unique_lemmas(all_tokens))
        metrics["unique_words_in_k_words"].append(
            get_avg_unique_words_in_k_words(all_tokens))
        metrics["levenshtein"].append(
            average_word_level_levenshtein_distance(samples))
        metrics["levenshtein_norm"].append(
            average_word_level_levenshtein_distance(samples, norm=True))

    row = ["-", "-", n_sample]
    row += [round(np.mean(values), 2) for values in metrics.values()]
    print(" & ".join(str(cell) for cell in row))
print("\\hline")

- & - & 500 & 1914.8 & 1492.8 & 78.19 & 16.37 & 0.93
- & - & 1000 & 3054.6 & 2345.0 & 78.19 & 16.38 & 0.93
- & - & 1500 & 3986.4 & 3032.0 & 78.2 & 16.38 & 0.93
\hline
