# Notebook: Analyse Language


## Packages


In [1]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from spacy.lang.de.stop_words import STOP_WORDS
from collections import Counter
import Levenshtein
import numpy as np
import random
import string
import spacy
import nltk
import json

## Constants


In [2]:
ASPECT_CATEGORIES = ["GENERAL-IMPRESSION",
                     "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
LLMS = ["Llama70B", "GPT-3"]
FS_CONDITIONS = ["random", "fixed"]
N_FOLDS = 3
CRITERIA_RS = "tag_with_polarity"
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
MENTIONING_TYPE = ["implicit", "explicit"]
COMBINATIONS = [f"{aspect}-{polarity}" for aspect in ["SERVICE", "FOOD",
                                                      "GENERAL-IMPRESSION", "AMBIENCE", "PRICE"] for polarity in POLARITIES]
RANDOM_STATE = 43

In [3]:
LLMS_ENCODED = {"GPT-3": "\\textbf{GPT-3.5-turbo}",
                "Llama70B": "\\textbf{Llama-2-70B}"}
ENCODE_CONDITION = {"fixed": "\\textbf{LRS\\textsubscript{25}}",
                    "random": "\\textbf{LRS\\textsubscript{500}}"}

## Settings


In [4]:
nlp = spacy.load("de_core_news_lg")
nltk.download('punkt')
random.seed(RANDOM_STATE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Code


In [5]:
def round_number(num, decimal_places):
    formatted_num = "{:.{}f}".format(num, decimal_places)
    rounded_num = float(formatted_num)
    return rounded_num

### Load Datasets


In [6]:
dataset_raw = {"synth": {}, "real": []}

# Load Synth
for llm in LLMS:
    dataset_raw["synth"][llm] = {}
    for prompting in FS_CONDITIONS:
        dataset_raw["synth"][llm][prompting] = []
        for split in range(5):
            with open(f"../07 train models/synth/{llm}/{prompting}/split_{split}.json", 'r', encoding='utf-8') as json_file:
                split_data = json.load(json_file)
            for example in split_data:
                example["tokenized_text"] = nlp(example["text"])
            dataset_raw["synth"][llm][prompting].append(split_data)

# Load Real
for split in range(6):
    with open(f"../07 train models/real/split_{split}.json", 'r', encoding='utf-8') as json_file:
        split_data = json.load(json_file)
    for example in split_data:
        example["tokenized_text"] = nlp(example["text"])
    dataset_raw["real"].append(split_data)

### First Token in Sentence


#### Synthetic Data

In [7]:
len([example["tokenized_text"][0].pos_ for example in dataset_raw["synth"]["Llama70B"]["fixed"][0]])

1975

In [8]:
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        first_tokens = [example["tokenized_text"][0].pos_ for split_id in range(5) for example in dataset_raw["synth"][llm][few_shot_condition][split_id]]
        pos_counts = Counter(first_tokens)
        article_percentage = (pos_counts["DET"] / len(first_tokens)) * 100
        print(
            f"Prozentsatz der Artikel ({LLMS_ENCODED[llm]}, {ENCODE_CONDITION[prompting]}): {round_number(article_percentage, 2)} %")

Prozentsatz der Artikel (\textbf{GPT-3.5-turbo}, \textbf{LRS\textsubscript{500}}): 91.47 %
Prozentsatz der Artikel (\textbf{GPT-3.5-turbo}, \textbf{LRS\textsubscript{500}}): 91.67 %
Prozentsatz der Artikel (\textbf{Llama-2-70B}, \textbf{LRS\textsubscript{500}}): 57.72 %
Prozentsatz der Artikel (\textbf{Llama-2-70B}, \textbf{LRS\textsubscript{500}}): 57.33 %


#### Real Data

In [9]:
first_tokens = [example["tokenized_text"][0].pos_ for split_idx in range(
    5) for example in dataset_raw["real"][split_idx]]
pos_counts = Counter(first_tokens)
article_percentage = (pos_counts["DET"] / len(first_tokens)) * 100
print(
    f"Prozentsatz der Artikel Real: {round_number(article_percentage, 3)} %")

Prozentsatz der Artikel Real: 27.6 %


### AVG Number of Tokens in Sentence for each Data

#### Synthetic Data

In [10]:
len([len(example["tokenized_text"]) for split_idx in range(5) for example in dataset_raw["synth"]["Llama70B"]["fixed"][split_idx]])

9875

In [13]:
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        word_counts = [len(example["tokenized_text"]) for split_idx in range(
            5) for example in dataset_raw["synth"][llm][few_shot_condition][split_idx]]

        word_counts_splits = []
        for idx in range(5):
            word_counts_splits.append(np.mean(
                [len(example["tokenized_text"]) for example in dataset_raw["synth"][llm][few_shot_condition][idx]]))

        print(LLMS_ENCODED[llm], "&", ENCODE_CONDITION[few_shot_condition], "&", round_number(
            np.mean(word_counts), 2), [round_number(count_avg, 2) for count_avg in word_counts_splits], round_number(np.std(word_counts_splits), 3))

\textbf{GPT-3.5-turbo} & \textbf{LRS\textsubscript{25}} & 9.68 [9.24, 9.72, 9.1, 10.32, 10.01] 0.457
\textbf{GPT-3.5-turbo} & \textbf{LRS\textsubscript{500}} & 9.04 [8.97, 9.11, 8.84, 8.87, 9.4] 0.204
\textbf{Llama-2-70B} & \textbf{LRS\textsubscript{25}} & 10.35 [9.56, 10.98, 10.0, 10.42, 10.8] 0.519
\textbf{Llama-2-70B} & \textbf{LRS\textsubscript{500}} & 10.2 [10.06, 10.26, 9.89, 10.24, 10.57] 0.228


#### Real Data

In [14]:
word_counts = [len(example["tokenized_text"]) for idx in range(6) for example in dataset_raw["real"][idx]]

print("Real", round_number(np.mean(word_counts),2))


Real 13.12
