# Notebook: Analyse Language

## Packages

In [1]:
from spacy.lang.de.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from collections import Counter
import random
import numpy as np
import json

## Settings

In [2]:
# Seed Python's built-in RNG so the random.choice() draws made further down
# (split selection and example selection) are reproducible across re-runs.
random.seed(42)

## Constants

In [3]:
# Aspect categories; the example table below prints one section per entry, in this order.
ASPECT_CATEGORIES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]

# Model names; each is also a directory name under "../07 train models/synth/".
LLMS = [
    "GPT-3",
    "Llama70B",
]

# Prompting conditions; each is a sub-directory name below the model directory.
FS_CONDITIONS = [
    "fixed",
    "random",
]

# LaTeX display label for each prompting condition.
PROMPTING_ENCODING = {
    "fixed": "LRS\\textsubscript{25}",
    "random": "LRS\\textsubscript{500}",
}

## Code

### Load Datasets

In [4]:
def _read_split(path):
    """Return the parsed JSON content of one split file (read as UTF-8)."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# Resulting layout:
#   dataset["synth"][llm][prompting] -> list with one entry per split file (0..5)
#   dataset["real"]                  -> list with one entry per split file (0..5)
dataset = {"synth": {}, "real": []}

# Synthetic data: one directory per LLM and prompting condition.
for llm in LLMS:
    dataset["synth"][llm] = {}
    for prompting in FS_CONDITIONS:
        dataset["synth"][llm][prompting] = [
            _read_split(f"../07 train models/synth/{llm}/{prompting}/split_{split}.json")
            for split in range(6)
        ]

# Real data: a single directory holding the same six split files.
dataset["real"] = [
    _read_split(f"../07 train models/real/split_{split}.json")
    for split in range(6)
]


### Count Avg Number of Words

In [5]:
def convert_label_string(label):
    """Format a sequence of labels as a single left-aligned LaTeX tabular cell.

    Every element of ``label`` is turned into ``str(tuple(element))`` and set in
    typewriter font; the entries are stacked one per table row and the whole
    stack is enclosed in literal square brackets.

    Args:
        label: Iterable of iterables (e.g. ``[["FOOD", "POSITIVE"], ...]``).

    Returns:
        The LaTeX source of the tabular as a string.
    """
    entries = []
    for item in label:
        entries.append("\\texttt{" + str(tuple(item)) + "}")

    stacked = " \\\\ ".join(entries)
    return "\\begin{tabular}[c]{@{}l@{}}{[}" + stacked + "{]}\\end{tabular}"

In [6]:
def wrap_text(text, max_line_length=95):
    """Wrap ``text`` into rows and return them as a one-column LaTeX tabular.

    The text is split on whitespace and words are packed greedily into rows.
    A word is added to the current row while the row (counting one separator
    per word, including a trailing one) stays within ``max_line_length``; in
    effect each row is at most ``max_line_length - 1`` characters long, unless
    a single word alone exceeds the budget, in which case that word gets a row
    of its own.

    Each row is set in typewriter font. LaTeX special characters in the text
    are escaped so arbitrary review text cannot break the surrounding table
    (for example ``&`` would otherwise start a new column and ``%`` would
    comment out the rest of the line).

    Args:
        text: The raw text to wrap.
        max_line_length: Width budget per row, measured on the unescaped text.

    Returns:
        LaTeX source of a tabular with one typewriter row per wrapped line.
        Empty or whitespace-only input yields a tabular without rows.
    """
    # One-pass translation table: every source character is mapped exactly
    # once, so the backslashes/braces introduced by a replacement are never
    # escaped a second time.
    latex_escapes = str.maketrans({
        "\\": "\\textbackslash{}",
        "{": "\\{",
        "}": "\\}",
        "&": "\\&",
        "%": "\\%",
        "$": "\\$",
        "#": "\\#",
        "_": "\\_",
        "~": "\\textasciitilde{}",
        "^": "\\textasciicircum{}",
        "<": "\\textless{}",
        ">": "\\textgreater{}",
    })

    rows = []
    current_words = []
    # Length of the current row including one trailing separator per word.
    current_length = 0

    for word in text.split():
        if current_length + len(word) + 1 <= max_line_length:
            current_words.append(word)
            current_length += len(word) + 1
        else:
            # Only flush a non-empty row: when the very first word is already
            # too long there is nothing to flush, and flushing anyway would
            # emit an empty typewriter row at the top of the cell.
            if current_words:
                rows.append(" ".join(current_words))
            current_words = [word]
            current_length = len(word) + 1

    if current_words:
        rows.append(" ".join(current_words))

    # Escape after wrapping so the width budget applies to the visible text.
    cells = ["\\texttt{" + row.translate(latex_escapes) + "}" for row in rows]

    return "\\begin{tabular}[c]{@{}l@{}}" + " \\\\ ".join(cells) + "\\end{tabular}"

In [7]:
# Prints, for every (LLM, prompting condition) pair, the rows of a LaTeX table
# with randomly drawn synthetic examples: one heading row per aspect category
# followed by `examples_per_category` example rows (label cell & text cell).
examples_per_category = 2
# After the last example row of a table no gray separator rule is printed.
rows_per_table = len(ASPECT_CATEGORIES) * examples_per_category

# Ids of examples that were already printed. NOTE(review): the list is shared
# across all LLMs/conditions, which assumes ids identify an example globally
# (or that excluding equal ids from other datasets is acceptable) - confirm.
shown_ids = []
# Running count of drawn example rows across all tables.
k = 0
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        print(llm, prompting, "\n\n")
        for aspect_category in ASPECT_CATEGORIES:
            print("\\textbf{\\texttt{"+aspect_category+"}} & \\\\ \\hline")
            for _ in range(examples_per_category):
                # Draw the split first, then an example from it.
                random_split_idx = random.choice(range(len(splits)))
                k += 1
                # Candidates: examples carrying an explicit tag for this
                # category that have not been printed before.
                examples = [
                    example for example in splits[random_split_idx]
                    if any(tag["type"] == "label-explicit" and tag["label"] == aspect_category
                           for tag in example["tags"])
                    and example["id"] not in shown_ids
                ]

                # random.choice raises IndexError if the drawn split has no
                # unseen example for this category.
                random_example = random.choice(examples)
                # Record the id; without this the "not in shown_ids" filter
                # above never excludes anything and duplicates can be printed.
                shown_ids.append(random_example["id"])
                print(convert_label_string(random_example["llm_label"]), "&", wrap_text(random_example["llm_prediction_raw"]), "\\\\")
                if k % rows_per_table != 0:
                    print("\\arrayrulecolor{gray}\\hline")
        print("\n\n")

GPT-3 fixed 


\textbf{\texttt{GENERAL-IMPRESSION}} & \\ \hline
\begin{tabular}[c]{@{}l@{}}{[}\texttt{('SERVICE', 'NEUTRAL')} \\ \texttt{('PRICE', 'POSITIVE')} \\ \texttt{('PRICE', 'NEGATIVE')} \\ \texttt{('GENERAL-IMPRESSION', 'POSITIVE')} \\ \texttt{('AMBIENCE', 'NEGATIVE')} \\ \texttt{('PRICE', 'NEUTRAL')}{]}\end{tabular} & \begin{tabular}[c]{@{}l@{}}\texttt{Der \textless{}aspect-term aspect="SERVICE" polarity="NEUTRAL"\textgreater{}Service\textless{}/aspect-term\textgreater{} war in Ordnung, die} \\ \texttt{\textless{}aspect-term aspect="PRICE" polarity="POSITIVE"\textgreater{}Preise\textless{}/aspect-term\textgreater{} für einige Gerichte waren} \\ \texttt{angemessen, für andere jedoch zu hoch, insgesamt war der \textless{}aspect-term} \\ \texttt{aspect="GENERAL-IMPRESSION" polarity="POSITIVE"\textgreater{}Eindruck\textless{}/aspect-term\textgreater{} aber positiv, das} \\ \texttt{\textless{}aspect-term aspect="AMBIENCE" polarity="NEGATIVE"\textgreater{}Ambiente\textless{}/aspect-