# Notebook: Analyse Language

## Packages

In [1]:
from spacy.lang.de.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from collections import Counter
import Levenshtein
import numpy as np
import string
import spacy
import nltk
import json

## Settings

In [2]:
# German spaCy pipeline; the cells below use it for lemmas (`token.lemma_`)
# and part-of-speech tags (`token.pos_`).
nlp = spacy.load("de_core_news_lg")
# Fetch the Punkt tokenizer data; presumably required by `word_tokenize`
# in `count_tokens` (verify against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Constants

In [3]:
# Labels that a tag's "label" field is compared against in the analysis cells.
ASPECT_CATEGORIES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]

# Names of the LLM sub-folders under the synthetic data directory.
LLMS = ["GPT-3", "Llama70B"]

# Few-shot conditions; also sub-folder names below each LLM folder.
FS_CONDITIONS = ["fixed", "random"]

# Human-readable label per few-shot condition (used in the LaTeX table rows).
PROMPTING_ENCODING = {
    condition: f"25 {condition} examples" for condition in FS_CONDITIONS
}

## Code

### Helper

In [4]:
def count_tokens(texts):
    """Count the NLTK word tokens of each text.

    Args:
        texts: Iterable of strings.

    Returns:
        list[int]: One token count per input text, in input order.
    """
    return [len(word_tokenize(text)) for text in texts]

def count_unique_lemmas(texts):
    """Count the distinct lemmas occurring across all texts.

    Every text is parsed with the module-level `nlp` pipeline and the
    `lemma_` of every token (no filtering) is collected into one set.

    Args:
        texts: Iterable of strings.

    Returns:
        int: Number of distinct lemma strings over all texts.
    """
    distinct_lemmas = {token.lemma_ for text in texts for token in nlp(text)}
    return len(distinct_lemmas)

def remove_stopwords_and_punctuation(text):
    """Return the lemmas of the content words in `text`, joined by spaces.

    A token is dropped when its lower-cased surface form is in spaCy's German
    `STOP_WORDS`, when its surface form occurs in `string.punctuation`
    (a substring test), or when it is not purely alphabetic.

    Args:
        text: Raw input string.

    Returns:
        str: Space-separated lemmas of the kept tokens.
    """
    kept_lemmas = []
    for token in nlp(text):
        surface = token.text
        if surface.lower() in STOP_WORDS:
            continue
        if surface in string.punctuation:
            continue
        if not surface.isalpha():
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


def count_top_n_lemmas(texts, n):
    """Return the `n` most frequent content lemmas as a comma-separated string.

    Each text is first reduced with `remove_stopwords_and_punctuation`; the
    resulting string is parsed again with `nlp` and the lemma of every token
    is counted.

    Args:
        texts: Iterable of strings.
        n: Number of lemmas to keep (applied as a slice bound).

    Returns:
        str: Lemmas ordered by descending frequency, joined with ", ".
        Ties keep the order in which the lemmas were first seen.
    """
    frequencies = Counter()
    for text in texts:
        reduced_text = remove_stopwords_and_punctuation(text)
        frequencies.update(token.lemma_ for token in nlp(reduced_text))

    # Stable sort: equally frequent lemmas stay in first-seen order.
    ranked = sorted(frequencies, key=frequencies.get, reverse=True)
    return ', '.join(ranked[:n])

### Load Datasets

In [5]:
def _load_split(path):
    """Read one JSON split and attach the spaCy parse of each example's text.

    Every example gets a "tokenized_text" entry holding `nlp(example["text"])`.
    """
    with open(path, 'r', encoding='utf-8') as json_file:
        examples = json.load(json_file)
    for example in examples:
        example["tokenized_text"] = nlp(example["text"])
    return examples


dataset = {"synth": {}, "real": []}

# Load Synth: dataset["synth"][llm][prompting] is a list with one entry per split
for llm in LLMS:
    dataset["synth"][llm] = {}
    for prompting in FS_CONDITIONS:
        dataset["synth"][llm][prompting] = [
            _load_split(f"../07 train models/synth/{llm}/{prompting}/split_{split}.json")
            for split in range(5)
        ]

# Load Real: dataset["real"] is a list with one entry per split
dataset["real"].extend(
    _load_split(f"../07 train models/real/split_{split}.json")
    for split in range(5)
)


### Count Avg Number of Words

In [6]:
# Average NLTK token count per synthetic example: pooled over all splits
# first, then separately for each of the five splits.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        print("-----")
        splits = dataset["synth"][llm][prompting]

        pooled_texts = [example["text"] for split_data in splits for example in split_data]
        pooled_mean = round(np.mean(count_tokens(pooled_texts)), 2)
        print(llm, prompting, pooled_mean)

        per_split_means = []
        for split_id in range(0, 5):
            split_texts = [example["text"] for example in splits[split_id]]
            per_split_means.append(round(np.mean(count_tokens(split_texts)), 2))
        print(llm, prompting, per_split_means)

-----
GPT-3 fixed 9.68
GPT-3 fixed [9.24, 9.72, 9.1, 10.32, 10.01]
-----
GPT-3 random 9.04
GPT-3 random [8.97, 9.11, 8.84, 8.87, 9.4]
-----
Llama70B fixed 10.31
Llama70B fixed [9.53, 10.93, 9.98, 10.31, 10.78]
-----
Llama70B random 10.16
Llama70B random [10.02, 10.22, 9.85, 10.18, 10.55]


In [7]:
# Average NLTK token count per real example, pooled over all splits.
real_texts = [
    example["text"]
    for split_examples in dataset["real"]
    for example in split_examples
]
real_mean_tokens = np.mean(count_tokens(real_texts))
print("Real", round(real_mean_tokens, 2))

Real 13.1


### Analyse Text Similarity

In [48]:
def average_word_level_levenshtein_distance(docs):
    """Average pairwise Levenshtein distance between documents, at word level.

    Each document is reduced to its list of token strings and the edit
    distance is computed over those lists, so whole tokens (not characters)
    are the edit units. The distances are raw edit counts and are not
    normalised by document length.

    Args:
        docs: Iterable of documents; each document is an iterable of objects
            exposing a `.text` attribute (e.g. spaCy `Doc` objects).

    Returns:
        The mean distance over all unordered document pairs, or `0` when
        fewer than two documents are given.
    """
    tokenized_texts = [[token.text for token in doc] for doc in docs]

    total_distance = 0
    pair_count = 0

    # Visit every unordered pair exactly once: each document is compared
    # with the documents that come after it.
    for i, tokens1 in enumerate(tokenized_texts):
        for tokens2 in tokenized_texts[i + 1:]:
            total_distance += Levenshtein.distance(tokens1, tokens2)
            pair_count += 1

    return total_distance / pair_count if pair_count > 0 else 0

In [49]:
# Average word-level Levenshtein distance between all example pairs of a split.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        for split_idx in range(5):
            docs_in_split = [example["tokenized_text"] for example in splits[split_idx]]
            avg_distance = average_word_level_levenshtein_distance(docs_in_split)
            print(llm, prompting, split_idx, avg_distance)

# NOTE(review): a per-aspect-category comparison of the explicit aspect terms
# was sketched here as commented-out code, but it called
# `average_word_level_cosine_similarity`, which is not defined in the visible
# part of this notebook.

GPT-3 fixed 0 9.43767509266028
GPT-3 fixed 1 10.021982994113348
GPT-3 fixed 2 9.306075179869955
GPT-3 fixed 3 10.383843638182448
GPT-3 fixed 4 9.958813948418042
GPT-3 random 0 8.851007338225484
GPT-3 random 1 8.96883878140983
GPT-3 random 2 8.621337780742717
GPT-3 random 3 8.47982477207027
GPT-3 random 4 9.342684456304204
Llama70B fixed 0 10.288666076718863
Llama70B fixed 1 11.788395983224962
Llama70B fixed 2 10.640098495633104
Llama70B fixed 3 11.24434868480115
Llama70B fixed 4 11.343140317802316
Llama70B random 0 10.895543695797198
Llama70B random 1 11.07879653102068
Llama70B random 2 10.53895574827663
Llama70B random 3 10.937361796753391
Llama70B random 4 11.44266044029353


### Most frequent terms

Ähnlich wie bei den realen Daten sind die häufigsten Aspektbegriffe diejenigen, die den Aspekt selbst benennen.

In [37]:
# Scratch check: round() without `ndigits` returns the nearest integer (457 here).
round(456.6)

457

In [47]:
# Print one LaTeX table row per (few-shot condition, aspect category):
# mean number of explicit aspect terms per split, mean number of distinct
# terms per split, and the five most frequent terms with their per-split
# average count.
for llm in LLMS:
    print(llm)
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        for ac_idx, aspect_category in enumerate(ASPECT_CATEGORIES):
            aspect_term_counts = Counter()
            n_aspects_in_splits = []  # collected, but not reported in the table
            n_aspect_terms_in_splits = []
            n_unique_aspects_in_split = []

            for split_idx in range(5):
                aspects_in_split = [
                    tag
                    for example in splits[split_idx]
                    for tag in example["tags"]
                    if tag["label"] == aspect_category
                ]
                # Only explicit aspects contribute a term text.
                aspect_terms_in_split = [
                    tag["text"]
                    for tag in aspects_in_split
                    if tag["type"] == "label-explicit"
                ]
                aspect_term_counts.update(aspect_terms_in_split)
                n_aspects_in_splits.append(len(aspects_in_split))
                n_aspect_terms_in_splits.append(len(aspect_terms_in_split))
                n_unique_aspects_in_split.append(len(set(aspect_terms_in_split)))

            # Counts are divided by the number of splits to get a per-split average.
            term_string = ", ".join(
                f"\\textit{{{term}}} ({round(count/5,2)})"
                for term, count in aspect_term_counts.most_common(5)
            )

            mean_terms = round(np.mean(n_aspect_terms_in_splits))
            mean_unique = round(np.mean(n_unique_aspects_in_split), 2)

            # The first row of a condition carries the condition label,
            # the row at index 4 closes the group with \hline.
            if ac_idx == 0:
                lead = f"\n {PROMPTING_ENCODING[prompting]} & {aspect_category} & "
            else:
                lead = f"\n & {aspect_category}  & "
            tail = " \\\\ \\hline" if ac_idx == 4 else " \\\\"

            print(f"{lead}{mean_terms} & {mean_unique} & {term_string}{tail}")

GPT-3

 25 fixed examples & GENERAL-IMPRESSION & 161 & 22.4 & \textit{Restaurant} (71.4), \textit{Eindruck} (35.0), \textit{Gesamteindruck} (12.4), \textit{allgemeine Impression} (5.4), \textit{Atmosphäre} (4.6) \\

 & FOOD  & 479 & 23.4 & \textit{Essen} (405.2), \textit{Speisen} (12.2), \textit{Dessert} (11.8), \textit{Gericht} (7.4), \textit{Pizza} (5.8) \\

 & SERVICE  & 509 & 8.8 & \textit{Service} (414.4), \textit{Personal} (62.4), \textit{Bedienung} (14.2), \textit{Servicepersonal} (11.0), \textit{Kellner} (3.2) \\

 & AMBIENCE  & 482 & 17.6 & \textit{Ambiente} (278.8), \textit{Atmosphäre} (101.8), \textit{Restaurant} (54.0), \textit{Musik} (14.6), \textit{Einrichtung} (12.4) \\

 & PRICE  & 403 & 10.6 & \textit{Preise} (300.4), \textit{Preis} (32.6), \textit{Preis-Leistungs-Verhältnis} (32.2), \textit{Preis-Leistungsverhältnis} (18.2), \textit{Preisniveau} (7.8) \\ \hline

 25 random examples & GENERAL-IMPRESSION & 78 & 13.0 & \textit{Restaurant} (50.6), \textit{Eindruck} (10.2)

### Prozentualer Anteil an Aspektbegriffen je Aspekt

In [12]:
# Share of tags that are explicit aspect terms ("label-explicit"), pooled
# over all splits and per split, plus standard deviation and variance of the
# per-split ratios.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        n_terms_total = 0
        n_aspects_total = 0
        ratio_splits = []

        for split_idx in range(5):
            tags_in_split = [
                tag for example in splits[split_idx] for tag in example["tags"]
            ]
            explicit_terms = [
                tag["text"] for tag in tags_in_split if tag["type"] == "label-explicit"
            ]

            n_terms_total += len(explicit_terms)
            n_aspects_total += len(tags_in_split)
            ratio_splits.append(len(explicit_terms) / len(tags_in_split))

        print(llm, prompting, "total:", n_terms_total / n_aspects_total, "splits:", ratio_splits, np.std(ratio_splits), np.var(ratio_splits))

GPT-3 fixed total: 0.6528686946476704 splits: [0.5901639344262295, 0.5418544752092723, 0.7245912151330555, 0.623593699774992, 0.7827751196172249] 0.08846750333202695 0.007826499145802199
GPT-3 random total: 0.5666410601113885 splits: [0.5739503816793893, 0.5501460564751705, 0.5521085797382452, 0.6105577689243028, 0.5485636114911081] 0.02362742458662041 0.0005582551925964346
Llama70B fixed total: 0.7271791572853007 splits: [0.754650416933932, 0.6447751536719508, 0.7412407585985213, 0.7285300739787713, 0.7660462130937099] 0.04302372886341045 0.0018510412453122578
Llama70B random total: 0.7422660664687291 splits: [0.7538022813688213, 0.7445255474452555, 0.7443318861553304, 0.7433981066268062, 0.726158038147139] 0.008978311412013579 8.061007581109327e-05


### Erster Token im Text

In [13]:
# TODO: also compute this for the real data

In [14]:
# Percentage of examples whose first token is tagged "DET".
# NOTE(review): "DET" is a coarse part-of-speech tag and may cover more than
# articles — confirm that this matches the printed label.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        first_pos = []
        for split_idx in range(5):
            for example in splits[split_idx]:
                first_pos.append(example["tokenized_text"][0].pos_)

        n_determiners = first_pos.count("DET")
        article_percentage = (n_determiners / len(first_pos)) * 100
        print(f"Prozentsatz der Artikel ({llm}, {prompting}): {article_percentage}%")


Prozentsatz der Artikel (GPT-3, fixed): 91.47341772151898%
Prozentsatz der Artikel (GPT-3, random): 91.66666666666666%
Prozentsatz der Artikel (Llama70B, fixed): 57.72151898734177%
Prozentsatz der Artikel (Llama70B, random): 57.333333333333336%
