# Notebook: Analyse Language

## Packages

In [2]:
from spacy.lang.de.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from collections import Counter
import Levenshtein
import numpy as np
import string
import spacy
import nltk
import json

## Settings

In [3]:
# Load the spaCy pipeline "de_core_news_lg"; the cells below use it for tokens, lemmas and POS tags.
nlp = spacy.load("de_core_news_lg")
# Download the NLTK "punkt" package (needed by nltk's word_tokenize); a no-op if already present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Constants

In [4]:
# Labels that tags are matched against when grouping aspect terms by category.
ASPECT_CATEGORIES = [
    "GENERAL-IMPRESSION",
    "FOOD",
    "SERVICE",
    "AMBIENCE",
    "PRICE",
]
# Language models whose synthetic data is loaded below.
LLMS = ["GPT-3", "Llama70B"]
# Few-shot prompting conditions, plus the label printed for each in the LaTeX rows.
FS_CONDITIONS = ["fixed", "random"]
PROMPTING_ENCODING = {
    "fixed": "25 fixed examples",
    "random": "25 random examples",
}

## Code

### Helper

In [5]:
def count_tokens(texts, language="german"):
    """Return the number of NLTK word tokens in each text.

    Args:
        texts: Iterable of strings to tokenize.
        language: Name of the Punkt model that ``word_tokenize`` uses for
            sentence splitting. Defaults to ``"german"`` so this helper agrees
            with ``calculate_word_statistics`` and the German tooling used in
            the rest of this notebook (previously the NLTK default was used).

    Returns:
        A list with one token count per input text, in input order.
    """
    return [len(word_tokenize(text, language=language)) for text in texts]

def count_unique_lemmas(texts):
    """Count the distinct spaCy lemmas that occur across all given texts."""
    seen_lemmas = {token.lemma_ for text in texts for token in nlp(text)}
    return len(seen_lemmas)

def remove_stopwords_and_punctuation(text):
    """Return the space-joined lemmas of the tokens in ``text`` that pass the filter.

    A token is kept only if its lowercased text is not in spaCy's German
    ``STOP_WORDS``, its text is not found in ``string.punctuation`` and it
    consists solely of alphabetic characters.
    """
    def keep(token):
        surface = token.text
        if surface.lower() in STOP_WORDS:
            return False
        if surface in string.punctuation:
            return False
        return surface.isalpha()

    kept_lemmas = []
    for token in nlp(text):
        if keep(token):
            kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


def count_top_n_lemmas(texts, n):
    """Return the ``n`` most frequent lemmas as one comma-separated string.

    Each text is first reduced with ``remove_stopwords_and_punctuation``; the
    reduced text is then passed through ``nlp`` again and the lemmas of that
    second pass are counted. Lemmas with equal counts keep first-seen order.
    """
    lemma_counts = Counter()
    for text in texts:
        reduced_text = remove_stopwords_and_punctuation(text)
        lemma_counts.update(token.lemma_ for token in nlp(reduced_text))

    # sorted() is stable, so ties stay in insertion (first-seen) order.
    ranked_lemmas = sorted(lemma_counts, key=lemma_counts.get, reverse=True)
    return ', '.join(ranked_lemmas[:n])

### Load Datasets

In [6]:
# dataset["synth"][llm][condition] and dataset["real"] each hold a list of
# five entries, one per split file, containing the parsed JSON content.
dataset = {"synth": {}, "real": []}

# Synthetic data: one JSON file per (LLM, few-shot condition, split).
for llm in LLMS:
    synth_by_condition = dataset["synth"][llm] = {}
    for prompting in FS_CONDITIONS:
        loaded_splits = synth_by_condition[prompting] = []
        for split in range(5):
            split_path = f"../07 train models/synth/{llm}/{prompting}/split_{split}.json"
            with open(split_path, 'r', encoding='utf-8') as json_file:
                split_data = json.load(json_file)
            loaded_splits.append(split_data)

# Real data: one JSON file per split.
for split in range(5):
    split_path = f"../07 train models/real/split_{split}.json"
    with open(split_path, 'r', encoding='utf-8') as json_file:
        split_data = json.load(json_file)
    dataset["real"].append(split_data)


### Count Avg Number of Words

In [7]:
# Mean token count per synthetic text: first pooled over all splits, then per split.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        condition_splits = dataset["synth"][llm][prompting]
        print("-----")

        pooled_texts = [example["text"] for split_data in condition_splits for example in split_data]
        print(llm, prompting, round(np.mean(count_tokens(pooled_texts)), 2))

        per_split_means = []
        for split_id in range(0, 5):
            split_texts = [example["text"] for example in condition_splits[split_id]]
            per_split_means.append(round(np.mean(count_tokens(split_texts)), 2))
        print(llm, prompting, per_split_means)

-----
GPT-3 fixed 9.68
GPT-3 fixed [9.24, 9.72, 9.1, 10.32, 10.01]
-----
GPT-3 random 9.04
GPT-3 random [8.97, 9.11, 8.84, 8.87, 9.4]
-----
Llama70B fixed 10.31
Llama70B fixed [9.53, 10.93, 9.98, 10.31, 10.78]
-----
Llama70B random 10.16
Llama70B random [10.02, 10.21, 9.85, 10.17, 10.54]


In [8]:
# Mean token count per text, pooled over all splits of the real data.
print(
    "Real",
    round(
        np.mean(count_tokens([
            review["text"]
            for real_split in dataset["real"]
            for review in real_split
        ])),
        2,
    ),
)

Real 13.1


### Unique Aspect Terms

In [9]:
def calculate_word_statistics(texts):
    """Map every word token (NLTK, ``language='german'``) to its frequency across ``texts``."""
    frequencies = Counter(
        word
        for text in texts
        for word in word_tokenize(text, language='german')
    )
    return dict(frequencies)




In [10]:
# All texts of the real dataset, flattened across its splits.
texts_original = [
    review["text"]
    for real_split in dataset["real"]
    for review in real_split
]

In [11]:
# At most the first 1500 texts from split 0 of the GPT-3 / "fixed" condition.
texts_synth = [
    review["text"] for review in dataset["synth"]["GPT-3"]["fixed"][0]
][:1500]

In [12]:
def tokenize_document(text, nlp):
    """Run ``nlp`` on ``text`` and return the ``.text`` of every resulting token as a list."""
    surface_forms = []
    for token in nlp(text):
        surface_forms.append(token.text)
    return surface_forms

def calculate_levenshtein_distance_word_level(tokens1, tokens2):
    """Return the edit distance that ``Levenshtein.distance`` reports for the two token sequences."""
    edit_distance = Levenshtein.distance(tokens1, tokens2)
    return edit_distance

def average_word_level_levenshtein_distance(texts, nlp):
    """Average the word-level Levenshtein distance over every unordered pair of texts.

    Each text is tokenized once with ``tokenize_document``. Returns 0 when no
    pair exists (fewer than two texts).
    """
    token_lists = [tokenize_document(text, nlp) for text in texts]

    distance_sum = 0
    n_pairs = 0
    for position, left_tokens in enumerate(token_lists):
        # Compare only with later texts so each pair is visited exactly once.
        for right_tokens in token_lists[position + 1:]:
            distance_sum += calculate_levenshtein_distance_word_level(left_tokens, right_tokens)
            n_pairs += 1

    if n_pairs > 0:
        return distance_sum / n_pairs
    return 0

# Mean pairwise word-level distance over all real texts; every pair is compared,
# so the runtime grows quadratically with the number of texts.
result = average_word_level_levenshtein_distance(texts=texts_original, nlp=nlp)
print(
    f"Durchschnittliche Levenshtein-Distanz (Wortebene) aller Paare: {result}"
)


Durchschnittliche Levenshtein-Distanz (Wortebene) aller Paare: 16.37945386154462


### Most frequent terms

Ähnlich wie bei den realen Daten finden sich Aspektbegriffe, die den Aspekt selbst benennen.

In [13]:
# Print LaTeX table rows with the five most frequent explicit aspect terms for
# every LLM, few-shot condition and aspect category.
# The index of the closing row is derived from ASPECT_CATEGORIES and all loaded
# splits are iterated, instead of hard-coding 4 and range(5).
last_ac_idx = len(ASPECT_CATEGORIES) - 1
for llm in LLMS:
    print(llm)
    for prompting in FS_CONDITIONS:
        condition_splits = dataset["synth"][llm][prompting]
        for ac_idx, aspect_category in enumerate(ASPECT_CATEGORIES):
            # Count the text of every explicit tag that carries this category label.
            aspect_term_counts = Counter(
                tag["text"]
                for split_examples in condition_splits
                for example in split_examples
                for tag in example["tags"]
                if tag["type"] == "label-explicit" and tag["label"] == aspect_category
            )
            most_common_aspect_terms = aspect_term_counts.most_common(5)

            term_string = ", ".join(
                f"\\textit{{{term}}} ({count})" for term, count in most_common_aspect_terms)

            if ac_idx == 0:
                # First row of a group carries the prompting-condition label.
                print(
                    f"\n {PROMPTING_ENCODING[prompting]} & {aspect_category} & {term_string} \\\\")
            elif ac_idx == last_ac_idx:
                # Last row of a group is closed with a horizontal rule.
                print(
                    f"\n & {aspect_category} & {term_string} \\\\ \\hline")
            else:
                print(
                    f"\n & {aspect_category} & {term_string} \\\\")

GPT-3

 25 fixed examples & GENERAL-IMPRESSION & \textit{Restaurant} (357), \textit{Eindruck} (175), \textit{Gesamteindruck} (62), \textit{allgemeine Impression} (27), \textit{Atmosphäre} (23) \\

 & FOOD & \textit{Essen} (2026), \textit{Speisen} (61), \textit{Dessert} (59), \textit{Gericht} (37), \textit{Pizza} (29) \\

 & SERVICE & \textit{Service} (2072), \textit{Personal} (312), \textit{Bedienung} (71), \textit{Servicepersonal} (55), \textit{Kellner} (16) \\

 & AMBIENCE & \textit{Ambiente} (1394), \textit{Atmosphäre} (509), \textit{Restaurant} (270), \textit{Musik} (73), \textit{Einrichtung} (62) \\

 & PRICE & \textit{Preise} (1502), \textit{Preis} (163), \textit{Preis-Leistungs-Verhältnis} (161), \textit{Preis-Leistungsverhältnis} (91), \textit{Preisniveau} (39) \\ \hline

 25 random examples & GENERAL-IMPRESSION & \textit{Restaurant} (253), \textit{Eindruck} (51), \textit{Gesamteindruck} (15), \textit{Service} (10), \textit{allgemeine Impression} (9) \\

 & FOOD & \textit{Essen

### Prozentualer Anteil an Aspektbegriffen

In [14]:
# Share of "label-explicit" tags among all tags: overall and per split,
# followed by the standard deviation and variance of the per-split shares.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        condition_splits = dataset["synth"][llm][prompting]
        aspect_terms_total, n_aspects_total, ratio_splits = [], 0, []
        for split_idx in range(5):
            split_examples = condition_splits[split_idx]
            aspect_terms_split = [
                tag["text"]
                for example in split_examples
                for tag in example["tags"]
                if tag["type"] == "label-explicit"
            ]
            n_aspects_split = sum(len(example["tags"]) for example in split_examples)

            aspect_terms_total.extend(aspect_terms_split)
            n_aspects_total += n_aspects_split
            ratio_splits.append(len(aspect_terms_split) / n_aspects_split)

        print(llm, prompting, "total:", len(aspect_terms_total) / n_aspects_total, "splits:", ratio_splits, np.std(ratio_splits), np.var(ratio_splits))

GPT-3 fixed total: 0.6528686946476704 splits: [0.5901639344262295, 0.5418544752092723, 0.7245912151330555, 0.623593699774992, 0.7827751196172249] 0.08846750333202695 0.007826499145802199
GPT-3 random total: 0.5666410601113885 splits: [0.5739503816793893, 0.5501460564751705, 0.5521085797382452, 0.6105577689243028, 0.5485636114911081] 0.02362742458662041 0.0005582551925964346
Llama70B fixed total: 0.7271791572853007 splits: [0.754650416933932, 0.6447751536719508, 0.7412407585985213, 0.7285300739787713, 0.7660462130937099] 0.04302372886341045 0.0018510412453122578
Llama70B random total: 0.7420745139354468 splits: [0.7538022813688213, 0.7445255474452555, 0.7443318861553304, 0.7428998505231689, 0.7257039055404177] 0.00913414563314946 8.343261644758334e-05


### Erster Token im Text

In [15]:
# Percentage of texts whose first token is tagged "DET", per LLM and condition.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        condition_texts = [example["text"] for split_idx in range(5) for example in dataset["synth"][llm][prompting][split_idx]]
        # nlp.pipe processes the texts as a stream in batches instead of one nlp() call per text.
        # Docs without any token (e.g. an empty text) are skipped; indexing [0] on them would raise IndexError.
        first_pos = [doc[0].pos_ for doc in nlp.pipe(condition_texts) if len(doc) > 0]
        pos_counts = Counter(first_pos)
        # Guard against an empty list so the ratio cannot divide by zero.
        article_percentage = (pos_counts["DET"] / len(first_pos)) * 100 if first_pos else 0.0
        print(f"Prozentsatz der Artikel ({llm}, {prompting}): {article_percentage}%")


KeyboardInterrupt: 

In [22]:
# Sanity check: list any zero-length texts in split 2 of the Llama70B / "random" condition.
[
    review["text"]
    for review in dataset["synth"]["Llama70B"]["random"][2]
    if len(review["text"]) == 0
]

[]

In [None]:
# 1, 3, 4