# Notebook: Analyse Language

## Packages

In [None]:
from spacy.lang.de.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from collections import Counter
import Levenshtein
import numpy as np
import string
import spacy
import nltk
import json

## Settings

In [None]:
# Load the large German spaCy pipeline; `nlp` is used throughout the notebook
# for tokenisation, lemmatisation and POS tagging.
nlp = spacy.load("de_core_news_lg")
# Fetch the NLTK 'punkt' resource (presumably required by `word_tokenize` in
# count_tokens - confirm against the installed NLTK version).
nltk.download('punkt')

## Constants

In [None]:
# Aspect categories; compared against the "label" field of the example tags.
ASPECT_CATEGORIES = ["GENERAL-IMPRESSION",
                     "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
# Model names; each one is a sub-directory of the synthetic data folder.
LLMS = ["GPT-3", "Llama70B"]
# Prompting conditions (FS presumably stands for few-shot); each one is a
# sub-directory below the model folder.
FS_CONDITIONS = ["fixed", "random"]
# Row labels for the prompting conditions in the printed LaTeX table.
PROMPTING_ENCODING = {"fixed": "25 fixed examples",
                      "random": "25 random examples"}

## Code

### Helper

In [None]:
def count_tokens(texts, language="english"):
    """Count the NLTK word tokens of every text.

    Args:
        texts: Iterable of strings to tokenise.
        language: Name of the Punkt model forwarded to ``word_tokenize``.
            The default ``"english"`` is NLTK's own default, so existing
            calls keep producing the same numbers. Pass ``"german"`` to use
            the German sentence model for German input.

    Returns:
        A list with one token count per text, in input order.
    """
    return [len(word_tokenize(text, language=language)) for text in texts]

def count_unique_lemmas(texts):
    """Return how many distinct lemmas occur across all given texts.

    Every text is run through the spaCy pipeline ``nlp``; the lemma of each
    token (punctuation and stop words included) is collected into one set.
    """
    distinct = {token.lemma_ for text in texts for token in nlp(text)}
    return len(distinct)

def remove_stopwords_and_punctuation(text):
    """Return the space-joined lemmas of the content words in ``text``.

    A token is dropped when its lower-cased surface form is a German stop
    word, when its surface form is contained in ``string.punctuation``, or
    when it is not purely alphabetic. The surviving tokens contribute their
    lemma, not their surface form.
    """
    kept_lemmas = []
    for token in nlp(text):
        surface = token.text
        if surface.lower() in STOP_WORDS:
            continue
        if surface in string.punctuation:
            continue
        if not surface.isalpha():
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


def count_top_n_lemmas(texts, n):
    """Return the ``n`` most frequent lemmas as a comma-separated string.

    Each text is first reduced to its content-word lemmas, and that cleaned
    string is then parsed again so that the lemmas of the re-parsed tokens
    are the ones counted. Lemmas with equal counts keep the order in which
    they were first seen.
    """
    frequencies = Counter()
    for text in texts:
        reparsed = nlp(remove_stopwords_and_punctuation(text))
        frequencies.update(token.lemma_ for token in reparsed)

    # Stable sort: ties stay in first-seen (insertion) order.
    ranked = sorted(frequencies, key=frequencies.get, reverse=True)
    return ', '.join(ranked[:n])

### Load Datasets

In [None]:
def _load_split(path):
    """Read one split JSON file and attach a spaCy ``Doc`` to every example."""
    with open(path, 'r', encoding='utf-8') as json_file:
        examples = json.load(json_file)
    for example in examples:
        example["tokenized_text"] = nlp(example["text"])
    return examples


dataset = {"synth": {}, "real": []}

# Synthetic data: dataset["synth"][llm][prompting] is a list of five splits.
for llm in LLMS:
    dataset["synth"][llm] = {
        prompting: [
            _load_split(f"../07 train models/synth/{llm}/{prompting}/split_{split}.json")
            for split in range(5)
        ]
        for prompting in FS_CONDITIONS
    }

# Real data: dataset["real"] is a list of five splits.
dataset["real"].extend(
    _load_split(f"../07 train models/real/split_{split}.json")
    for split in range(5)
)


### Count Avg Number of Words

In [None]:
# Mean NLTK token count per example for every model / prompting condition:
# first over all splits together, then for each of the five splits.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        print("-----")

        all_texts = [example["text"] for split_data in splits for example in split_data]
        overall_mean = np.mean(count_tokens(all_texts))
        print(llm, prompting, round(overall_mean, 2))

        per_split_means = []
        for split_id in range(0, 5):
            split_texts = [example["text"] for example in splits[split_id]]
            per_split_means.append(round(np.mean(count_tokens(split_texts)), 2))
        print(llm, prompting, per_split_means)

In [None]:
# Mean NLTK token count per example over all real-data splits.
real_texts = [example["text"] for split_examples in dataset["real"] for example in split_examples]
real_mean_tokens = np.mean(count_tokens(real_texts))
print("Real", round(real_mean_tokens, 2))

### Analyse Text Similarity: Sampling Statistics

In [None]:
# Surface forms of all spaCy tokens in the real data, across every split.
tokenized_texts = []
for split_examples in dataset["real"]:
    for example in split_examples:
        tokenized_texts.extend(token.text for token in example["tokenized_text"])

# Frequency of every distinct surface form; the cell shows the vocabulary size.
word_distribution = Counter(tokenized_texts)
len(word_distribution)

In [None]:
from itertools import combinations

def calculate_average_unique_words(word_distribution, num_draws):
    """Return the expected number of distinct words among ``num_draws`` draws.

    Each draw picks one word independently (with replacement), with a
    probability proportional to its count in ``word_distribution``.

    Args:
        word_distribution: Mapping from word to its occurrence count
            (e.g. a ``collections.Counter``).
        num_draws: Non-negative number of draws.

    Returns:
        The expected number of distinct words as a float; ``0.0`` for an
        empty distribution or for zero draws.

    Raises:
        ValueError: If ``num_draws`` is negative.
    """
    if num_draws < 0:
        raise ValueError("num_draws must be non-negative")

    total_words = sum(word_distribution.values())
    if total_words == 0:
        return 0.0

    # Closed form instead of enumerating combinations, which is infeasible
    # for a real vocabulary and ignores the word frequencies: a word with
    # probability p is absent from all draws with probability
    # (1 - p) ** num_draws, and by linearity of expectation the per-word
    # presence probabilities add up to the expected distinct-word count.
    return float(sum(
        1 - (1 - count / total_words) ** num_draws
        for count in word_distribution.values()
    ))

# Number of draws intended for calculate_average_unique_words.
num_draws = 10

# The evaluation below is currently disabled (commented out).
#average_unique_words = calculate_average_unique_words(word_distribution, num_draws)

#print(f"Durchschnittliche Anzahl einzigartiger Wörter bei {num_draws} Zügen: {average_unique_words}")


### Analyse Text Similarity: Levenshtein Distance

In [None]:
def average_word_level_levenshtein_distance(docs):
    """Return the mean word-level Levenshtein distance over all document pairs.

    Every document is reduced to its list of token surface forms, and the
    edit distance is computed on those token lists (one edit = one token),
    not on characters. The distance is the raw edit count; it is not
    normalised by document length.

    Args:
        docs: Sequence of iterables whose items expose a ``.text`` attribute
            (e.g. spaCy ``Doc`` objects).

    Returns:
        The average distance over all unordered pairs, or ``0`` when fewer
        than two documents are given.
    """
    tokenized_texts = [[token.text for token in doc] for doc in docs]

    total_distance = 0
    pair_count = 0
    # combinations yields each unordered pair (i < j) exactly once.
    for tokens1, tokens2 in combinations(tokenized_texts, 2):
        total_distance += Levenshtein.distance(tokens1, tokens2)
        pair_count += 1

    return total_distance / pair_count if pair_count > 0 else 0

In [None]:
# Average pairwise word-level Levenshtein distance inside every synthetic split.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        for split_idx in range(5):
            docs_in_split = [example["tokenized_text"] for example in splits[split_idx]]
            avg_distance = average_word_level_levenshtein_distance(docs_in_split)
            print(llm, prompting, split_idx, avg_distance)

            # Disabled per-aspect-category variant:
            # for ac in ASPECT_CATEGORIES:
            #     tags_in_split = [nlp(tag["text"]) for example in dataset["synth"][llm][prompting][split_idx] for tag in example["tags"] if tag["type"] == "label-explicit" and tag["label"] == ac]
            #     print(llm, prompting, split_idx, ac, average_word_level_cosine_similarity(tags_in_split))

### Most frequent terms

Ähnlich wie bei den realen Daten sind die häufigsten Aspektbegriffe solche, die den Aspekt selbst benennen.

In [None]:
# Scratch calculation: rounds 456.6 to the nearest integer (457).
round(456.6)

In [None]:
# Print one LaTeX table row per aspect category for every model / prompting
# condition: mean number of explicit aspect terms per split, mean number of
# distinct explicit aspect terms per split, and the five most frequent terms
# with their count divided by the five splits.
for llm in LLMS:
    print(llm)
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        last_ac_idx = len(ASPECT_CATEGORIES) - 1

        for ac_idx, aspect_category in enumerate(ASPECT_CATEGORIES):
            aspect_terms_all_splits = []
            n_aspect_terms_in_splits = []
            n_unique_aspects_in_split = []

            for split_idx in range(5):
                # Only explicit mentions of this category carry an aspect term.
                aspect_terms_in_split = [
                    tag["text"]
                    for example in splits[split_idx]
                    for tag in example["tags"]
                    if tag["label"] == aspect_category and tag["type"] == "label-explicit"
                ]
                aspect_terms_all_splits += aspect_terms_in_split
                n_aspect_terms_in_splits.append(len(aspect_terms_in_split))
                n_unique_aspects_in_split.append(len(set(aspect_terms_in_split)))

            most_common_aspect_terms = Counter(aspect_terms_all_splits).most_common(5)
            term_string = ", ".join(
                f"\\textit{{{term}}} ({round(count/5,2)})"
                for term, count in most_common_aspect_terms
            )

            mean_terms = round(np.mean(n_aspect_terms_in_splits))
            mean_unique = round(np.mean(n_unique_aspects_in_split), 2)

            # The first row of a condition carries the condition label; the
            # last row closes the group with \hline. The first-row layout
            # takes precedence if both apply.
            is_first = ac_idx == 0
            if is_first:
                row_head = f" {PROMPTING_ENCODING[prompting]} & {aspect_category} "
            else:
                row_head = f" & {aspect_category}  "
            if ac_idx == last_ac_idx and not is_first:
                row_end = " \\\\ \\hline"
            else:
                row_end = " \\\\"

            print(f"\n{row_head}& {mean_terms} & {mean_unique} & {term_string}{row_end}")

### Prozentualer Anteil an Aspektbegriffen je Aspekt

In [None]:
# Share of tags that are explicit aspect terms ("label-explicit") among all
# tags: overall, per split, plus standard deviation and variance of the
# per-split ratios.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        n_explicit_per_split = []
        n_tags_per_split = []
        ratio_splits = []

        for split_idx in range(5):
            split_examples = splits[split_idx]
            explicit_terms = [
                tag["text"]
                for example in split_examples
                for tag in example["tags"]
                if tag["type"] == "label-explicit"
            ]
            n_tags = sum(len(example["tags"]) for example in split_examples)

            n_explicit_per_split.append(len(explicit_terms))
            n_tags_per_split.append(n_tags)
            ratio_splits.append(len(explicit_terms) / n_tags)

        total_ratio = sum(n_explicit_per_split) / sum(n_tags_per_split)
        print(llm, prompting, "total:", total_ratio, "splits:", ratio_splits, np.std(ratio_splits), np.var(ratio_splits))

### Erster Token im Text

In [None]:
# TODO: also compute this for the regular (real) data

In [None]:
# Percentage of synthetic examples whose first token is tagged DET by spaCy.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        splits = dataset["synth"][llm][prompting]
        first_pos = []
        for split_idx in range(5):
            for example in splits[split_idx]:
                first_pos.append(example["tokenized_text"][0].pos_)

        n_det = sum(1 for pos in first_pos if pos == "DET")
        article_percentage = (n_det / len(first_pos)) * 100
        print(f"Prozentsatz der Artikel ({llm}, {prompting}): {article_percentage}%")
