# Notebook: Analyse Language

## Packages

In [86]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from spacy.lang.de.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from collections import Counter
import Levenshtein
import numpy as np
import string
import spacy
import nltk
import json

## Settings

In [87]:
# German spaCy pipeline; the helper functions below call it as `nlp` to obtain tokens and their lemmas.
nlp = spacy.load("de_core_news_lg")
# Download NLTK's 'punkt' data (presumably what word_tokenize relies on — confirm).
# As the last expression of the cell, the call's return value is shown as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Constants

In [88]:
# Aspect category labels.
ASPECT_CATEGORIES = ["GENERAL-IMPRESSION",
                     "FOOD", "SERVICE", "AMBIENCE", "PRICE"]
# LLMs whose synthetic data is loaded; the names are used as directory names in the load cell.
LLMS = ["GPT-3", "Llama70B"]
# Few-shot conditions; also used as directory names in the load cell.
FS_CONDITIONS = ["fixed", "random"]
# Human-readable label for each few-shot condition.
PROMPTING_ENCODING = {"fixed": "25 fixed examples",
                      "random": "25 random examples"}
# Number of folds passed to MultilabelStratifiedKFold (n_splits).
N_FOLDS = 3
# Key read from every tag dict when building the stratification labels in get_one_hot.
CRITERIA_RS = "tag_with_polarity"
# Sentiment polarity labels.
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
# Mentioning types of an aspect.
MENTIONING_TYPE = ["implicit", "explicit"]
# All "<ASPECT>-<POLARITY>" labels; their order defines the column order of the vectors built by get_one_hot.
# NOTE(review): the aspect list is spelled out again here in a different order than ASPECT_CATEGORIES.
# Reusing ASPECT_CATEGORIES would reorder the label columns — confirm before unifying.
COMBINATIONS = [f"{aspect}-{polarity}" for aspect in ["SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE"] for polarity in POLARITIES]
# Seed passed as random_state to MultilabelStratifiedKFold.
RANDOM_STATE = 43

## Code

### Helper

In [89]:
def count_tokens(texts):
    """Return the number of tokens of every text, in input order.

    Each text is tokenized with ``word_tokenize``; the result is a list
    holding one integer per input text.
    """
    return [len(word_tokenize(document)) for document in texts]

def count_unique_lemmas(texts):
    """Count the distinct lemmas occurring across all given texts.

    Every text is run through the ``nlp`` pipeline and the ``lemma_`` of each
    token is collected; lemmas are compared as exact strings.
    """
    distinct_lemmas = {token.lemma_ for text in texts for token in nlp(text)}
    return len(distinct_lemmas)

def remove_stopwords_and_punctuation(text):
    """Lemmatize ``text`` and drop stop words, punctuation and non-alphabetic tokens.

    Returns the lemmas of the surviving tokens joined by single spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        surface = token.text
        if surface.lower() in STOP_WORDS:
            continue  # the lower-cased surface form is a stop word
        if surface in string.punctuation:
            continue
        if not surface.isalpha():
            continue  # digits, mixed alphanumerics, whitespace tokens, ...
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


def count_top_n_lemmas(texts, n):
    """Return the ``n`` most frequent lemmas as a comma-separated string.

    Each text is first cleaned with ``remove_stopwords_and_punctuation``; the
    cleaned string is then parsed again with ``nlp`` and the lemmas of that
    second pass are counted. Equally frequent lemmas keep first-seen order.
    """
    frequencies = Counter()
    for text in texts:
        reparsed = nlp(remove_stopwords_and_punctuation(text))
        frequencies.update(token.lemma_ for token in reparsed)

    # sorted() is stable (also with reverse=True), so ties stay in first-seen order.
    ranked = sorted(frequencies, key=frequencies.get, reverse=True)
    return ', '.join(ranked[:n])

### Load Datasets

In [90]:
def _read_split(path):
    """Parse one UTF-8 encoded JSON split file and return its content."""
    with open(path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


dataset = {"synth": {}, "real": []}

# Synthetic data: dataset["synth"][llm][few-shot condition] is a list of the five splits.
for llm in LLMS:
    dataset["synth"][llm] = {
        prompting: [
            _read_split(f"../07 train models/synth/{llm}/{prompting}/split_{split}.json")
            for split in range(5)
        ]
        for prompting in FS_CONDITIONS
    }

# Real data: dataset["real"] is a list of the five splits.
dataset["real"] = [
    _read_split(f"../07 train models/real/split_{split}.json")
    for split in range(5)
]

In [91]:
def get_one_hot(subset):
    """Encode each example's tags as a 0/1 vector over ``COMBINATIONS``.

    Returns a list with one NumPy array per example. Position ``j`` is 1 when
    ``COMBINATIONS[j]`` occurs among the example's ``"tags"`` (read from the
    ``CRITERIA_RS`` field of each tag) and 0 otherwise.
    """
    encoded = []
    for example in subset:
        present = {tag[CRITERIA_RS] for tag in example["tags"]}
        encoded.append(
            np.array([int(combination in present) for combination in COMBINATIONS]))
    return encoded

In [92]:
# Number of leading examples dropped from every split of a non-"random" condition before folding.
# NOTE(review): why exactly 475 entries are skipped is not visible in this notebook — confirm.
N_SKIPPED_FIXED = 475

# Replace every synthetic split by its N_FOLDS stratified folds.
# NOTE: this overwrites dataset["synth"][llm][condition][iteration] in place, so the
# cell is not re-runnable on its own — re-run the loading cell first.
for llm in LLMS:
    for few_shot_condition in FS_CONDITIONS:
        for iteration in range(5):
            subset = dataset["synth"][llm][few_shot_condition][iteration]
            if few_shot_condition != "random":
                subset = subset[N_SKIPPED_FIXED:]

            mskf = MultilabelStratifiedKFold(
                n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

            # Only the held-out (test) indices of each fold are kept; the
            # training indices are not needed here.
            section = [
                [subset[i] for i in test_index]
                for _, test_index in mskf.split(subset, get_one_hot(subset))
            ]
            dataset["synth"][llm][few_shot_condition][iteration] = section

In [93]:
# For every start split, collect three consecutive real splits (start, start+1, start+2),
# wrapping around to split 0 after the last of the five splits.
real_examples = [
    [dataset["real"][(start + offset) % 5] for offset in range(3)]
    for start in range(5)
]
dataset["real"] = real_examples

index 0  - plus 0
index 0  - plus 1
index 0  - plus 2
index 1  - plus 1
index 1  - plus 2
index 1  - plus 3
index 2  - plus 2
index 2  - plus 3
index 2  - plus 4
index 3  - plus 3
index 3  - plus 4
index 3  - plus 0
index 4  - plus 4
index 4  - plus 0
index 4  - plus 1
