# Notebook: Analyse Language Diversity

This notebook analyses the language diversity of the training examples in every condition: the real-only subsets and the LLM-generated (synthetic) subsets.

## Packages

In [1]:
import sys
import os
sys.path.append(os.path.abspath('../07 train models'))

In [2]:
from load_dataset_folds import load_dataset_folds
from spacy.lang.de.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import string
import spacy
import nltk
import json

In [3]:
# Load the small German spaCy pipeline; `nlp` is the module-level pipeline
# used by the lemma helpers defined further down.
nlp = spacy.load("de_core_news_sm")
# Fetch the Punkt tokenizer data for nltk (used by `word_tokenize`).
# NOTE(review): recent nltk releases look for the 'punkt_tab' resource instead — confirm
# against the installed nltk version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Settings

In [4]:
# Aspect categories for which the number of unique explicit aspect terms
# is reported separately (matched against the "label" field of a tag).
ASPECT_CATEGORIES = [
    "FOOD",
    "SERVICE",
    "PRICE",
    "GENERAL-IMPRESSION",
    "AMBIENCE",
]

## Code

In [5]:
def count_tokens(texts):
    """Return the number of tokens of every text.

    Each text is split with nltk's ``word_tokenize`` (called with its
    default settings) and the length of the resulting token list is kept.

    Args:
        texts: Iterable of strings.

    Returns:
        list: One integer token count per text, in input order.
    """
    return [len(word_tokenize(sentence)) for sentence in texts]

In [6]:
def count_unique_lemmas(texts):
    """Count the distinct lemmas that occur across all given texts.

    Every text is run through the module-level spaCy pipeline ``nlp`` and
    the ``lemma_`` of every token (no filtering is applied) is collected
    into one shared set.

    Args:
        texts: Iterable of strings.

    Returns:
        int: Size of the set of lemmas seen in any of the texts.
    """
    lemma_pool = {token.lemma_ for text in texts for token in nlp(text)}
    return len(lemma_pool)

In [7]:
def remove_stopwords_and_punctuation(text):
    """Reduce a text to the space-joined lemmas of its content words.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is dropped when its lower-cased surface form is in ``STOP_WORDS``, when
    its surface form is found in ``string.punctuation``, or when it is not
    purely alphabetic. For every remaining token the lemma is kept.

    Args:
        text: The string to clean.

    Returns:
        str: The kept lemmas joined by single spaces ('' if none remain).
    """
    kept_lemmas = []
    for token in nlp(text):
        surface = token.text
        if surface.lower() in STOP_WORDS:
            continue
        if surface in string.punctuation:
            continue
        if not surface.isalpha():
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)


def count_top_n_lemmas(texts, n):
    """Return the n most frequent lemmas of the texts as one string.

    Each text is first cleaned with ``remove_stopwords_and_punctuation``;
    the cleaned string is then parsed with ``nlp`` and the ``lemma_`` of
    every token is counted over all texts.

    NOTE(review): the cleaned string already consists of lemmas and is parsed
    a second time here, so lemmas are lemmatised again — confirm this is intended.

    Args:
        texts: Iterable of strings.
        n: Number of lemmas to keep (applied as a slice bound).

    Returns:
        str: The lemmas with the highest counts, most frequent first, joined
        by ', '. Lemmas with equal counts keep their first-seen order.
    """
    frequencies = {}
    for text in texts:
        reparsed = nlp(remove_stopwords_and_punctuation(text))
        for token in reparsed:
            lemma = token.lemma_
            frequencies[lemma] = frequencies.get(lemma, 0) + 1

    # sorted() is stable, so ties stay in dictionary insertion order.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    return ', '.join(lemma for lemma, _ in ranked[:n])

In [8]:
def get_language_statistic(n_synth, n_real, labels_fixed, model_name, splits):
    """Aggregate language-diversity statistics over a list of splits.

    Args:
        n_synth: Number of synthetic examples per split; copied into the result.
        n_real: Number of real examples per split; copied into the result.
        labels_fixed: Truthy -> the result is labelled "fixed" few-shot
            examples, otherwise "random".
        model_name: Name of the LLM that produced the examples, or None.
            When None, the LLM retry counters are not read and stay 0.
        splits: Iterable of splits. Each split is a sequence of example dicts
            with a "text" string and a "tags" list; every tag is a dict with
            "type" and, for type "label-explicit", "text" and "label".
            When model_name is not None, every example must also carry the
            four ``llm_*`` counter keys listed below.

    Returns:
        dict: The condition description (n_real, n_synth, llm, few-shot
        examples), the five most frequent lemmas over all texts, the mean
        number of unique lemmas per split, mean/sd of the token count per
        example text (pooled over all splits), the mean number of unique
        explicit aspect terms per split (overall and per entry of
        ASPECT_CATEGORIES) and the summed LLM retry counters.
    """
    llm_counter_keys = (
        "llm_invalid_xml_schema",
        "llm_invalid_xml_tags",
        "llm_aspect_polarity_in_text_but_not_in_label",
        "llm_more_than_one_sentences",
    )
    llm_totals = dict.fromkeys(llm_counter_keys, 0)

    total_texts = []
    unique_lemma_counts = []
    texts_token_counts = []
    n_unique_explicit_aspects_total = []
    n_unique_explicit_aspects_total_aspect = {}

    for examples_in_split in splits:
        texts = [example["text"] for example in examples_in_split]

        # Token counts are pooled over all splits (one entry per example text).
        texts_token_counts.extend(count_tokens(texts))

        # Unique lemmas are counted per split and averaged afterwards.
        unique_lemma_counts.append(count_unique_lemmas(texts))

        # Unique explicit aspect terms of this split, overall and per category.
        explicit_tags = [
            tag
            for example in examples_in_split
            for tag in example["tags"]
            if tag["type"] == "label-explicit"
        ]
        n_unique_explicit_aspects_total.append(len({tag["text"] for tag in explicit_tags}))

        for ac in ASPECT_CATEGORIES:
            terms_of_category = {tag["text"] for tag in explicit_tags if tag["label"] == ac}
            n_unique_explicit_aspects_total_aspect.setdefault(
                f"n_unique_aspect_terms_{ac}", []
            ).append(len(terms_of_category))

        # Retry counters exist only on LLM-generated examples.
        if model_name is not None:
            for example in examples_in_split:
                for counter_key in llm_counter_keys:
                    llm_totals[counter_key] += example[counter_key]

        # Add to total text collection
        total_texts.extend(texts)

    statistic = {
        "n_real": n_real,
        "n_synth": n_synth,
        "llm": model_name,
        "few-shot examples": "fixed" if labels_fixed else "random",
        "top_n_lemmas": count_top_n_lemmas(total_texts, 5),
        "unique_lemmas_avg": np.mean(unique_lemma_counts),
        "avg_number_of_tokens_in_example_text": np.mean(texts_token_counts),
        "sd_number_of_tokens_in_example_text": np.std(texts_token_counts),
        "n_unique_explicit_aspects_total": np.mean(n_unique_explicit_aspects_total),
    }

    # Sum of all invalid-generation retries across all splits (folds).
    for counter_key in llm_counter_keys:
        statistic[f"total_{counter_key}"] = llm_totals[counter_key]
    statistic["total_llm_retries"] = sum(llm_totals.values())

    # Per-category values are averaged over the splits, like the overall count.
    for key, counts_per_split in n_unique_explicit_aspects_total_aspect.items():
        statistic[key] = np.mean(counts_per_split)

    return statistic

In [9]:
# Collected statistics, keyed by condition name; filled by the cells below.
statistics = dict()

In [10]:
# Real-only conditions: compute the statistics on the first 500 / 1000 / 1500
# examples of every training fold.
# NOTE(review): the positional arguments of load_dataset_folds are presumably
# (llm, n_real, n_synth, few-shot condition) — verify against its definition.
train_dataset, test_dataset, validation_dataset = load_dataset_folds("Llama70B", 2000, 0, "random")
real_split_counts = [500, 1000, 1500]

for n_real_examples in real_split_counts:
    # Truncated copies of the folds; train_dataset itself is left untouched.
    splits = [fold[0:n_real_examples] for fold in train_dataset]

    statistic = get_language_statistic(0, n_real_examples, False, None, splits)
    # key layout: n_real, n_fake, llm
    statistics[f"{n_real_examples}_{0}_only_real"] = statistic

Train: 2000 5
Test: 500 5
Validation: 500 5


In [11]:
# Display the statistics collected so far (the real-only conditions when the
# notebook is run top to bottom).
statistics

{'500_0_only_real': {'n_real': 500,
  'n_synth': 0,
  'llm': None,
  'few-shot examples': 'random',
  'top_n_lemmas': 'essen, LOC, Service, freundlich, Bedienung',
  'unique_lemmas_avg': 1518.4,
  'avg_number_of_tokens_in_example_text': 12.9948,
  'sd_number_of_tokens_in_example_text': 8.515419717195389,
  'n_unique_explicit_aspects_total': 258.4,
  'total_llm_invalid_xml_schema': 0,
  'total_llm_invalid_xml_tags': 0,
  'total_llm_aspect_polarity_in_text_but_not_in_label': 0,
  'total_llm_more_than_one_sentences': 0,
  'total_llm_retries': 0,
  'n_unique_aspect_terms_FOOD': 144.4,
  'n_unique_aspect_terms_SERVICE': 52.0,
  'n_unique_aspect_terms_PRICE': 24.2,
  'n_unique_aspect_terms_GENERAL-IMPRESSION': 14.6,
  'n_unique_aspect_terms_AMBIENCE': 37.8},
 '1000_0_only_real': {'n_real': 1000,
  'n_synth': 0,
  'llm': None,
  'few-shot examples': 'random',
  'top_n_lemmas': 'essen, LOC, Service, freundlich, Bedienung',
  'unique_lemmas_avg': 2411.8,
  'avg_number_of_tokens_in_example_text'

In [12]:
# Synthetic-only conditions: for every LLM and few-shot condition, compute the
# statistics on the first 500 / 1000 / 1500 synthetic examples of every fold.
llms = ["Llama70B"]
synth_splits = [500, 1000, 1500]
few_shot_examples_conditions = [False, True]


for model_name in llms:
    for label_fixed in few_shot_examples_conditions:
        few_shot_condition = "fixed" if label_fixed else "random"
        # 1975 is the number of synthetic examples that were synthesized for each of the five real splits
        train_dataset, test_dataset, validation_dataset = load_dataset_folds(model_name, 0, 1975 if label_fixed else 1500, few_shot_condition)
        # exclude real samples from training
        train_dataset = [[example for example in split if "model" in example]
                         for split in train_dataset]

        for split_size in synth_splits:
            # load 500, 1000, 1500 synthetic examples for each 5 splits
            splits = [subset[0:split_size] for subset in train_dataset]
            statistic = get_language_statistic(split_size, 0, label_fixed, model_name, splits)
            # Key layout: n_real, n_fake, llm, few-shot condition.
            # The few-shot condition must be part of the key: without it the
            # "fixed" pass overwrites the entries written by the "random" pass.
            statistics[f"{0}_{split_size}_{model_name}_{few_shot_condition}"] = statistic
            print("llm:", model_name, "|synth_size:", split_size, "|few-shot-fixed:", label_fixed, "|n_synth:", len(splits[0]))

Train: 2000 5
Test: 500 5
Validation: 500 5
llm: Llama70B |synth_size: 500 |few-shot-fixed: False |n_synth: 500
llm: Llama70B |synth_size: 1000 |few-shot-fixed: False |n_synth: 1000
llm: Llama70B |synth_size: 1500 |few-shot-fixed: False |n_synth: 1500
Train: 1975 5
Test: 500 5
Validation: 500 5
llm: Llama70B |synth_size: 500 |few-shot-fixed: True |n_synth: 500
llm: Llama70B |synth_size: 1000 |few-shot-fixed: True |n_synth: 1000
llm: Llama70B |synth_size: 1500 |few-shot-fixed: True |n_synth: 1500


In [13]:
# Persist all collected statistics as pretty-printed JSON next to the notebook.
with open("language_statistics.json", mode="w") as statistics_file:
    json.dump(statistics, fp=statistics_file, indent=4)

In [14]:
# Display the complete statistics dictionary (real-only and synthetic conditions).
statistics

{'500_0_only_real': {'n_real': 500,
  'n_synth': 0,
  'llm': None,
  'few-shot examples': 'random',
  'top_n_lemmas': 'essen, LOC, Service, freundlich, Bedienung',
  'unique_lemmas_avg': 1518.4,
  'avg_number_of_tokens_in_example_text': 12.9948,
  'sd_number_of_tokens_in_example_text': 8.515419717195389,
  'n_unique_explicit_aspects_total': 258.4,
  'total_llm_invalid_xml_schema': 0,
  'total_llm_invalid_xml_tags': 0,
  'total_llm_aspect_polarity_in_text_but_not_in_label': 0,
  'total_llm_more_than_one_sentences': 0,
  'total_llm_retries': 0,
  'n_unique_aspect_terms_FOOD': 144.4,
  'n_unique_aspect_terms_SERVICE': 52.0,
  'n_unique_aspect_terms_PRICE': 24.2,
  'n_unique_aspect_terms_GENERAL-IMPRESSION': 14.6,
  'n_unique_aspect_terms_AMBIENCE': 37.8},
 '1000_0_only_real': {'n_real': 1000,
  'n_synth': 0,
  'llm': None,
  'few-shot examples': 'random',
  'top_n_lemmas': 'essen, LOC, Service, freundlich, Bedienung',
  'unique_lemmas_avg': 2411.8,
  'avg_number_of_tokens_in_example_text'