# Notebook: Analyse Language Diversity

This notebook is used to analyse the language diversity of all conditions.

## Packages

In [1]:
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import spacy
import nltk
import json

In [2]:
from spacy.lang.de.stop_words import STOP_WORDS
import string

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
nlp = spacy.load("de_core_news_sm")

## Settings

In [5]:
N_REAL = 0
N_SYNTH = 500
LABELS_FIXED = True
MODEL_NAME = "Llama13B" # "Llama70B", "GPT-3" 

## Parameters

In [6]:
MODELS = ["Llama70B", "GPT-3"]
SAMPLE_COUNT = [500, 1000, 2000]
SAMPLING_STRATEGY = [True, False]

## Code

In [7]:
def count_tokens(texts):
    token_counts = [] 
    for text in texts:
        tokens = word_tokenize(text)
        token_counts.append(len(tokens))
    return token_counts

In [8]:
def count_unique_lemmas(texts):
    unique_lemmas = set()
    for text in texts:
        doc = nlp(text)
        for token in doc:
            unique_lemmas.add(token.lemma_)
    return len(unique_lemmas)

In [9]:
def remove_stopwords_and_punctuation(text):
    doc = nlp(text)
    cleaned_tokens = [token.lemma_ for token in doc if token.text.lower() not in STOP_WORDS and token.text not in string.punctuation]
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

def count_top_n_lemmas(texts, n):
    lemma_counts = {}
    for text in texts:
        cleaned_text = remove_stopwords_and_punctuation(text)
        doc = nlp(cleaned_text)
        for token in doc:
            lemma = token.lemma_
            if lemma in lemma_counts:
                lemma_counts[lemma] += 1
            else:
                lemma_counts[lemma] = 1
    
    sorted_lemmas = sorted(lemma_counts, key=lambda lemma: lemma_counts[lemma], reverse=True)
    top_n_lemmas = sorted_lemmas[:n]
    
    return ', '.join(top_n_lemmas)

In [14]:
def get_language_statistic(n_synth, n_real, labels_fixed, model_name):
    total_texts = []
    unique_lemma_counts = []
    texts_token_counts = []
    n_aspects_total = []
    n_implicit_aspects_total = []
    n_explicit_aspects_total = []
    total_llm_invalid_xml_schema = 0
    total_llm_invalid_xml_tags = 0
    total_llm_aspect_polarity_in_text_but_not_in_label = 0
    total_llm_more_than_one_sentences = 0
    total_llm_no_german_language = 0
    
    for i in range(0, 5):
        print(i)
        examples_in_split = []
    
        # Load Real Split
        real_path = f"../07 train classifier/real/split_{i}.json"
        with open(real_path, 'r') as file:
            real_data = json.load(file)[:n_real]
            examples_in_split = real_data
    
        # Load Synth Split
        if n_synth > 0:
            fake_path = f"../07 train classifier/synth/{model_name}/{'fixed' if labels_fixed else 'random'}/split_{i}.json"
            with open(fake_path, 'r') as file:
                fake_data = json.load(file)[:n_synth]
                examples_in_split = fake_data
        
        texts = [example["text"] for example in examples_in_split]
    
    
        # Calculate n tokens in texts
        texts_token_count = count_tokens(texts)
        for count in texts_token_count:
            texts_token_counts.append(count)
    
        # Calcuate unique lemmas in text
        unique_lemma_count = count_unique_lemmas(texts)
        unique_lemma_counts.append(unique_lemma_count)
    
        # Calculate number of aspects (implicit+explicit)
        n_aspects = len([tag["text"] for example in examples_in_split for tag in example["tags"]])
        n_aspects_total.append(n_aspects)
    
    
        # Calculate number of implicit aspects
        n_implicit_aspects = len([tag["text"] for example in examples_in_split for tag in example["tags"] if tag["text"] is None])
        n_implicit_aspects_total.append(n_implicit_aspects)
    
        # Calculate number of unique aspect terms
        explicit_aspects = [tag["text"] for example in examples_in_split for tag in example["tags"] if tag["text"] is not None]
        n_unique_aspect_terms = len(set(explicit_aspects))
        n_explicit_aspects_total.append(n_unique_aspect_terms)
        
        print(examples_in_split[0]["llm_invalid_xml_schema"])
        # Add to total count
        for example in examples_in_split:
            total_llm_invalid_xml_schema += example["llm_invalid_xml_schema"]
            total_llm_invalid_xml_tags += example["llm_invalid_xml_tags"]
            total_llm_aspect_polarity_in_text_but_not_in_label += example["llm_aspect_polarity_in_text_but_not_in_label"]
            total_llm_more_than_one_sentences += example["llm_more_than_one_sentences"]
            total_llm_no_german_language += example["llm_no_german_language"]
    
    
        # Add to total text collection
        total_texts.extend(texts)
        
    top_n_lemmas = count_top_n_lemmas(total_texts, 5)
    unique_lemmas_avg = np.mean(unique_lemma_counts)
    texts_token_counts_avg = np.mean(texts_token_counts)
    texts_token_counts_sd = np.std(texts_token_counts)
    n_aspects_avg = np.mean(n_aspects_total)
    n_implicit_aspects_avg = np.mean(n_implicit_aspects_total) / (np.mean(n_implicit_aspects_total) + np.mean(n_explicit_aspects_total))
    n_explicit_aspects_avg = np.mean(n_explicit_aspects_total) / (np.mean(n_implicit_aspects_total) + np.mean(n_explicit_aspects_total))
    
    statistic = {
      "condition": f"{n_synth} fake" if n_synth > 0 else (f"{n_real} fake" if n_real > 0 else "unknown condition"),
      "llm": model_name,
      "few-shot examples": "fixed" if labels_fixed else "random",
      "top_n_lemmas": top_n_lemmas,
      "unique_lemmas_avg": unique_lemmas_avg,
      "texts_token_counts_avg": texts_token_counts_avg,
      "texts_token_counts_sd": texts_token_counts_sd,
      "n_aspects_avg": n_aspects_avg,
      "n_implicit_aspects_avg": n_implicit_aspects_avg,
      "n_explicit_aspects_avg": n_explicit_aspects_avg,
      "total_llm_invalid_xml_schema": total_llm_invalid_xml_schema, # Summe aller invaliden retries über alle 5 folds hinweg
      "total_llm_invalid_xml_tags": total_llm_invalid_xml_tags,
      "total_llm_aspect_polarity_in_text_but_not_in_label": total_llm_aspect_polarity_in_text_but_not_in_label,
      "total_llm_more_than_one_sentences": total_llm_more_than_one_sentences,
      "total_llm_no_german_language": total_llm_no_german_language,
      "total_llm_retries": total_llm_invalid_xml_schema + total_llm_invalid_xml_tags + total_llm_aspect_polarity_in_text_but_not_in_label + total_llm_more_than_one_sentences + total_llm_no_german_language
    }

    return statistic
    

In [15]:
statistics = []
statistics.append(get_language_statistic(5, 0, True, "Llama13B"))
statistics

0
0
1
0
2
0
3
0
4
0


[{'condition': '5 fake',
  'llm': 'Llama13B',
  'few-shot examples': 'fixed',
  'top_n_lemmas': 'Service, Preis, essen, LOC, Speise',
  'unique_lemmas_avg': 42.4,
  'texts_token_counts_avg': 13.2,
  'texts_token_counts_sd': 7.919595949289332,
  'n_aspects_avg': 12.4,
  'n_implicit_aspects_avg': 0.2833333333333333,
  'n_explicit_aspects_avg': 0.7166666666666667,
  'total_llm_invalid_xml_schema': 0,
  'total_llm_invalid_xml_tags': 0,
  'total_llm_aspect_polarity_in_text_but_not_in_label': 3,
  'total_llm_more_than_one_sentences': 11,
  'total_llm_no_german_language': 1,
  'total_llm_retries': 15}]

In [12]:
for synth_count in SAMPLE_COUNT:
    for sampling_strategy in SAMPLING_STRATEGY:
        for model in MODELS:
            pass
            #statistics.append(get_language_statistic(synth_count, 0, sampling_strategy, model))

In [13]:
for real_count in SAMPLE_COUNT:
    # model and sampling strategy are irrelevant. only real ones are considered anyway
    print(real_count)
    statistics.append(get_language_statistic(0, real_count, False, MODELS[0]))

500
0


KeyError: 'llm_invalid_xml_schema'

In [None]:
statistics

In [None]:
with open("language_statistics.json", 'w') as json_file:
    json.dump(statistics, json_file, indent=4) 