# Notebook: Correlation between Generation Length and Example Length

## Packages

In [30]:
import spacy
import random
import json
import nltk
from nltk.tokenize import word_tokenize
from scipy.stats import pearsonr
import numpy as np

## Settings

In [8]:
# Prompting conditions ("FS" presumably stands for few-shot — confirm); each value is
# also used as a sub-directory name when the synthetic splits are loaded.
FS_CONDITIONS = ["fixed", "random"]
# Identifiers of the LLMs whose output is analysed; each value is also used as a
# directory name when the synthetic splits are loaded.
LLMS = ["Llama70B", "GPT-3"]
# Seed handed to random.seed() in the setup cell.
RANDOM_STATE = 43
# Number of split files read per LLM and condition (split_0 ... split_{N_FOLDS - 1}).
N_FOLDS = 6

In [9]:
# Load the spaCy pipeline `de_core_news_lg`.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before keeping this load.
nlp = spacy.load("de_core_news_lg")
# Download the NLTK "punkt" resource; presumably required by word_tokenize, which
# count_tokens calls — TODO confirm the resource name for the installed NLTK version.
nltk.download('punkt')
# Seed Python's built-in `random` module with the configured seed.
random.seed(RANDOM_STATE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Code

In [10]:
def count_tokens(sentence):
    """Return the number of tokens NLTK's word tokenizer (German setting) yields for `sentence`."""
    return len(word_tokenize(sentence, language='german'))

In [11]:
# Read every synthetic split from disk. Resulting layout:
#   dataset_raw["synth"][<llm>][<prompting>] -> list with one loaded JSON object per split.
dataset_raw = {"synth": {}}

for llm in LLMS:
    by_condition = {}
    dataset_raw["synth"][llm] = by_condition
    for prompting in FS_CONDITIONS:
        loaded_splits = []
        by_condition[prompting] = loaded_splits
        for split in range(N_FOLDS):
            split_path = f"../07 train models/synth/{llm}/{prompting}/split_{split}.json"
            with open(split_path, 'r', encoding='utf-8') as json_file:
                split_data = json.load(json_file)
            loaded_splits.append(split_data)

In [21]:
def calculate_avg_len_fs_examples(llm_examples):
    """Return the mean token count of the "text" field over `llm_examples`.

    Each element of `llm_examples` is indexed with the key "text"; its value is
    passed to count_tokens, and the counts are averaged with np.mean.
    """
    token_counts = []
    for fs_example in llm_examples:
        token_counts.append(count_tokens(fs_example["text"]))
    return np.mean(token_counts)

In [24]:
# Container for the paired length measurements; the next cell fills it as
# ["synth"][<llm>][<prompting>] -> {"output_length": [...], "fs_length": [...]}.
llm_prediction_size_vs_fs_length = {"synth": {}}

In [25]:
# For every LLM and prompting condition, pool over all folds:
#   "output_length": token count of each generated example's "text"
#   "fs_length":     mean token count of the examples listed under its "llm_examples"
for llm in LLMS:
    llm_prediction_size_vs_fs_length["synth"][llm] = {}
    for prompting in FS_CONDITIONS:
        collected = {"output_length": [], "fs_length": []}
        llm_prediction_size_vs_fs_length["synth"][llm][prompting] = collected
        for k in range(N_FOLDS):
            examples = list(dataset_raw["synth"][llm][prompting][k])

            # 1. Token count of every generated text in this fold
            generated_lengths = [count_tokens(sample["text"]) for sample in examples]
            collected["output_length"].extend(generated_lengths)

            # 2. Average token count of the prompt examples belonging to each generated text
            prompt_example_lengths = [
                calculate_avg_len_fs_examples(sample["llm_examples"])
                for sample in examples
            ]
            collected["fs_length"].extend(prompt_example_lengths)

In [33]:
# Pearson correlation for one condition (Llama70B, random), guarded by a
# check that both lists contain the same number of entries.
llama_random_lengths = llm_prediction_size_vs_fs_length["synth"]["Llama70B"]["random"]
fs_lengths = llama_random_lengths["fs_length"]
output_lengths = llama_random_lengths["output_length"]

if len(fs_lengths) != len(output_lengths):
    print("Die Listen haben unterschiedliche Längen.")
else:
    correlation_coefficient, p_value = pearsonr(fs_lengths, output_lengths)

    print(f"Korrelationskoeffizient: {correlation_coefficient}")
    print(f"P-Wert: {p_value}")


Korrelationskoeffizient: 0.10757225908593293
P-Wert: 1.40369409201301e-24


In [35]:
# Print Pearson's r and its p-value between prompt-example length and output
# length for every LLM / prompting-condition combination.
for llm in LLMS:
    for prompting in FS_CONDITIONS:
        condition_lengths = llm_prediction_size_vs_fs_length["synth"][llm][prompting]
        correlation_coefficient, p_value = pearsonr(
            condition_lengths["fs_length"],
            condition_lengths["output_length"],
        )
        print(f"{llm} {prompting}", "---------------", sep="\n")
        print(f"Korrelationskoeffizient: {correlation_coefficient}")
        print(f"P-Wert: {p_value}")
        print("\n")
   

Llama70B fixed
---------------
Korrelationskoeffizient: 0.08928389554115965
P-Wert: 2.0828235134130387e-22


Llama70B random
---------------
Korrelationskoeffizient: 0.10757225908593293
P-Wert: 1.40369409201301e-24


GPT-3 fixed
---------------
Korrelationskoeffizient: 0.02265962642670147
P-Wert: 0.013635258958982356


GPT-3 random
---------------
Korrelationskoeffizient: 0.06803880820220079
P-Wert: 1.0379387352151512e-10


