# ⚙️ Install Libraries and Download Dataset

After this step, the runtime must be restarted.

In [None]:
# Colab workaround (see the linked question): make `locale.getpreferredencoding`
# always report UTF-8, presumably so that the `!` shell commands below work.
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale
def getpreferredencoding(do_setlocale = True):
    # The `do_setlocale` argument is accepted but ignored; UTF-8 is always returned.
    return "UTF-8"
# Replace the standard-library function with the stub above.
locale.getpreferredencoding = getpreferredencoding

# Install the Python packages (no versions are pinned here).
!pip install simplet5 evaluate sacrebleu tqdm spacy matplotlib openai zenodo-get bert_score tensorflow

# Fetch the Zenodo record with this DOI using the zenodo_get command-line tool.
!zenodo_get 10.5281/zenodo.8023142

# 📝 Initialize config
Here you can also determine which models should be evaluated. By default, the models that were trained as part of the master's thesis are used.

To change the models, edit the `<x>_models` entries of the config (e.g. `t5_models`, `bart_models`). You can also use local models by specifying the appropriate file path.

In [None]:
# Central configuration dictionary. The generation cells read the `*_models`
# entries and `eval_data_path`; the evaluation cell reads `evaluation_models`.
config = {
    "models": [
        "t5-small",
        "t5-base",
        "t5-large",
    ],
    "result_path": "./dataset_eval_results/",
    "csv_comments_path": "../data/filtered_data/intense_style_comments.csv",
    "gyafc_path": [
        "../data/GYAFC/GYAFC_Corpus/Entertainment_Music/train",
        "../data/GYAFC/GYAFC_Corpus/Family_Relationships/train",
    ],
    # T5 / Flan-T5 models: `name` prefixes the result file, `path` is loaded by SimpleT5.
    "t5_models": [
        {
            "name": "t5-base",
            "path": "Suppi123/T5-Base-Text-Style-Transfer-Using-Examples",
        },
        {
            "name": "flan-t5-base",
            "path": "Suppi123/Flan-T5-Base-Text-Style-Transfer-Using-Examples",
        },
    ],
    # BERT models are given as plain model identifiers.
    "bert_models": [
        "Suppi123/Bert-Base-Uncased-Text-Style-Transfer-Using-Examples",
    ],
    # BART models use the same name/path structure as the T5 entries.
    "bart_models": [
        {
            "name": "bart-base",
            "path": "Suppi123/Bart-Base-Text-Style-Transfer-Using-Examples",
        },
    ],
    # GPT-Neo models are given as plain model identifiers.
    "gptneo_models": [
        "Suppi123/GPT-NEO-2.7B-Text-Style-Transfer-Using-Examples",
    ],
    "trainings_data_path": "training_labeled_with_style_samples.json",
    # JSON file with the prompts that every generation cell loads.
    "eval_data_path": "eval_labeled_with_style_samples.json",
    "min_perplexity": 100,
    "result_data": "results/bart_epoch0_results.json",
    # Subreddit names of the style-specific evaluation models.
    "evaluation_models": [
        "antiwork",
        "atheism",
        "Conservative",
        "conspiracy",
        "dankmemes",
        "gaybros",
        "leagueoflegends",
        "lgbt",
        "Libertarian",
        "linguistics",
        "MensRights",
        "news",
        "offbeat",
        "PoliticalCompassMemes",
        "politics",
        "teenagers",
        "TrueReddit",
        "TwoXChromosomes",
        "wallstreetbets",
        "worldnews",
    ],
}

# ✍️ Generate output with T5 and Flan-T5

In [None]:
import json
from simplet5 import SimpleT5


# Load prompts. The context manager closes the file handle once the JSON is parsed.
with open(config['eval_data_path'], encoding='utf-8') as eval_data_file:
    eval_data = json.load(eval_data_file)['data']


# Build one textual prompt per evaluation entry.
# The wording (including "Here a example sentences") is reproduced verbatim on purpose:
# presumably the fine-tuned models were trained on exactly this format - do not "fix" it here.
input_texts = []
for prompt in eval_data:
    style_samples = ''.join('{' + sample + '} ' for sample in prompt['style_samples'])
    trainings_prompt = (
        'Here a example sentences: ' + style_samples
        + 'Here is a sentence: {' + prompt['input_sentence'] + '} '
        + 'Here is a rewrite of this sentence according to the example sentences: {'
    )
    input_texts.append({'prompt': trainings_prompt, 'input': prompt['input_sentence'],
                        'subreddit': prompt['subreddit'], 'result_sentence': prompt['result_sentence']})


# Generate one output per prompt with every configured T5 model and write
# the results to "<name>_results.json".
for model_config in config['t5_models']:
    results = []
    model = SimpleT5()
    model.load_model("t5", model_config['path'], use_gpu=True)
    for input_text in input_texts:
        output_text = model.predict(source_text=input_text['prompt'], repetition_penalty=1.0,
                                    num_return_sequences=1, num_beams=5)
        # Only one sequence is requested, so the first list element is the prediction.
        results.append({'prompt': input_text['prompt'], 'input': input_text['input'], 'output': output_text[0],
                        'subreddit': input_text['subreddit'], 'result_sentence': input_text['result_sentence']})

    with open(f"{model_config['name']}_results.json", "w") as outfile:
        json.dump({'data': results}, outfile, indent=4)


# ✍️ Generate output with BART

In [None]:
import json
import torch
from tqdm import tqdm
from transformers import BartForConditionalGeneration, BartTokenizer

# Load prompts. The context manager closes the file handle once the JSON is parsed.
with open(config['eval_data_path'], encoding='utf-8') as eval_data_file:
    eval_data = json.load(eval_data_file)['data']

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

sep_token = tokenizer.sep_token
bos_token = tokenizer.bos_token
eos_token = tokenizer.eos_token

# Build the prompts: every style sample is wrapped as <bos>sample<sep>,
# followed by the input sentence wrapped as <bos>sentence<eos>.
input_texts = []
for prompt in eval_data:
    style_part = "".join(bos_token + sample + sep_token for sample in prompt['style_samples'])
    trainings_prompt = style_part + bos_token + prompt['input_sentence'] + eos_token
    input_texts.append({'prompt': trainings_prompt, 'input': prompt['input_sentence'],
                        'subreddit': prompt['subreddit']})

# Generate one output per prompt with every configured BART model and write
# the results to "<name>_results.json".
test_models = tqdm(config['bart_models'])
for model_config in test_models:
    model_path = model_config['path']
    model_name = model_config['name']
    results = []
    test_models.set_description(f"Processing {model_name}")
    model = BartForConditionalGeneration.from_pretrained(model_path).to(device)
    for input_text in input_texts:
        input_ids = tokenizer.encode(input_text['prompt'], return_tensors='pt').to(device)
        output_ids = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        results.append({'prompt': input_text['prompt'], 'input': input_text['input'], 'output': output_text,
                        'subreddit': input_text['subreddit']})

    with open(f"{model_name}_results.json", "w") as outfile:
        json.dump({'data': results}, outfile, indent=4)


# ✍️ Generate output with BERT

In [None]:
import json
import torch
from tqdm import tqdm
from transformers import BertLMHeadModel, BertTokenizer

import os

# Load prompts. The context manager closes the file handle once the JSON is parsed.
with open(config['eval_data_path'], encoding='utf-8') as eval_data_file:
    eval_data = json.load(eval_data_file)['data']

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

sep_token = tokenizer.sep_token

# Build the prompts: every style sample and the input sentence are each
# surrounded by the separator token.
input_texts = []
for prompt in eval_data:
    style_part = "".join(sep_token + sample + sep_token for sample in prompt['style_samples'])
    trainings_prompt = style_part + sep_token + prompt['input_sentence'] + sep_token
    input_texts.append({'prompt': trainings_prompt, 'input': prompt['input_sentence'],
                        'subreddit': prompt['subreddit']})

# Generate one output per prompt with every configured BERT model.
test_models = tqdm(config['bert_models'])
for model_name in test_models:
    results = []
    test_models.set_description(f"Processing {model_name}")
    model = BertLMHeadModel.from_pretrained(model_name).to(device)
    for input_text in input_texts:
        input_ids = tokenizer.encode(input_text['prompt'], return_tensors='pt').to(device)
        output_ids = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(output_text)
        results.append({'prompt': input_text['prompt'], 'input': input_text['input'], 'output': output_text,
                        'subreddit': input_text['subreddit']})

    # The model identifier may contain a "/" (e.g. "<user>/<model>"), which makes the
    # result path point into a sub-directory. Create it first so that the write does
    # not fail after all outputs have been generated.
    output_path = f"{model_name}_results.json"
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w") as outfile:
        json.dump({'data': results}, outfile, indent=4)


# ✍️ Generate output with GPT-Neo

In [None]:
import json
import torch
from tqdm import tqdm
from transformers import GPTNeoForCausalLM, GPT2Tokenizer


import os

# Load prompts. The context manager closes the file handle once the JSON is parsed.
with open(config['eval_data_path'], encoding='utf-8') as eval_data_file:
    eval_data = json.load(eval_data_file)['data']

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')

# The end-of-sequence token doubles as the separator between the prompt parts.
sep_token = tokenizer.eos_token

# Build the prompts: every style sample and the input sentence are each
# surrounded by the separator token.
input_texts = []
for prompt in eval_data:
    style_part = "".join(sep_token + sample + sep_token for sample in prompt['style_samples'])
    training_prompt = style_part + sep_token + prompt['input_sentence'] + sep_token
    input_texts.append({'prompt': training_prompt, 'input': prompt['input_sentence'],
                        'subreddit': prompt['subreddit']})

# Generate one output per prompt with every configured GPT-Neo model.
test_models = tqdm(config['gptneo_models'])
for model_name in test_models:
    results = []
    test_models.set_description(f"Processing {model_name}")
    model = GPTNeoForCausalLM.from_pretrained(model_name).to(device)
    for input_text in input_texts:
        input_ids = tokenizer.encode(input_text['prompt'], return_tensors='pt').to(device)
        # NOTE(review): max_length=100 is passed to `generate` together with the full prompt;
        # presumably prompts with several style samples can come close to or exceed that
        # length - confirm whether a limit on newly generated tokens was intended.
        output_ids = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(output_text)
        results.append({'prompt': input_text['prompt'], 'input': input_text['input'], 'output': output_text,
                        'subreddit': input_text['subreddit']})

    # The model identifier may contain a "/" (e.g. "<user>/<model>"), which makes the
    # result path point into a sub-directory. Create it first so that the write does
    # not fail after all outputs have been generated.
    output_path = f"{model_name}_results.json"
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w") as outfile:
        json.dump({'data': results}, outfile, indent=4)


# 🏋️ Train evaluation models

Optional. You only need it if you want to examine the style-specific perplexities of the data.

May take a while

In [None]:
import os
import json
import pandas
import datasets
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments, \
    AutoModelForCausalLM


# --- Fine-tuning settings -------------------------------------------------
# base-model for fine-tuning
model_name = 'gpt2'
# number of epochs each evaluation model is trained for
trainings_epochs = 5

# --- Data selection -------------------------------------------------------
# minimal perplexity for trainings comments
min_perplexity = 100
# whether submission titles are added to the training texts
use_submisson_data = False
# CSV file the comments are read from
csv_comments_path = 'reddit_comments.csv'

# --- Directories ----------------------------------------------------------
# directory the training/evaluation text files are written to and read from
data_dir = 'trainings_data'
trainings_data_dir = 'trainings_data'
# directory the fine-tuned evaluation models are stored in
eval_models_dir = 'eval_models'

# --- Subreddits for which an evaluation model is trained --------------------
evaluation_models = [
    "antiwork",
    "atheism",
    "Conservative",
    "conspiracy",
    "dankmemes",
    "gaybros",
    "leagueoflegends",
    "lgbt",
    "Libertarian",
    "linguistics",
    "MensRights",
    "news",
    "offbeat",
    "PoliticalCompassMemes",
    "politics",
    "teenagers",
    "TrueReddit",
    "TwoXChromosomes",
    "wallstreetbets",
    "worldnews",
]


def create_trainings_data(subreddit_name, eval_split=0.8):
    """
    Generates text files that are used for training the evaluation model. A training file with the naming
    training_[subredditname].txt and an evaluation file with the naming eval_[subredditname].txt are created
    in the directory given by the module-level variable ``data_dir``.

    The comments are read from the module-level ``csv_comments_path``. Comments whose body is "[deleted]",
    whose body is not a string (e.g. a missing value) or whose perplexity is below the module-level
    ``min_perplexity`` are left out.

    :param subreddit_name: Name of the subreddit for which an evaluation model is to be trained
    :type subreddit_name: str
    :param eval_split: Fraction of the texts that goes into the training file. By default 0.8
    :type eval_split: float
    :return: Number of comments that were kept for this subreddit
    :rtype: int
    """
    end_of_text = ' <|endoftext|>'

    comment_data = pandas.read_csv(csv_comments_path)
    comments_data_subreddit = comment_data[comment_data['subreddit'] == subreddit_name]

    texts = []
    for body, perplexity in zip(comments_data_subreddit['body'], comments_data_subreddit['perplexity']):
        # Missing bodies are read as NaN (a float) and cannot be stripped.
        if not isinstance(body, str) or body == "[deleted]":
            continue
        if perplexity < min_perplexity:
            continue
        texts.append(body.strip() + end_of_text)
    # Count only the comments that were actually kept (the counter used to start at 1).
    number_of_comments = len(texts)

    if use_submisson_data:
        # NOTE(review): the submissions are read from the same CSV as the comments -
        # confirm whether a separate submissions file was intended.
        submission_data = pandas.read_csv(csv_comments_path)
        submission_data_subreddit = submission_data[submission_data['subreddit'] == subreddit_name]
        for title in submission_data_subreddit['title']:
            if isinstance(title, str):
                texts.append(title.strip() + end_of_text)

    # split data in training and eval
    split_index = int(len(texts) * eval_split)
    trainings_data = texts[:split_index]
    eval_data = texts[split_index:]

    # write one text per line; UTF-8 is set explicitly so that the result does not depend on the platform default
    with open(f"{data_dir}/training_{subreddit_name}.txt", "w", encoding="utf-8") as outfile:
        outfile.write("".join(sentence + "\n" for sentence in trainings_data))
    with open(f"{data_dir}/eval_{subreddit_name}.txt", "w", encoding="utf-8") as outfile:
        outfile.write("".join(sentence + "\n" for sentence in eval_data))
    return number_of_comments


def train_model(subreddit_name, model_dir=eval_models_dir):
    """
    Fine-tune the base model on the text files of one subreddit and save the result.

    The training and evaluation files are read from the module-level ``data_dir``
    (``training_<subreddit>.txt`` / ``eval_<subreddit>.txt``); the base model is taken
    from the module-level ``model_name`` and the number of epochs from ``trainings_epochs``.

    :param subreddit_name: Name of the subreddit for which an evaluation model is to be trained
    :param model_dir: Folder in which the models should be saved (one sub-folder per subreddit)
    """
    data_files = {
        "train": f"{data_dir}/training_{subreddit_name}.txt",
        "test": f"{data_dir}/eval_{subreddit_name}.txt",
    }

    target_device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # currently only models that support the gpt2 tokenizer can be used
    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
    language_model = AutoModelForCausalLM.from_pretrained(model_name).to(target_device)

    # make sure the output folder exists
    output_dir = f"./{model_dir}/{subreddit_name}"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # NOTE(review): no evaluation strategy is passed here, so `eval_steps` presumably only
    # takes effect if the library default enables step-wise evaluation - confirm.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=trainings_epochs,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        eval_steps=400,
        save_steps=800,
        warmup_steps=500,
    )

    raw_dataset = datasets.load_dataset("text", data_files=data_files, sample_by="line")

    # mlm=False: the collator is configured without masked-language-modelling
    collator = DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False)

    def tokenize_function(examples):
        # Drop empty / whitespace-only lines, then tokenize with truncation to 512 tokens.
        kept_lines = [line for line in examples["text"] if line.strip()]
        examples["text"] = kept_lines
        return gpt2_tokenizer(kept_lines, truncation=True, max_length=512)

    tokenized_dataset = raw_dataset.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=["text"],
    )

    trainer = Trainer(
        model=language_model,
        args=training_args,
        data_collator=collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
    )
    trainer.train()
    trainer.save_model()


# create folders that will be used later in the process
os.makedirs(eval_models_dir, exist_ok=True)
os.makedirs(trainings_data_dir, exist_ok=True)

# create the trainings and evaluation data for the evaluation models
training_data_bar = tqdm(evaluation_models)
data_stats = {}
for subreddit in training_data_bar:
    training_data_bar.set_description(f"Processing {subreddit}")
    data_stats[subreddit] = create_trainings_data(subreddit_name=subreddit)
print(f"Data distribution {str(data_stats)}")


# train the evaluation models
# Iterate over the same progress bar that receives the description; wrapping the list in a
# second tqdm would draw two bars and never update the described one.
training_model_bar = tqdm(evaluation_models)
for subreddit in training_model_bar:
    training_model_bar.set_description(f"Training model for {subreddit}")
    train_model(subreddit_name=subreddit)


# 👷 Prepare the evaluation
Optional. You only need it if you want to examine the style-specific perplexities of the data.

In [None]:
import json
import math

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Subreddit names for which StyleSpecificPerplexity loads an evaluation model by default.
models = [
    "antiwork",
    "atheism",
    "Conservative",
    "conspiracy",
    "dankmemes",
    "gaybros",
    "leagueoflegends",
    "lgbt",
    "Libertarian",
    "linguistics",
    "MensRights",
    "news",
    "offbeat",
    "PoliticalCompassMemes",
    "politics",
    "teenagers",
    "TrueReddit",
    "TwoXChromosomes",
    "wallstreetbets",
    "worldnews",
]

def get_perplexity(model, encodings):
    """
    Compute the perplexity of one tokenized text under the given causal language model.

    The token sequence is processed in windows of at most ``model.config.n_positions``
    tokens that start every 512 tokens. Tokens that were already scored by a previous
    window are masked with -100 so that they only serve as context.

    :param model: Callable model with ``config.n_positions`` that accepts ``input_ids`` and
        ``labels`` and returns an object with a ``loss`` tensor
    :param encodings: Tokenizer output whose ``input_ids`` has the sequence on dimension 1
    :return: Perplexity as a float; NaN if the input contains no tokens
    """
    max_length = model.config.n_positions
    stride = 512
    seq_len = encodings.input_ids.size(1)

    # Without any token there is nothing to score; torch.stack would fail on an empty list.
    # NaN matches what callers already treat as "no perplexity available".
    if seq_len == 0:
        return float("nan")

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        # Only the last trg_len tokens of the window contribute to the loss.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over input tokens.
            # Multiply it with trg_len to get the summation instead of average.
            # We will take average over all the tokens to get the true average
            # in the last step of this example.
            neg_log_likelihood = outputs.loss * trg_len

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    return torch.exp(torch.stack(nlls).sum() / end_loc).item()


def get_device():
    """Return the CUDA device if CUDA is available, otherwise the CPU device."""
    device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_type)


class StyleSpecificPerplexity:
    """
    Holds one fine-tuned causal language model per style (subreddit) and computes the
    perplexity of texts under a chosen style model.
    """

    def __init__(self, evaluation_models=None, eval_model_dir='eval_models'):
        """
        Load all evaluation models.

        :param evaluation_models: Names of the style models to load. Defaults to the
            module-level ``models`` list.
        :param eval_model_dir: Directory that contains one sub-folder per style model
        """
        if evaluation_models is None:
            evaluation_models = models
        # load config
        self.config = {"evaluation_models": evaluation_models, "eval_model_dir": eval_model_dir}
        self.models = {}

        # The device and the tokenizer are identical for every style model, so they are
        # created once instead of once per model.
        device = get_device()
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token

        for model_name in self.config["evaluation_models"]:
            model = AutoModelForCausalLM.from_pretrained(f"{self.config['eval_model_dir']}/{model_name}").to(device)
            self.models[model_name] = {"model": model, "tokenizer": tokenizer}

    def calculate_perplexity(self, model_name, input_texts=None):
        """
        Compute the perplexity of every text under the model of the given style.

        :param model_name: Name of the style model to use
        :param input_texts: Texts to score; ``None`` is treated as an empty list
        :return: List of perplexities. Texts for which the perplexity is NaN are left out,
            so the list can be shorter than ``input_texts``.
        :raises ValueError: If no model was loaded for ``model_name``
        """
        if model_name not in self.models:
            raise ValueError(f"No model for style {model_name}")

        texts = [] if input_texts is None else input_texts
        if len(texts) == 0:
            return []

        # load model and tokenizer
        model = self.models[model_name]["model"]
        tokenizer = self.models[model_name]["tokenizer"]
        device = get_device()

        perplexities = []
        for input_text in texts:
            # encode text
            encodings = tokenizer(input_text, return_tensors="pt").to(device)
            # calculate perplexity
            perplexity = get_perplexity(model, encodings)

            # If no perplexity could be calculated (for example because the input contains only one word),
            # do not save. Result would be NaN. This means that no more total perplexity can be calculated.
            if math.isnan(perplexity):
                continue

            perplexities.append(perplexity)
        # return perplexity values
        return perplexities


# 🧪 Evaluate the output of the models

In [None]:
import json
import evaluate
import numpy as np
from tqdm import tqdm

import torch  # only used to choose the device for BERTScore below

# Set this variable to True if you want to calculate the style specific perplexities.
# Note that you have to execute the upper two cells for this.
calculate_style_specific_perplexities = False

# This variable is used to create a suitable name for the output file
result_data_model_name = 't5-base'

# Change this variable to evaluate other results
result_data_filename = 't5-base_results.json'

# Read the generated outputs; the context manager closes the file handle again.
with open(result_data_filename) as result_data_file:
    result_data_json = json.load(result_data_file)

references = []
predictions = []
subreddits = []
for res_object in result_data_json["data"]:
    # Entries with an empty input sentence are left out of the reference-based metrics.
    if len(res_object['input']) == 0:
        continue
    # The input sentence is the (single) reference for the generated output.
    references.append([res_object['input']])
    predictions.append(res_object['output'])

# create a flat list of references, needed for the calculation of BERTScore
flat_references = [item for sublist in references for item in sublist]

print("About to calculate dataset label scores")

# load models
bert_score_model = evaluate.load("bertscore")
perplexity_model = evaluate.load("perplexity", module_type="measurement")
# Despite the variable name this is chrF; with word_order=2 (below) it is chrF++.
bleu_model = evaluate.load("chrf")

# group the generated outputs by their target subreddit
res_per_subreddit = {}
for res_object in result_data_json['data']:
    res_per_subreddit.setdefault(res_object['subreddit'], []).append(res_object['output'])

tmp_scores = {}
# create a perplexity ranking of all subreddits (to check if the target subreddit has the lowest perplexity)
tmp_scores['overall_perplexity'] = []
if calculate_style_specific_perplexities:
    style_specific_perplexity = StyleSpecificPerplexity()
    print('Create all subreddit perplexities:')
    for subreddit, subreddit_data in tqdm(res_per_subreddit.items()):
        for subreddit_model in config['evaluation_models']:
            perplexities = style_specific_perplexity.calculate_perplexity(subreddit_model, subreddit_data)
            tmp_scores['overall_perplexity'].append({'target_subreddit': subreddit,
                                                     'subreddit_model': subreddit_model,
                                                     'perplexity': np.mean(perplexities)})

    # rate style specific perplexity
    tmp_scores['style_specific_perplexity'] = {}
    print('Create target subreddit perplexities:')
    for subreddit, subreddit_data in tqdm(res_per_subreddit.items()):
        perplexities = style_specific_perplexity.calculate_perplexity(subreddit, subreddit_data)
        tmp_scores['style_specific_perplexity'][subreddit] = np.mean(perplexities)


# calculate scores
print('Create bert_score')
# Fall back to the CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
bert_score_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tmp_scores['bert_score'] = bert_score_model.compute(predictions=predictions, references=flat_references,
                                                    lang="en", model_type="microsoft/deberta-xlarge-mnli",
                                                    device=bert_score_device)

print('Create chrF++')
tmp_scores['bleu_score'] = bleu_model.compute(predictions=predictions, references=references, word_order=2, )

print('Create GPT-2 perplexities')
tmp_scores['perplexity_score'] = perplexity_model.compute(data=predictions, model_id='gpt2', device='cpu')

print("Dataset label scores calculated")

# BERT
bert_scores = tmp_scores['bert_score']
bert_f1_mean_score = sum(bert_scores['f1']) / len(bert_scores['f1'])
bert_precision_mean_score = sum(bert_scores['precision']) / len(bert_scores['precision'])
bert_recall_mean_score = sum(bert_scores['recall']) / len(bert_scores['recall'])

bert_score = {'mean_f1': bert_f1_mean_score, 'mean_precision': bert_precision_mean_score,
              'mean_recall': bert_recall_mean_score}
# Perplexity
perplexity_score = tmp_scores['perplexity_score']
median_perplexity = np.median(perplexity_score['perplexities'])
variance_perplexity = np.var(perplexity_score['perplexities'])
perplexity = {'perplexity_median': median_perplexity, 'perplexity_variance': variance_perplexity,
              'perplexity_mean': perplexity_score['mean_perplexity']}
# save results in object; the chrF result is stored under the key 'BLEU', which the plotting cells read
result_obj = {'BLEU': tmp_scores['bleu_score'], 'Perplexity': perplexity,
              'BERTScore': bert_score,
              'overall_perplexity': tmp_scores['overall_perplexity']}

if calculate_style_specific_perplexities:
    result_obj['Style_Specific_Perplexity'] = tmp_scores['style_specific_perplexity']

# save result object
with open(f"{result_data_model_name}_eval_results.json", "w") as r:
    json.dump(result_obj, r)


# 📊 Visualize data

Visualize the accuracy of the style transfers. For this, the perplexity of the generated texts is computed with the evaluation models that were trained on the individual subreddit styles. If the model of the target subreddit yields a low perplexity, this should be a good sign.

You can change the filename of the files to be visualized in the code

In [None]:
import json
import matplotlib.pyplot as plt

# File to be visualized
file_name = 't5-flan_epoch4_results.json'

# Read the 'overall_perplexity' entries; the context manager closes the file handle again.
with open(file_name) as file:
    data = json.load(file)['overall_perplexity']

# Get unique target subreddits
target_subreddits = set(entry["target_subreddit"] for entry in data)

# Iterate in sorted order so that the figures appear in the same order on every run
# (the iteration order of a set is arbitrary).
for subreddit in sorted(target_subreddits):
    # Filter data for the current target subreddit
    filtered_data = [entry for entry in data if entry["target_subreddit"] == subreddit]

    # Sort the filtered data based on perplexity in ascending order
    sorted_data = sorted(filtered_data, key=lambda x: x["perplexity"])[:5]  # Get the five lowest perplexities

    # Extract the subreddit models and perplexities
    subreddit_models = [entry["subreddit_model"] for entry in sorted_data]
    perplexities = [round(entry["perplexity"], 2) for entry in sorted_data]  # Round perplexities to two decimal places

    # Increase the figure size
    plt.figure(figsize=(9, 12))  # Adjust the values as needed

    # Plotting the bar graph
    plt.bar(subreddit_models, perplexities)

    # Adding labels and title
    plt.xlabel("Subreddit Model")
    plt.ylabel("Perplexity")
    plt.title(f"Five Lowest Perplexities for {subreddit}")

    # Adding perplexity values on top of each bar
    for i, v in enumerate(perplexities):
        plt.text(i, v, str(v), ha='center', va='bottom')

    # Rotating the x-axis labels for better visibility
    plt.xticks(rotation=45)

    # Displaying the bar graph
    plt.show()


Plot the BERTScore, chrF++, and/or the perplexity of the generated texts. You can select which metrics are displayed in the code.

In [None]:
import json
import matplotlib.pyplot as plt


# which metrics are to be displayed
show_bert_score = True # BERTScore
show_chrf = True # chrF++
show_ppl = True # Perplexity

# files (one evaluation result file per epoch and model)
files1 = ['bart_epoch0_results.json', 'bart_epoch1_results.json', 'bart_epoch2_results.json',
          'bart_epoch3_results.json', 'bart_epoch4_results.json']

# NOTE(review): the first three paths have no 'results/' prefix while the last two do -
# confirm where the files actually live.
files2 = ['t5_epoch0_results.json', 't5_epoch1_results.json', 't5_epoch2_results.json',
          'results/t5_epoch3_results.json', 'results/t5_epoch4_results.json']

files3 = ['gpt-neo_epoch0_results.json', 'gpt-neo_epoch1_results.json',
          'gpt-neo_epoch2_results.json', 'gpt-neo_epoch3_results.json',
          'gpt-neo_epoch4_results.json']

files4 = ['t5-flan_epoch0_results.json', 't5-flan_epoch1_results.json',
          't5-flan_epoch2_results.json',
          't5-flan_epoch3_results.json', 't5-flan_epoch4_results.json']


def _load_json_files(paths):
    """Return the parsed JSON content of every file in `paths`, in the given order."""
    loaded = []
    for path in paths:
        with open(path, 'r') as json_file:
            loaded.append(json.load(json_file))
    return loaded


files_data1 = _load_json_files(files1)
files_data2 = _load_json_files(files2)
files_data3 = _load_json_files(files3)
files_data4 = _load_json_files(files4)

# Extract data for plotting
# The chrF result is stored under the key "BLEU" in the evaluation result files.
bleu_scores1 = [data["BLEU"]["score"] for data in files_data1]
bleu_scores2 = [data["BLEU"]["score"] for data in files_data2]
bleu_scores3 = [data["BLEU"]["score"] for data in files_data3]
bleu_scores4 = [data["BLEU"]["score"] for data in files_data4]

bert_scores1 = [data["BERTScore"]["mean_f1"] for data in files_data1]
bert_scores2 = [data["BERTScore"]["mean_f1"] for data in files_data2]
bert_scores3 = [data["BERTScore"]["mean_f1"] for data in files_data3]
bert_scores4 = [data["BERTScore"]["mean_f1"] for data in files_data4]

# Despite the variable names these lists hold the *mean* perplexity of each file.
perplexity_median1 = [data["Perplexity"]["perplexity_mean"] for data in files_data1]
perplexity_median2 = [data["Perplexity"]["perplexity_mean"] for data in files_data2]
perplexity_median3 = [data["Perplexity"]["perplexity_mean"] for data in files_data3]
perplexity_median4 = [data["Perplexity"]["perplexity_mean"] for data in files_data4]

# Generate x-axis values (file numbers)
file_numbers1 = range(1, len(files_data1) + 1)
file_numbers2 = range(1, len(files_data2) + 1)
file_numbers3 = range(1, len(files_data3) + 1)
file_numbers4 = range(1, len(files_data4) + 1)

# Plotting the data
# The two flags used to be swapped: `show_bert_score` plotted the chrF ("BLEU") values and
# `show_chrf` plotted the BERTScore values. Each baseline stays with the data series it was
# plotted with before.
# NOTE(review): 0.83 is assumed to be the davinci-003 BERTScore baseline and 0.36 the
# chrF baseline - confirm the values and their scale.

if show_bert_score:
    plt.plot(file_numbers1, [0.83] * len(file_numbers1), label='davinci-003', color='red')

if show_chrf:
    plt.plot(file_numbers1, [0.36] * len(file_numbers1), label='davinci-003', color='red')

if show_ppl:
    plt.plot(file_numbers1, perplexity_median1, label='bart-base')
    plt.plot(file_numbers2, perplexity_median2, label='t5-base')
    plt.plot(file_numbers4, perplexity_median4, label='flan-t5-base')
    plt.plot(file_numbers3, perplexity_median3, label='gpt-neo-2.7B')


if show_chrf:
    plt.plot(file_numbers1, bleu_scores1, label='bart-base')
    plt.plot(file_numbers2, bleu_scores2, label='t5-base')
    plt.plot(file_numbers4, bleu_scores4, label='flan-t5-base')
    plt.plot(file_numbers3, bleu_scores3, label='gpt-neo-2.7B')

if show_bert_score:
    plt.plot(file_numbers3, bert_scores3, label='gpt-neo-2.7B')
    plt.plot(file_numbers2, bert_scores2, label='t5-base')
    plt.plot(file_numbers4, bert_scores4, label='flan-t5-base')
    plt.plot(file_numbers1, bert_scores1, label='bart-base')


# Set the plot title and labels; the title names exactly the metrics that are shown
shown_metrics = [metric_name for metric_name, enabled in (('BERTScore', show_bert_score),
                                                          ('chrF++', show_chrf),
                                                          ('Perplexity', show_ppl)) if enabled]
title_metrics = ' / '.join(shown_metrics) if shown_metrics else 'metrics'
plt.title(f'Development of {title_metrics} over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Value')

# Show a legend
plt.legend()

# Display the plot
plt.show()


The perplexities of the generated texts of the models in the course of the training.

In [None]:
import json
import matplotlib.pyplot as plt

# Define the number of epochs
num_epochs = 5

# Define the models and their corresponding file patterns.
# The dictionary is deliberately not called `models`: that global name holds the list of
# evaluation models that StyleSpecificPerplexity reads, and must not be overwritten here.
model_file_patterns = {
    't5': 'results/t5_epoch{}_results.json',
    'flan-t5': 'results/t5-flan_epoch{}_results.json',
     # 'gpt-neo-2.7B': 'results/gpt-neo_epoch{}_results.json',
    'bart-base': 'results/bart_epoch{}_results.json'
}

# Initialize a dictionary to store perplexity values for each model
model_perplexities = {}

# Iterate over each model
for model, file_pattern in model_file_patterns.items():
    # Initialize a list to store perplexity values for the current model
    perplexities = []

    # Iterate over each epoch
    for epoch in range(num_epochs):
        # Read the file for the current epoch and model
        file_path = file_pattern.format(epoch)
        with open(file_path, 'r') as file:
            data = json.load(file)
        # To plot the median instead, read data['Perplexity']['perplexity_median'] here.
        perplexities.append(data['Perplexity']['perplexity_mean'])

    # Store the perplexities for the current model in the dictionary
    model_perplexities[model] = perplexities

# Generate x-axis values (epochs)
epochs = range(1, num_epochs + 1)

# Plotting the data for each model
for model, perplexities in model_perplexities.items():
    plt.plot(epochs, perplexities, label=model)

# mean value of text-davinci-003 (Baseline-Model), drawn as a constant line over all epochs
# (the median value of the baseline is 150.32)
baseline_mean_perplexity = 151.7
plt.plot(epochs, [baseline_mean_perplexity] * num_epochs, label='davinci-003', color='red')


# Set the plot title and labels
plt.title('Development of mean perplexity over epochs')
plt.xlabel('Epochs')
plt.ylabel('Mean Perplexity')

# Add a legend to distinguish the models
plt.legend()

# Display the plot
plt.show()

The style specific perplexities of the generated texts of the models in the course of the training.

In [None]:
import json
import matplotlib.pyplot as plt

# Define the number of epochs
num_epochs = 5

# Define the models and their corresponding file patterns.
# The dictionary is deliberately not called `models`: that global name holds the list of
# evaluation models that StyleSpecificPerplexity reads, and must not be overwritten here.
model_file_patterns = {
    't5-flan': 'results/t5-flan_epoch{}_results.json',
    #'gpt-neo': 'results/gpt-neo_epoch{}_results.json',
    #'t5': 'results/t5_epoch{}_results.json',
    #'bart': 'results/bart_epoch{}_results.json'
}

# Define the target subreddits and their corresponding colors
subreddits = {
    'TrueReddit': 'blue',
    'TwoXChromosomes': 'green',
    'wallstreetbets': 'red',
    'worldnews': 'purple'
}

# Define the file path for the baseline data
baseline_file = 'results/davinci-003_results.json'

# Initialize a dictionary to store perplexity values for each model-subreddit combination
model_perplexities = {}

# Iterate over each model
for model, file_pattern in model_file_patterns.items():
    # Read every epoch file once and keep only its style-specific perplexities
    epoch_scores = []
    for epoch in range(num_epochs):
        file_path = file_pattern.format(epoch)
        with open(file_path, 'r') as file:
            epoch_scores.append(json.load(file)['Style_Specific_Perplexity'])

    # One list of per-epoch values for every target subreddit
    model_perplexities[model] = {
        subreddit: [scores[subreddit] for scores in epoch_scores]
        for subreddit in subreddits
    }

# Read the baseline data from the file
with open(baseline_file, 'r') as file:
    baseline_data = json.load(file)

# Extract the necessary information from the baseline data
# NOTE(review): the baseline values are loaded but not drawn below - confirm whether
# baseline lines were meant to be part of this figure.
baseline_perplexities = baseline_data['Style_Specific_Perplexity']

# Generate x-axis values (epochs)
epochs = range(1, num_epochs + 1)

# Plotting the data for each model-subreddit combination
for model, perplexities in model_perplexities.items():
    for subreddit, values in perplexities.items():
        color = subreddits[subreddit]
        plt.plot(epochs, values, marker='o', label=f'{model}-{subreddit}', color=color)


# Set the plot title and labels
plt.title('flan-t5-base: Development of Style-Specific Perplexity over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Style-Specific Perplexity')

# Add a legend to distinguish the model-subreddit combinations
plt.legend()

# Display the plot
plt.show()
