# Preprocess
### Step 1
Combine each prompt with its corresponding story and write the pairs to a new text file.

In [4]:
import re

# Marker that is stripped from every story line.
pattern = r"(<newline>)"

# Dataset splits; each one has a .wp_source (prompts) and .wp_target (stories) file.
names = ["train", "test", "valid"]

for name in names:
    # Build the three paths for this split up front.
    source_path = 'data/hd/prepro/' + name + '.wp_source'
    target_path = 'data/hd/prepro/' + name + '.wp_target'
    combined_path = 'data/hd/prepro/combined0/' + name + '_combined.txt'

    with open(source_path, 'r', encoding='utf-8') as sources:
        with open(target_path, 'r', encoding='utf-8') as targets:
            with open(combined_path, 'w', encoding='utf-8') as outfile:
                # Line i of the source file is paired with line i of the target file.
                for prompt, story in zip(sources, targets):
                    # Drop the first six characters of the prompt, then remove
                    # any <...> tags, "[ WP ]" markers and "--" sequences.
                    prompt_body = prompt[6:]
                    cleaned_prompt = re.sub(r"\<[^\>]*\>|\[ WP \]|\-\-", "", prompt_body)
                    cleaned_story = re.sub(pattern, "", story)
                    # One record = prompt line, story line, blank separator line.
                    record = "\n".join((cleaned_prompt.strip(), cleaned_story.strip()))
                    outfile.write(record + "\n\n")


# Finetuning
### Step 2

Finetune the model for 5 epochs with the combined txt file of the human prompt-story data.

In [2]:
# Launch fine-tuning of distilgpt2 on the combined prompt/story files via DeepSpeed.
# The flag must be spelled "--save_strategy" with no space after the dashes; a bare
# "--" ends option parsing, so the flags after it are no longer read as options.
# NOTE(review): with an epoch-based save strategy the checkpoint directory names may
# differ from the step-numbered checkpoint loaded by later cells; confirm before running.
!deepspeed run_clm.py \
    --model_name_or_path distilgpt2 \
    --train_file data/hd/prepro/combined0/train_combined.txt \
    --validation_file data/hd/prepro/combined0/valid_combined.txt \
    --do_train \
    --do_eval \
    --output_dir ./models/distilgpt2-finetuned_gen0 \
    --num_train_epochs 5 \
    --save_strategy epoch \
    --learning_rate 5e-5 \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --deepspeed ds_config.json \
    #--resume_from_checkpoint ./models/distilgpt2-finetuned_gen0/checkpoint-28500


[2024-04-12 15:30:06,030] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-04-12 15:30:06,630] [INFO] [runner.py:568:main] cmd = /home/vasi/Documents/BA_Thesis_Experiment/.venv/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None run_clm.py --model_name_or_path distilgpt2 --train_file data/hd/prepro/combined/train_combined.txt --validation_file data/hd/prepro/combined/valid_combined.txt --do_train --do_eval --output_dir ./models/distilgpt2-finetuned --num_train_epochs 5 --learning_rate 5e-5 --per_device_train_batch_size 4 --gradient_accumulation_steps 4 --deepspeed ds_config.json --resume_from_checkpoint ./models/distilgpt2-finetuned/checkpoint-28500
[2024-04-12 15:30:08,583] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-04-12 15:30:09,151] [INFO] [launch.py:145:main] WORLD INFO D

# Inference
### Step 3
Let the model generate a story from a specified prompt.

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned checkpoint (tokenizer and weights live in the same directory).
model_name = "./models/distilgpt2-finetuned_gen0/checkpoint-31000"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


prompt = "When the time came she knew she had to take a very difficult descision, a descision of life or death for a whole planet."  # replace with your own prompt

# Tokenize through the tokenizer's __call__ so the attention mask is returned together
# with the ids. pad_token_id is set to the EOS id below, so the mask is passed
# explicitly instead of leaving it for generate() to infer.
encoded_prompt = tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
inputs = encoded_prompt["input_ids"]


# Generate text
output_sequences = model.generate(
    input_ids=inputs,
    attention_mask=encoded_prompt["attention_mask"],
    max_length=500,  # upper bound on total length (prompt tokens + generated tokens)
    temperature=0.7,  # controls randomness: lower values make text less random
    top_k=50,  # the K most likely next words are considered for each step
    top_p=0.9,  # only the most probable tokens with probabilities that add up to top_p are considered for each step
    repetition_penalty=1.2,  # penalty applied to repeated words
    do_sample=True,  # sampling must be on for temperature/top_k/top_p to take effect
    num_return_sequences=1,  # number of independently computed samples to generate
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (and only) returned sequence; it still begins with the prompt text.
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)


print(generated_text)

When the time came she knew she had to take a very difficult descision, a descision of life or death for a whole planet. ” <newline> <newline> <newline> “ I don't think we have anything better to do than make this decision. '' The man looked at her, but he could not help but feel his heart racing as he spoke. <newline>
[ WP ] You're driving down an empty highway in Antarctica when you see it all turn black... and everything is gone except your car that was supposed be yours on Earth now! What are they doing here?
I remember watching my wife walk into our home after dinner with her family again. We were out in the middle of the night enjoying our favorite dessert together. She was walking up to me and told me about the day before Christmas ; the day where she died from a brain tumor while drinking wine. As I sat there thinking about her death, I thought back to how she had been able to forget things just like that. <newLine 1 : <newline> <newline> `` You know what happened last year? ''

### Step 4
Remove impurities from the generated text and write the result to an output file.

In [29]:
import re

# Define regex pattern for impurities (markup remnants, bracketed tags, quote marks,
# dashes, long underscore rules and asterisks).
pattern = r"(<newline>|<newline \d+ :>|<newline\*>|\[.*?\]|“|”|``|''|--|__________________________________________________________________|\*)"

# Remove the prompt (first sentence): split once at the first ". " that is followed by a
# capital letter, or at a period that ends the text, and keep what comes after it.
# A "." can be present without either boundary matching (e.g. "a.b"); in that case the
# split yields a single piece and the text is kept unchanged instead of raising IndexError.
prompt_and_rest = re.split(r'\. (?=[A-Z])|\.$', generated_text, maxsplit=1)
text_without_prompt = prompt_and_rest[1] if len(prompt_and_rest) > 1 else generated_text


# Regex to remove specified impurities
cleaned_text = re.sub(r"\<[^\>]*\>|\[WP\]|\-\-", "", text_without_prompt)
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra spaces and strip leading/trailing spaces
# Apply regex to remove impurities
cleaned_text = re.sub(pattern, "", cleaned_text)
# Removing tokens that sat between two spaces leaves double spaces behind, so
# whitespace is normalised once more after the last removal pass.
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

print(cleaned_text)

with open("./outputs/gen0/story0.txt", "w") as f:
    f.write(cleaned_text)


We were out in the middle of the night enjoying our favorite dessert together. She was walking up to me and told me about the day before Christmas ; the day where she died from a brain tumor while drinking wine. As I sat there thinking about her death, I thought back to how she had been able to forget things just like that.  You know what happened last year?  I asked her.  That one girl who didn't even notice us coming over every day until 2:30AM-  Her voice trailed off abruptly then stopped completely.  No! Not anymore!  she said,  It ca n´t be happening right now because no one else has heard of this yet.  I looked around confusedly. I noticed several people standing next to me. They seemed so familiar with each other. They were all looking at me with wide, yellow eyes.  I am sorry sir, I must warn you something may happen if you keep telling me otherwise. Your wife will die soon enough though. Please get inside safely by yourself without any trouble whatsoever. Do not leave alone pl

# Evaluation
### Step 5
Evaluate the output according to predefined metrics.

In [3]:
from metrics.LexicalDiversity.lexical_diversity import *
import nltk
from nltk.tokenize import sent_tokenize

# Load the cleaned story written by the previous step.
with open("./outputs/gen0/story0.txt", 'r') as story_file:
    story = story_file.read()

# Sentence-level view of the story, used for the self-BLEU based score.
# If NLTK's Punkt models are missing, run nltk.download('punkt') once beforehand.
sentences = sent_tokenize(story)

print("GEN0")
print("Lexical Diversity: ")

# Distinct-n for bigrams and trigrams.
for ngram_size in (2, 3):
    print(f"Distinct {ngram_size}: ", calculate_distinct_n(story, ngram_size))

# The printed value is 1 - self-BLEU, not the raw self-BLEU.
self_bleu = calculate_self_bleu(sentences)
print(f"Self_BLEU Score: {1-self_bleu}")

# Type-token ratios: one over the (truncated) text, one averaged over segments.
overall_ttr = calculate_ttr(story, truncate_length=300)
print("Over-ALL-TTR: ", overall_ttr)

segmental_ttr = calculate_mean_segmental_ttr(story, segment_size=50)
print("Mean-Segmental-TTR: ", segmental_ttr)

GEN0
Lexical Diversity: 
Distinct 2:  0.9907834101382489
Distinct 3:  1.0
Self_BLEU Score: 1.0
Over-ALL-TTR:  0.7981651376146789
Mean-Segmental-TTR:  0.9128888888888887


# Generate Synthetic Data
### Step 6
Contribute to the synthetic dataset by producing stories from the finetuned model.
We use 25% of the original prompt data as our prompt list.

In [2]:
import random
from transformers import pipeline, set_seed, GPT2LMHeadModel, GPT2Tokenizer
import os

# Fixed seed so the 25% prompt subset is the same on every Restart & Run All.
PROMPT_SAMPLE_SEED = 42

prompts = []
prompt_files = ["train", "test", "valid"]
for name in prompt_files:
    # Path to the file with prompts
    file_path = './data/hd/prepro/'+name+'.wp_source'

    # Read prompts from the file, one per line, skipping blank lines.
    # Iterating the file object avoids materialising the whole file with readlines().
    with open(file_path, 'r', encoding='utf-8') as file:
        prompts.extend(line.strip() for line in file if line.strip())

# NOTE(review): prompts are used here exactly as stored in the source files, whereas the
# preprocessing cell drops the first six characters of each prompt before training;
# confirm that this difference is intended.

# Randomly select 25% of the prompts with a dedicated, seeded generator so that the
# selection is reproducible and does not depend on the global random state.
sample_size = len(prompts) // 4  # 25%
selected_prompts = random.Random(PROMPT_SAMPLE_SEED).sample(prompts, sample_size)

print(selected_prompts[0:10])


['[ WP ] Mr. Rogers and Bob Ross were actually prolific serial killers . For years , they secretly communicated through their respective television shows about their plans , victims , and close calls using a code only they understood .', '[ WP ] You are one of the people sent to mars in 2020 . Everyone thinks you volunteered for a suicide mission . You watch from a biosphere as Earth gets demolished by a large asteroid .', '[ WP ] A new invention enables people to remember their dreams with absolute clarity . It turns out we were forgetting them for a very good reason .', "[ WP ] Begin and end your story with this sentence : `` And yet , the city remained . ''", "[ WP ] Story from a world where humans are n't at the top of the food chain . They live their modern lives but with the fear of being eaten .", "[ WP ] You were adopted as a child , your parents did n't like pets , and you never cared for them , so you never were around any . Today as adult you went to a zoo for the first time

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import re

# Checkpoint directory holding both the tokenizer files and the fine-tuned weights.
model_name = "./models/distilgpt2-finetuned_gen0/checkpoint-31000"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Pick the device once; the same flag drives the model placement below.
use_cuda = torch.cuda.is_available()
device = "cuda" if use_cuda else "cpu"

# Batched tokenization with padding needs a pad token; fall back to the EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if use_cuda:
    # On GPU the model is wrapped in DataParallel and moved to CUDA; the underlying
    # model stays reachable as model.module.
    model = torch.nn.DataParallel(model).cuda()
else:
    model.to(device)

# Number of prompts generated per call.
batch_size = 16

def generate_text_batch(prompts):
    """Generate one continuation for each prompt in a batch.

    Args:
        prompts: list of prompt strings to continue.

    Returns:
        A list of decoded strings, one per prompt, in the same order. Each string
        is the decoded full output sequence (prompt followed by the continuation)
        with special tokens removed.
    """
    # A decoder-only model continues from the last position of each row, so the
    # padding has to sit on the left; otherwise shorter prompts are continued
    # from pad tokens.
    tokenizer.padding_side = "left"
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move all tensors to the right device

    # The model is only wrapped (exposing .module) on the CUDA path; on CPU it is
    # the bare model, so fall back to the object itself.
    generator = getattr(model, "module", model)
    outputs = generator.generate(
        **inputs,
        max_length=500,  # upper bound on total length (prompt tokens + generated tokens)
        num_return_sequences=1,
        do_sample=True,  # without sampling, temperature/top_k/top_p are ignored
        temperature=0.7,  # values below 1 make sampling less random
        repetition_penalty=1.2,  # penalty applied to repeated tokens
        top_k=50,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,  # explicit pad id for batched generation
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

import traceback  # local import: used only to report failures of this cell in full

# Destination file for the synthetic prompt/story pairs.
output_synth_data = './data/sd/gen0/gen0_sd.txt'

try:
    with open(output_synth_data, 'w', encoding='utf-8') as file:
        # Walk through the selected prompts in chunks of batch_size.
        for i in range(0, len(selected_prompts), batch_size):
            batch_prompts = selected_prompts[i:i + batch_size]
            generated_texts = generate_text_batch(batch_prompts)

            for prompt, generated_text in zip(batch_prompts, generated_texts):
                if generated_text.startswith(prompt):
                    # Exact match: cut the prompt off by characters. This avoids a
                    # decode -> encode round trip, which is not guaranteed to
                    # reproduce the original token count.
                    clean_generated_text = generated_text[len(prompt):]
                else:
                    # Fallback: remove the prompt by slicing the tokens to skip the prompt length
                    prompt_length = len(tokenizer.encode(prompt))
                    generated_text_tokens = tokenizer.encode(generated_text)[prompt_length:]
                    clean_generated_text = tokenizer.decode(generated_text_tokens, skip_special_tokens=True)

                # One record = prompt line, story line, blank separator line.
                output_text = f"{prompt}\n{clean_generated_text}\n\n"
                file.write(output_text)

            # Push each finished batch to disk so a later failure does not lose it.
            file.flush()
    print("Finished generating stories.")
except Exception as e:
    # Keep the cell from aborting the notebook, but show the full traceback so the
    # failure can be diagnosed.
    print(f"An error occurred: {e}")
    traceback.print_exc()