# Finetuning

Finetune the model for 5 epochs with the combined txt file of the human prompt-story data for gen0.

In [1]:
!deepspeed run_clm.py \
    --model_name_or_path distilgpt2 \
    --train_file data/hd/combined0/train_combined0.txt \
    --validation_file data/hd/initial_combined/valid_combined.txt \
    --do_train \
    --do_eval \
    --output_dir ./models/distilgpt2-finetuned_gen0_100 \
    --num_train_epochs 5 \
    --save_strategy epoch \
    --learning_rate 5e-5 \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --deepspeed ds_config.json \
    #--resume_from_checkpoint ./models/distilgpt2-finetuned_gen0/checkpoint-28500


[2024-04-19 20:30:40,128] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-04-19 20:30:40,691] [INFO] [runner.py:568:main] cmd = /home/vasi/Documents/BA_Thesis_Experiment/.venv/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None run_clm.py --model_name_or_path distilgpt2 --train_file data/hd/combined0/train_combined0.txt --validation_file data/hd/initial_combined/valid_combined.txt --do_train --do_eval --output_dir ./models/distilgpt2-finetuned_gen0_75 --num_train_epochs 5 --save_strategy epoch --learning_rate 5e-5 --per_device_train_batch_size 4 --gradient_accumulation_steps 4 --deepspeed ds_config.json
[2024-04-19 20:30:42,551] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-04-19 20:30:43,102] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1]}
[2024-04-19 20:30:4

# Inference
### Step 3
Let the model generate a story from a specified prompt.

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "./models/distilgpt2-finetuned_gen0_100/checkpoint-11105" 
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


prompt = "When the time came she knew she had to take a very difficult descision, a descision of life and death for the whole planet."  # replace with your own prompt
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")


# Generate text
output_sequences = model.generate(
    input_ids=inputs,
    attention_mask=None,
    max_length=500,  # determines the maximum length of the generated text
    temperature=0.7,  # controls randomness: lower values make text less random
    top_k=50,  # the K most likely next words are considered for each step
    top_p=0.9,  # only the most probable tokens with probabilities that add up to top_p are considered for each step
    repetition_penalty=1.2,  # penalty applied to repeated words
    do_sample=True,  # set to True to return diverse samples
    num_return_sequences=1,  # number of independently computed samples to generate
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the output sequences to get the generated text
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)


print(generated_text)

  from .autonotebook import tqdm as notebook_tqdm


When the time came she knew she had to take a very difficult descision, a descision of life and death for the whole planet. ”   She felt like an old woman now. This was no way out, it would be impossible at this point in her life after all these years that she 'd finally been able to stop herself from killing so many others on Earth because there weren't enough people left around anymore. But something about her body made me happy too. It did n´t matter how much longer I stayed alive - or even if she went through some kind of pain again... until my eyesight turned black as well. And then everything changed ; suddenly reality hit its peak!


### Step 4
Clear impurities in generated text and write to ouput file.

In [2]:
import re

# Define regex pattern for impurities
#pattern = r"(<newline>|<newline \d+ :>|<newline\*>|\[.*?\]|“|”|``|''|--|__________________________________________________________________|\*)"

# Remove the prompt (first sentence) by finding the first period followed by a space or end of text
text_without_prompt = re.split(r'\.\s*[”]*\s*(?=[A-Z])', generated_text, 1)[1] if '.' in generated_text else generated_text


# Regex to remove specified impurities
#cleaned_text = re.sub(r"\<[^\>]*\>|\[WP\]|\-\-", "", text_without_prompt)
#cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra spaces and strip leading/trailing spaces
# Apply regex to remove impurities
#cleaned_text = re.sub(pattern, "", cleaned_text)

print(text_without_prompt)

with open("./outputs/gen0/story0.txt", "w") as f:
    f.write(text_without_prompt)


She felt like an old woman now. This was no way out, it would be impossible at this point in her life after all these years that she 'd finally been able to stop herself from killing so many others on Earth because there weren't enough people left around anymore. But something about her body made me happy too. It did n´t matter how much longer I stayed alive - or even if she went through some kind of pain again... until my eyesight turned black as well. And then everything changed ; suddenly reality hit its peak!


# Evaluation
### Step 5
Evaluate the output according to predefined metrics.

In [4]:
from metrics.LexicalDiversity.lexical_diversity import *
from metrics.SemanticDiversity.sementic_diversity import *
from metrics.SyntacticDiversity.syntactic_diversity import *
import nltk
from nltk.tokenize import sent_tokenize

with open("./outputs/gen0/story0.txt", 'r') as f:
    story = f.read()


# Download the required Punkt tokenizer models
#nltk.download('punkt')

# Tokenize the text into sentences
sentences = sent_tokenize(story)
#print(type(sentences))

#print(story.split("."))
print("GEN0")
print("Lexical Diversity: ")
print("Distinct 2: ", calculate_distinct_n(story, 2))
print("Distinct 3: ", calculate_distinct_n(story, 3))
print(f"Self_BLEU Score: {1-calculate_self_bleu(sentences)}")
print("Over-ALL-TTR: ", calculate_ttr(story, truncate_length=300))
print("Mean-Segmental-TTR: ", calculate_mean_segmental_ttr(story, segment_size=50))
print("\nSemantic Diversity: ")
print("Semantic diversity (average):", calculate_semantic_diversity(sentences, 'average'))
print("Semantic diversity (centroid):", calculate_semantic_diversity(sentences, 'centroid'))

import spacy

# Load a spaCy model for dependency parsing
nlp = spacy.load("en_core_web_sm")

graphs = construct_dependency_graphs(sentences)

syntactic_diversity = calculate_syntactic_diversity(graphs)
print("\nSyntactic diversity:", syntactic_diversity)

GEN0
Lexical Diversity: 
Distinct 2:  1.0
Distinct 3:  1.0
Self_BLEU Score: 1.0
Over-ALL-TTR:  0.9233333333333333
Mean-Segmental-TTR:  0.9750000000000001

Semantic Diversity: 
Semantic diversity (average): 0.8032973862543591
Semantic diversity (centroid): 0.4419145343195856

Syntactic diversity: 0.8122145754831417


# Generate Synthetic Data
### Step 6
Contribute to the synthetic dataset by producing stories from the finetuned model.
We use 25% of the original prompt data as our prompt list.

In [1]:
import random
from transformers import pipeline, set_seed, GPT2LMHeadModel, GPT2Tokenizer
import os

prompts = []
prompt_files = ["train", "test", "valid"]
for name in prompt_files:
    # Path to the file with prompts
    file_path = './data/hd/prepro/'+name+'.wp_source'

    # Read prompts from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        prompts += ([line.strip() for line in file.readlines() if line.strip()])

#print(prompts[0:10])
# Randomly select 25% of the prompts
sample_size = len(prompts) // 4  # 25%
selected_prompts = random.sample(prompts, sample_size)

print(selected_prompts[0:10])


  from .autonotebook import tqdm as notebook_tqdm


["[ WP ] Disney 's next children 's animated feature turns way too dark partway through .", '[ Prompt ] Write something as cliché and forgettable as possible .', "[ WP ] You 've heard of the city that never sleeps , yeah ? Well , welcome to the city that never wakes !", '[ WP ] You know you are about to die . Write a letter to the person that might find your body .', '[ PI ] More than I Can Say - FirstChapter - 3,493 Words', "[ FF ] A New York City taxi driver , it 's late at night and the shift is almost over , but ahead the taxi driver sees one last customer flag them down . In dialog only , give me a conversation where the customer starts convinces the taxi driver his world is nothing but a simulation .", '[ WP ] Any Last words ... ?', '[ WP ] Change the genre of your story as many times as you can .', '[ WP ] Write a story where evil triumphs over good , and evil is just pure evil without any redeeming qualities', '[ WP ] You are an ordinary person . Today a demon summoned you to m

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import re

model_name = "./models/distilgpt2-finetuned_gen0/checkpoint-27765"
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side='left')
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define the device based on CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"


if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if torch.cuda.is_available():
    model = torch.nn.DataParallel(model)
    model.cuda()
else:
    model.to("cpu")

batch_size = 16

def generate_text_batch(prompts):
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move all tensors to the right device
    outputs = model.module.generate(
        **inputs, 
        max_length=500, 
        num_return_sequences=1, 
        temperature=0.7,  # More randomness
        repetition_penalty=1.2,  # Increase penalty to reduce repetitions
        top_k=50, 
        top_p=0.9
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

output_synth_data = './data/sd/gen0/gen0_sd.txt'

try:
    with open(output_synth_data, 'a', encoding='utf-8') as file:
        for i in range(0, len(selected_prompts), batch_size):
            batch_prompts = selected_prompts[i:i + batch_size]
            generated_texts = generate_text_batch(batch_prompts)
            
            for prompt, generated_text in zip(batch_prompts, generated_texts):
                prompt_length = len(tokenizer.encode(prompt))
                #print(prompt)
                # Remove the prompt by slicing the tokens to skip the prompt length
                generated_text_tokens = tokenizer.encode(generated_text)[prompt_length:]
                clean_generated_text = tokenizer.decode(generated_text_tokens, skip_special_tokens=True)

                # Remove leading and ending spaces and special characters
                clean_generated_text = re.sub(r'^[\s\W]+', '', clean_generated_text)
                clean_generated_text = re.sub(r'^[\s\W]+|[\s\W]+$', '', clean_generated_text)


                output_text = f"{prompt}\n{clean_generated_text}\n\n"
                #print(output_text)
                file.write(output_text)
    print("Finished generating stories.")
except Exception as e:
    print(f"An error occurred: {e}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

KeyboardInterrupt: 