## Cap to 1000 words

In [None]:
data = ["train", "test", "valid"]
for name in data:
    with open("data/hd/raw/" + name + ".wp_target") as f:
        stories = f.readlines()
    stories = [" ".join(i.split()[0:1000]) for i in stories]
    with open("data/hd/prepro/" + name + ".wp_target", "w") as o:
        for line in stories:
            o.write(line.strip() + "\n")

Combine each prompt with its corresponding story to a new txt file.

In [None]:
import re

# Define regex pattern for impurities
pattern = r"(<newline>)"

names = ["train", "test", "valid"]

for name in names:
    # Python script to concatenate prompts and stories
    with open('data/hd/prepro/' + name + '.wp_source', 'r', encoding='utf-8') as sources, \
         open('data/hd/prepro/' + name + '.wp_target', 'r', encoding='utf-8') as targets, \
         open('data/hd/prepro/combined0/' + name + '_combined.txt', 'w', encoding='utf-8') as outfile:
        for prompt, story in zip(sources, targets):
            cleaned_prompt = re.sub(r"\<[^\>]*\>|\[ WP \]|\-\-", "", prompt[6:])
            cleaned_story = re.sub(pattern, "", story)
            outfile.write(cleaned_prompt.strip() + "\n" + cleaned_story.strip() + "\n\n")

To achieve this structure for your model training across different generations, you'll need to partition your human-generated data (from train_combined.txt) into decreasing portions for each subsequent generation, ensuring that each generation sees a unique subset of the human data for the first time. Here’s how you can split the data and prepare the training files for each generation:
Step 1: Calculate Split Sizes


In [5]:
def count_entries(filepath):
    """Counts the number of double-newline-separated entries in a file."""
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    return len(content.split('\n\n'))

def calculate_portions(total_entries, initial_portion=0.40):
    """Calculate the number of entries for each dataset based on reducing portions."""
    portions = [initial_portion]
    current_portion = initial_portion

    # Assuming a reduction of 25% relatively per generation
    while current_portion > 0.05:  # Continue until the portion is very small
        current_portion *= 0.75
        portions.append(current_portion)

    # Calculate the number of entries per portion
    entries_per_portion = [int(total_entries * p) for p in portions]
    return entries_per_portion

# Example usage
total_entries = count_entries('./data/hd/initial_combined/train_combined.txt')
entries_distribution = calculate_portions(total_entries)
print("Entries per file:", entries_distribution)


Entries per file: [109040, 81780, 61335, 46001, 34500, 25875, 19406, 14555, 10916]




First, calculate the size of each portion based on the total number of lines in train_combined.txt. You'll want to divide the data so that each subsequent file has 25% less human data than the previous one.
Step 2: Split the Data

You can use Python to handle the reading, splitting, and writing of the data files.

In [2]:
def split_data(filepath, portions):
    import random
    
    # Read the entire file and split by entries
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip().split('\n\n')
    
    random.shuffle(content)  # Shuffle to randomize the entries distribution

    # Calculate the starting index for each portion
    total_len = len(content)
    portions_indices = [0]
    cumulative_sum = 0

    for portion in portions:
        cumulative_sum += int(total_len * portion)
        portions_indices.append(cumulative_sum)

    # Write out each portion to a different file
    for i in range(len(portions_indices) - 1):
        with open(f'data/hd/combined{i}/train_combined{i}.txt', 'w', encoding='utf-8') as f:
            for entry in content[portions_indices[i]:portions_indices[i+1]]:
                f.write(entry + "\n\n")

# Example usage
portions = [0.40, 0.30, 0.20, 0.10]  # Adjust based on total data and requirements
split_data('./data/hd/initial_combined/train_combined.txt', portions)


In [6]:
print(count_entries('./data/hd/combined0/train_combined0.txt'))
print(count_entries('./data/hd/combined1/train_combined1.txt'))
print(count_entries('./data/hd/combined2/train_combined2.txt'))
print(count_entries('./data/hd/combined3/train_combined3.txt'))

109040
81780
54520
27260


To ensure that there is no data leakage between the different train_combined#.txt files you've created, you'll want to verify that each dataset is unique and that entries from one dataset do not appear in another. Here are a few strategies you could employ to check for data leakage:
1. Hash Checking

You can calculate a unique hash for each entry in every dataset and ensure that no hash appears in more than one dataset.

In [8]:
import hashlib

def compute_hash(text):
    """Computes an MD5 hash for the given text."""
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def check_leakage(files):
    seen_hashes = {}
    leakage = False
    leakage_count = 0

    for filename in files:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read().strip()
        entries = content.split('\n\n')

        for entry in entries:
            entry_hash = compute_hash(entry)
            if entry_hash in seen_hashes:
                leakage_count += 1
                print(f"Leakage detected! Entry in {filename} already appears in {seen_hashes[entry_hash]}")
                #print(entry)
                #break
                leakage = True
            else:
                seen_hashes[entry_hash] = filename

    if not leakage:
        print("No data leakage detected among the files.")
    
    print("Total Leakages:",leakage_count)

# List your files
files = ['./data/hd/combined0/train_combined0.txt', './data/hd/combined1/train_combined1.txt', 
         './data/hd/combined2/train_combined2.txt', './data/hd/combined3/train_combined3.txt']
check_leakage(files)


Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combin

In [9]:
def read_entries(filename):
    """Reads entries from a file and returns them as a set."""
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    return set(content.split('\n\n'))

def check_intersection(files):
    entries_sets = {filename: read_entries(filename) for filename in files}
    all_files = list(entries_sets.keys())
    for i in range(len(all_files)):
        for j in range(i + 1, len(all_files)):
            intersection = entries_sets[all_files[i]].intersection(entries_sets[all_files[j]])
            if intersection:
                print(f"Leakage detected between {all_files[i]} and {all_files[j]}!")
            else:
                print(f"No leakage between {all_files[i]} and {all_files[j]}.")

# List your files
files = ['./data/hd/combined0/train_combined0.txt', './data/hd/combined1/train_combined1.txt', 
         './data/hd/combined2/train_combined2.txt', './data/hd/combined3/train_combined3.txt']
check_intersection(files)


Leakage detected between ./data/hd/combined0/train_combined0.txt and ./data/hd/combined1/train_combined1.txt!
Leakage detected between ./data/hd/combined0/train_combined0.txt and ./data/hd/combined2/train_combined2.txt!
Leakage detected between ./data/hd/combined0/train_combined0.txt and ./data/hd/combined3/train_combined3.txt!
Leakage detected between ./data/hd/combined1/train_combined1.txt and ./data/hd/combined2/train_combined2.txt!
Leakage detected between ./data/hd/combined1/train_combined1.txt and ./data/hd/combined3/train_combined3.txt!
Leakage detected between ./data/hd/combined2/train_combined2.txt and ./data/hd/combined3/train_combined3.txt!


## Initiate Tokenizer (irrelevant)

In [5]:
from transformers import OPTForCausalLM, AutoTokenizer

model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [7]:
import pickle

def tokenize_and_save(file_prefix, batch_size=100):
    prompts_file = f"data/hd/prepro/{file_prefix}.wp_source"
    stories_file = f"data/hd/prepro/{file_prefix}.wp_target"
    output_file = f'data/hd/prepro/tokenized/tokenized_{file_prefix}_data.pkl'
    
    with open(prompts_file, encoding="utf-8") as p, open(stories_file, encoding="utf-8") as f, open(output_file, 'wb') as d:
        while True:
            prompts = [next(p, None) for _ in range(batch_size)]
            stories = [next(f, None) for _ in range(batch_size)]
            # Break if the first item is None, indicating end of file
            if prompts[0] is None or stories[0] is None:
                break
            
            # Filter out None values in case of mismatched lengths (shouldn't happen with well-formed data)
            prompts = [prompt for prompt in prompts if prompt]
            stories = [story for story in stories if story]
            
            # Tokenize batch
            tokenized_batch = tokenizer(prompts, stories, padding=True, max_length=1024, truncation=True, return_tensors="pt")
            
            # Save tokenized batch immediately to reduce memory usage
            pickle.dump(tokenized_batch, d)

# Example usage
for name in ["train", "test", "valid"]:
    tokenize_and_save(name, batch_size=100)