## 1. limit story length to 1000 words

We limit the length of eac story from all raw datasets to 1000 words.

In [None]:
data = ["train", "test", "valid"]
for name in data:
    with open("data/hd/raw/" + name + ".wp_target") as f:
        stories = f.readlines()
    stories = [" ".join(i.split()[0:1000]) for i in stories]
    with open("data/hd/prepro/" + name + ".wp_target", "w") as o:
        for line in stories:
            o.write(line.strip() + "\n")

Clean texts from unwanted artifacts and Combine each prompt with its corresponding story to a new txt file.

In [21]:
import re

# Define regex pattern for impurities
pattern = r"(<newline>)|(newline)|\(Edit\s*:\s*[^\)]*\)|[^a-zA-Z\s.]|'\s+(?=\.)'|'(?i)\s*\bif you like(d)? this\s*((story|stories))?[^.]*\.?'"
pattern2 = r"\s+\."

names = ["train", "test", "valid"]

for name in names:
    # Python script to concatenate prompts and stories
    with open('data/hd/prepro/' + name + '.wp_source', 'r', encoding='utf-8') as sources, \
         open('data/hd/prepro/' + name + '.wp_target', 'r', encoding='utf-8') as targets, \
         open('data/hd/initial_combined/' + name + '_combined.txt', 'w', encoding='utf-8') as outfile:
        for prompt, story in zip(sources, targets):
            cleaned_prompt = re.sub(r"\<[^\>]*\>|\[ WP \]|\-\-", "", prompt[6:])
            cleaned_prompt = re.sub(pattern, "", cleaned_prompt)
            cleaned_prompt = re.sub(pattern2, ".", cleaned_prompt)
            cleaned_prompt = re.sub(r'\s{2,}', ' ', cleaned_prompt)
            cleaned_story = re.sub(pattern, "", story)
            cleaned_story = re.sub(pattern2, ".", cleaned_story)
            cleaned_story = re.sub(r'\s{2,}', ' ', cleaned_story)
            outfile.write(cleaned_prompt.strip() + "\n" + cleaned_story.strip() + "\n\n")

We partition our human-generated data (from train_combined.txt) into decreasing portions for each subsequent generation, ensuring that each generation sees a unique subset of the human data for the first time.



Step 1: Calculate Split Sizes

In [5]:
def count_entries(filepath):
    """Counts the number of double-newline-separated entries in a file."""
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    return len(content.split('\n\n'))

def calculate_portions(total_entries, initial_portion=0.40):
    """Calculate the number of entries for each dataset based on reducing portions."""
    portions = [initial_portion]
    current_portion = initial_portion

    # Assuming a reduction of 25% relatively per generation
    while current_portion > 0.05:  # Continue until the portion is very small
        current_portion *= 0.75
        portions.append(current_portion)

    # Calculate the number of entries per portion
    entries_per_portion = [int(total_entries * p) for p in portions]
    return entries_per_portion

# Example usage
total_entries = count_entries('./data/hd/initial_combined/train_combined.txt')
entries_distribution = calculate_portions(total_entries)
print("Entries per file:", entries_distribution)


Entries per file: [109040, 81780, 61335, 46001, 34500, 25875, 19406, 14555, 10916]


Step 2: Split the Data

In [22]:
def split_data(filepath, portions):
    import random
    
    # Read the entire file and split by entries
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip().split('\n\n')
    
    random.shuffle(content)  # Shuffle to randomize the entries distribution

    # Calculate the starting index for each portion
    total_len = len(content)
    portions_indices = [0]
    cumulative_sum = 0

    for portion in portions:
        cumulative_sum += int(total_len * portion)
        portions_indices.append(cumulative_sum)

    # Write out each portion to a different file
    for i in range(len(portions_indices) - 1):
        with open(f'data/hd/combined{i}/train_combined{i}.txt', 'w', encoding='utf-8') as f:
            for entry in content[portions_indices[i]:portions_indices[i+1]]:
                f.write(entry + "\n\n")

# Example usage
portions = [0.40, 0.30, 0.20, 0.10]  # Adjust based on total data and requirements
split_data('./data/hd/initial_combined/train_combined.txt', portions)


In [6]:
print(count_entries('./data/hd/combined0/train_combined0.txt'))
print(count_entries('./data/hd/combined1/train_combined1.txt'))
print(count_entries('./data/hd/combined2/train_combined2.txt'))
print(count_entries('./data/hd/combined3/train_combined3.txt'))

109040
81780
54520
27260


To ensure that there is no data leakage between the different train_combined#.txt files we want to verify that each dataset is unique and that entries from one dataset do not appear in another.
1. Hash Checking
calculate a unique hash for each entry in every dataset and ensure that no hash appears in more than one dataset.

In [8]:
import hashlib

def compute_hash(text):
    """Computes an MD5 hash for the given text."""
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def check_leakage(files):
    seen_hashes = {}
    leakage = False
    leakage_count = 0

    for filename in files:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read().strip()
        entries = content.split('\n\n')

        for entry in entries:
            entry_hash = compute_hash(entry)
            if entry_hash in seen_hashes:
                leakage_count += 1
                print(f"Leakage detected! Entry in {filename} already appears in {seen_hashes[entry_hash]}")
                #print(entry)
                #break
                leakage = True
            else:
                seen_hashes[entry_hash] = filename

    if not leakage:
        print("No data leakage detected among the files.")
    
    print("Total Leakages:",leakage_count)

# List your files
files = ['./data/hd/combined0/train_combined0.txt', './data/hd/combined1/train_combined1.txt', 
         './data/hd/combined2/train_combined2.txt', './data/hd/combined3/train_combined3.txt']
check_leakage(files)


Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combined0.txt
Leakage detected! Entry in ./data/hd/combined0/train_combined0.txt already appears in ./data/hd/combined0/train_combin