# Preparing Data

Here we grab the data used from the Human vs LLM Text Corpus, extract a subset we want to use, modify some of it and then save it to a series of files.  
Not necessarily in that order, but you get the idea.


In [None]:
import json
import pathlib
import random
import shutil

import nltk
import openai
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

tqdm.pandas()

print("Done")

Load the csv file.

In [None]:
# Both tables of the corpus live in the same directory
corpus_dir = pathlib.Path("data/original/Human vs LLM Text Corpus")

data_df = pd.read_csv(corpus_dir / "data.csv")  # The texts themselves
prompts_df = pd.read_csv(corpus_dir / "prompts.csv")  # The prompts the texts were written for

print("Done")

Extracting the data we want.

* Select only Human texts.
* Add the prompt to each row of human text.
* Remove all columns but the ones we care about.
* Cleaning the text a little.

In [None]:
# Keep only the rows whose text was written by a human
human_sources = data_df[data_df["source"] == "Human"]

# Add in the prompts

# The prompts table names its key "Prompt ID"; rename it so both tables share "prompt_id"
prompts_df = prompts_df.rename(columns={"Prompt ID": "prompt_id"})
# Left merge so that every human text is kept, with its prompt attached where one exists
human_sources = human_sources.merge(prompts_df[['prompt_id', 'Prompt']], on='prompt_id', how='left')
# Rename the Prompt column to prompt as I like lowercase more
human_sources = human_sources.rename(columns={"Prompt": "prompt"})
# Remove any whose prompt is undefined
# NOTE(review): texts whose prompt_id has no match get a missing (NaN) prompt from the
# left merge and pass this comparison, so they stay in the data
human_sources = human_sources[human_sources["prompt"] != "Undefined"]

# Grab only the text and prompt columns
human_sources = human_sources.loc[:, ["text", "prompt"]]

# Strip the leading and trailing whitespace from the texts
human_sources["text"] = human_sources["text"].str.strip()
# Replace non-breaking spaces with regular spaces.
# NOTE(review): the character being replaced was an invisible whitespace literal;
# assumed to be U+00A0 (non-breaking space) and spelled as an escape here - confirm
human_sources["text"] = human_sources["text"].str.replace("\u00a0", " ", regex=False)

print("Done")

Adding new columns that hold each text's sentences and their character spans, and another for the number of sentences in each text.  
This takes a while because of the number of human texts.

In [None]:
# Sentence tokenizer shared by every cell that needs to split a text into sentences
tokenizer = nltk.tokenize.PunktTokenizer()


def _modify(_row: pd.Series):
    """
    Split the text of a row into sentences.

    :param _row: A row with a "text" entry.
    :return: A series with two entries:
        - "sentence_spans": The (start, stop) character offsets of each sentence in the text.
        - "sentences": The sentences themselves, sliced out of the text using those spans.
    """

    _text = _row["text"]
    _spans = list(tokenizer.span_tokenize(_text))

    return pd.Series({
        "sentence_spans": _spans,
        "sentences": [_text[_begin:_finish] for _begin, _finish in _spans]
    })


# Create a column for the spans of the sentences and another for the actual sentences
tqdm.pandas(desc="Tokenizing texts to sentences")
human_sources[["sentence_spans", "sentences"]] = human_sources.progress_apply(
    _modify,
    axis=1
)

# And another column for the number of sentences
# progress_apply (rather than apply) so the description registered here is actually shown
tqdm.pandas(desc="Counting the number of sentences in each text")
human_sources["num_sentences"] = human_sources["sentences"].progress_apply(len)

print("Done")

Filter to the texts that we want.  
That is, texts with at least 40 and fewer than 60 sentences (inclusive of 40, but not 60),
keeping up to 50 texts for each number of sentences.  
Why this?
Well, because I wanted an even spread of longish data.

In [None]:
MIN_NUM_SENTENCES = 40  # Smallest sentence count to keep (inclusive)
MAX_NUM_SENTENCES = 60  # Largest sentence count to keep (exclusive)
NUM_PER_SUBSET = 50  # Maximum number of texts kept for each sentence count

picked_sentences_list = []
for num in range(MIN_NUM_SENTENCES, MAX_NUM_SENTENCES):
    _subset = human_sources[human_sources["num_sentences"] == num]
    if len(_subset) < NUM_PER_SUBSET:
        # Not enough texts of this length to sample from, keep them all
        picked_sentences_list.append(_subset)
    else:
        # Fixed random_state so the same texts are picked on every run
        picked_sentences_list.append(_subset.sample(n=NUM_PER_SUBSET, random_state=42))
picked_sentences = pd.concat(picked_sentences_list)

print("Done")

Now we select 50% of the data to be modified in some way by an AI.

In [None]:
# One entry per sentence count; joined into single data frames after the loop
_unmodified_parts = []
_to_modify_parts = []

for sentence_count in range(MIN_NUM_SENTENCES, MAX_NUM_SENTENCES):
    same_length_texts = picked_sentences[picked_sentences["num_sentences"] == sentence_count]

    # Half of the texts of this length are left alone (fixed random_state for repeatability)...
    kept_as_is = same_length_texts.sample(frac=0.5, random_state=42)
    _unmodified_parts.append(kept_as_is)

    # ...and whatever was not sampled is handed to the AI
    _to_modify_parts.append(same_length_texts.drop(kept_as_is.index))

unmodified_texts = pd.concat(_unmodified_parts)
to_be_ai_modified_texts = pd.concat(_to_modify_parts)

print("Done")

Due to working with an LLM, some of the responses may not be in the requested format.  
This is a simple data structure to track the types of failures that occur during modification.

In [None]:
# Total number of rows skipped so far
num_skips = 0
# How many times each reason for skipping a row has occurred
reasons_for_skipping_rows: dict[str, int] = {}
# The reason given for the most recent skip
last_skip_reason: str = ""


def log_row_skip(_reason_for_skip: str):
    """
    Record that a row was skipped.

    :param _reason_for_skip: A short description of why the row was skipped.
        Skips are counted separately for each distinct description.
    """
    global num_skips, last_skip_reason

    reasons_for_skipping_rows[_reason_for_skip] = reasons_for_skipping_rows.get(_reason_for_skip, 0) + 1

    num_skips += 1
    last_skip_reason = _reason_for_skip


def print_failures():
    """
    Print every recorded reason for skipping a row alongside how often it occurred.

    Only the heading is printed when no skips have been recorded.
    """
    print("Types of failures:")
    # Pad the reasons to a common width so the counts line up.
    # default=0 keeps this from raising when nothing has been recorded.
    padding = max((len(_key) for _key in reasons_for_skipping_rows), default=0)

    for _key, _value in reasons_for_skipping_rows.items():
        print(f"\t{_key:<{padding}} : {_value}")


def reset_skip_logging():
    """
    Forget every recorded skip so that a new run starts counting from zero.
    """
    global num_skips, reasons_for_skipping_rows, last_skip_reason

    num_skips = 0
    reasons_for_skipping_rows = {}
    last_skip_reason = ""


print("Done")

Helper function to select sentences to be modified.

Sentences could be selected either at random or as a cluster.
This function selects a cluster: a random run of `n` consecutive items from the given list.

In [None]:
def select_sentences(_sentences: list, n: int) -> tuple[list, list, list]:
    """
    Pick a random run of `n` consecutive items and split the list around it.

    The start of the run is drawn with `random.randint`, so seed `random` beforehand
    for repeatable selections.

    :param _sentences: List of sentences to select from.
    :param n: Number of sentences to select.
    :return: A tuple containing three lists:
        - Sentences before the selected range.
        - The selected sentences.
        - Sentences after the selected range.
    :raises ValueError: If `n` is negative or larger than the number of sentences.
    """

    if not 0 <= n <= len(_sentences):
        raise ValueError(
            f"Cannot select {n} sentences from a list of {len(_sentences)}"
        )

    # The run must fit inside the list, so this is the last index it may start at
    _last_available_start = len(_sentences) - n
    _start = random.randint(0, _last_available_start)
    _end = _start + n

    return _sentences[:_start], _sentences[_start:_end], _sentences[_end:]


print("Done")

Storage for the modified texts alongside a helper function for adding a row of data.

In [None]:
ai_modified_texts = pd.DataFrame(columns=[
    # The text before and after modification
    "text",
    "modified_text",  # The entire text with the modified sentences
    "modified_text_subset",  # The response from our AI for modification
    # The number of sentences before and after modification
    "num_sentences",
    "modified_num_sentences"
    # Sentences before and after modification
    "sentences",
    "modified_sentences",
    # The spans of the sentences before and after modification
    "sentence_spans",
    "modified_sentence_spans",
    # If the span in the corresponding index is AI modified
    # With reference to "modified_sentence_spans" only
    "span_ai_modified",
    # Is false if an error occurred
    "success"
])


def add_row_to_ai_modified_texts(_original_row: pd.Series,
                                 _modified_text: str | None,
                                 _modified_text_subset: str | None,
                                 _modified_num_sentences: int | None,
                                 _modified_sentences: list | None,
                                 _modified_sentence_spans: list | None,
                                 _span_ai_modified: list | None,
                                 _success: bool):
    global ai_modified_texts

    _new_row = pd.DataFrame(data={
        "text": _original_row["text"],
        "modified_text": _modified_text,
        "modified_text_subset": _modified_text_subset,
        "num_sentences": _original_row["num_sentences"],
        "modified_num_sentences": _modified_num_sentences,
        "sentences": [_original_row["sentences"]],
        "modified_sentences": [_modified_sentences],
        "sentence_spans": [_original_row["sentence_spans"]],
        "modified_sentence_spans": [_modified_sentence_spans],
        "span_ai_modified": [_span_ai_modified],
        "success": _success
    })

    ai_modified_texts = pd.concat([ai_modified_texts, _new_row])


print("Done")

A collection of definitions and helper functions with relation to the OpenAI API.

In [None]:
# The API key is kept in a separate file so that it never ends up in the notebook
api_key = pathlib.Path("api_key.txt").read_text().strip()

# The model used for every request ("gpt-3.5-turbo" was the alternative considered)
MODEL = "gpt-4o-mini"
CLIENT = OpenAI(api_key=api_key)

# Tokens used to mark the different sections of a modification prompt
END_PROMPT_TAG = "<<END PROMPT>>"
START_TEXT = "<<START TEXT>>"
MISSING_SENTENCES_TAG = "<<MISSING SENTENCES>>"

# The system message explaining the layout of a modification prompt and what to respond with
# NOTE(review): "You will received" is a typo; left untouched so the prompts sent stay the same
MODIFICATION_ROLE = f"""
You will received a prompt that was used to write a piece of text, followed by a token {END_PROMPT_TAG}.
Then, I will indicate the number of sentences that need to be generated.
After that, you will see a token {START_TEXT} marking the start of the text wherein a chunk of sentences are missing. The missing sentences are represented by a single token {MISSING_SENTENCES_TAG}.

Using the prompt and the text before and after the {MISSING_SENTENCES_TAG} token, create the specified number of consecutive sentences to fill this gap.
Ensure that your new sentences flow seamlessly with the surrounding text, maintaining the style, tone, and context.

Please provide only the new sentences, with no additional formatting or notes.
""".strip()


def create_modification_prompt(_prompt_for_text: str, _num_missing_sentences: int,
                               _text_before_missing: str, _text_after_missing) -> str:
    """
    Build the user message asking for the missing sentences of a text.

    :param _prompt_for_text: The prompt that the text was originally written for.
    :param _num_missing_sentences: The number of sentences to ask for.
    :param _text_before_missing: The text leading up to the missing sentences.
    :param _text_after_missing: The text following the missing sentences.
    :return: The sections in the order described by `MODIFICATION_ROLE`, one per line:
        the prompt, `END_PROMPT_TAG`, the request, `START_TEXT`, the text before,
        `MISSING_SENTENCES_TAG` and finally the text after.
    """

    _sections = [
        _prompt_for_text,
        END_PROMPT_TAG,
        f"Create {_num_missing_sentences} new sentences.",
        START_TEXT,
        _text_before_missing,
        MISSING_SENTENCES_TAG,
        _text_after_missing,
    ]

    return "\n".join(_sections)


def send_prompt_with_role(_prompt: str, _role: str):
    """
    Send a single prompt to the chat model `MODEL` through `CLIENT`.

    :param _prompt: The prompt message to be sent to the chat model, sent as the user message.
    :param _role: The text for the role to be sent to the chat model, sent as the system message.
    :return: The response from the chat model, what is returned from `CLIENT.chat.completions.create`.
    """

    _messages = [
        {"role": "system", "content": _role},
        {"role": "user", "content": _prompt}
    ]

    return CLIENT.chat.completions.create(model=MODEL, messages=_messages)

Now we have the AI modify the selected sentences.

Here we are using a simplified prompt.  
We need to select sentences to remove, then ask ChatGPT to regenerate them.  
Finally we reconstruct the text with these new sentences.

Steps:

1. Grab an entry to work with.
2. Calculate the number of sentences to modify.
3. Extract the sentences to be modified.
4. Form a prompt to ChatGPT and send it.
5. Acquire the response.
6. Reform the text with the modified sentences.
7. Perform some double checks to provide assurance that the modifications were successful.
8. Save the modified text and associated information.

In [None]:
reset_skip_logging()

# Maps a text's number of sentences to the percentage of its sentences to modify next.
# That is if processing_info[45] is 20, then 20% of the next text with 45 sentences is to be modified.
# This value is incremented by 10 after use and reset to 10 after using 50,
# so texts of the same length cycle through 10%, 20%, 30%, 40% and 50%.
processing_info = {
    i: 10
    for i in range(MIN_NUM_SENTENCES, MAX_NUM_SENTENCES)
}

# Seed the RNG for consistency
# At least with selecting the sentences
random.seed(42)

progress_bar = tqdm(total=len(to_be_ai_modified_texts),
                    desc="Modifying texts",
                    postfix={"skips": num_skips})


def _record_failed_row(_reason: str, _row: pd.Series, **_partial_results):
    """
    Log a skipped row, store it as unsuccessful and advance the progress bar.

    :param _reason: Why the row was skipped, passed on to `log_row_skip`.
    :param _row: The original row that could not be modified.
    :param _partial_results: Any keyword arguments of `add_row_to_ai_modified_texts`
        that were computed before the failure. Those not given are stored as None.
    """
    log_row_skip(_reason)

    _stored = {
        "_modified_text": None,
        "_modified_text_subset": None,
        "_modified_num_sentences": None,
        "_modified_sentences": None,
        "_modified_sentence_spans": None,
        "_span_ai_modified": None,
    }
    _stored.update(_partial_results)
    add_row_to_ai_modified_texts(_original_row=_row, _success=False, **_stored)

    progress_bar.update()
    progress_bar.set_postfix(skips=num_skips)


for _, row in to_be_ai_modified_texts.iterrows():
    # Calculate the number of sentences to modify
    percentage_to_be_modified = processing_info[row["num_sentences"]] / 100
    num_sentences_to_modify = round(percentage_to_be_modified * row["num_sentences"])

    # Move on to the next percentage for texts with this number of sentences
    processing_info[row["num_sentences"]] += 10
    if processing_info[row["num_sentences"]] > 50:
        processing_info[row["num_sentences"]] = 10

    # Collect the sentences to be modified
    # Each sentence is paired with its span so both stay together through the selection
    sentences_with_spans: list[tuple[str, tuple[int, int]]]
    sentences_with_spans = list(zip(row["sentences"], row["sentence_spans"]))
    sentences_before, selected_sentences, sentences_after = select_sentences(
        sentences_with_spans,
        num_sentences_to_modify
    )

    # Grab the text before and after the removed sentences
    # text_before keeps the whitespace leading up to the first removed sentence,
    # text_after keeps the whitespace following the last removed sentence
    if sentences_before:
        selected_sentences_start = selected_sentences[0][1][0]
    else:
        selected_sentences_start = 0

    if sentences_after:
        selected_sentences_end = selected_sentences[-1][1][1]
    else:
        selected_sentences_end = sentences_with_spans[-1][1][1]

    text_before = row["text"][:selected_sentences_start]
    text_after = row["text"][selected_sentences_end:]

    # Create the prompt for ChatGPT
    modification_prompt = create_modification_prompt(
        row["prompt"],
        num_sentences_to_modify,
        text_before,
        text_after
    )

    # To our AI Overlord, ChatGPT
    try:
        modification_response = send_prompt_with_role(_prompt=modification_prompt, _role=MODIFICATION_ROLE)

        # Extract our data from the response
        modified_text_subset = modification_response.choices[0].message.content

    except openai.BadRequestError:
        _record_failed_row("openai.BadRequestError", row)
        continue

    # A response is not guaranteed to carry any text,
    # and the tokenizer below cannot handle None
    if modified_text_subset is None:
        _record_failed_row("Response contained no text", row)
        continue

    # Verify that we have an appropriate number of sentences
    sentence_spans_in_response = list(tokenizer.span_tokenize(modified_text_subset))
    sentences_in_response = [
        modified_text_subset[start:stop] for start, stop in sentence_spans_in_response
    ]
    num_sentences_in_response = len(sentences_in_response)

    num_sentences_in_response_variance = int(num_sentences_to_modify // 4)  # Allow a variance of 25% when receiving sentences
    min_num_sentences_in_response = num_sentences_to_modify - num_sentences_in_response_variance
    max_num_sentences_in_response = num_sentences_to_modify + num_sentences_in_response_variance

    if not min_num_sentences_in_response <= num_sentences_in_response <= max_num_sentences_in_response:
        _record_failed_row("Too large of a mismatch in received number of sentences", row,
                           _modified_text_subset=modified_text_subset)
        continue

    # Recreate the text with the modified sentences in place
    modified_text = text_before + modified_text_subset + text_after

    # Now we regenerate the sentence spans and individual sentences from those spans
    modified_text_sentence_spans = list(tokenizer.span_tokenize(modified_text))
    modified_text_sentences = [
        modified_text[start:stop] for start, stop in modified_text_sentence_spans
    ]
    modified_text_num_sentences = len(modified_text_sentences)

    # Double check we have the expected number of sentences
    # Tokenizing the whole text may split sentences differently than tokenizing the response alone
    expected_number_of_sentences = len(sentences_before) + num_sentences_in_response + len(sentences_after)
    if modified_text_num_sentences != expected_number_of_sentences:
        _record_failed_row("Expected number of sentences after text regeneration does not match expected value", row,
                           _modified_text=modified_text,
                           _modified_text_subset=modified_text_subset,
                           _modified_num_sentences=modified_text_num_sentences,
                           _modified_sentences=modified_text_sentences,
                           _modified_sentence_spans=modified_text_sentence_spans)
        continue

    # We know the number of sentences before and after the modifications themselves
    # We know how many sentences there were in the modifications
    # So we can create a list of bools for if a sentence was modified from that
    is_sentence_modified = [False] * len(sentences_before) \
                           + [True] * num_sentences_in_response \
                           + [False] * len(sentences_after)

    # We have a valid new text, add this to the data frame
    add_row_to_ai_modified_texts(_original_row=row,
                                 _modified_text=modified_text,
                                 _modified_text_subset=modified_text_subset,
                                 _modified_num_sentences=modified_text_num_sentences,
                                 _modified_sentences=modified_text_sentences,
                                 _modified_sentence_spans=modified_text_sentence_spans,
                                 _span_ai_modified=is_sentence_modified,
                                 _success=True)

    # The name's Bar, Progress Bar
    progress_bar.update()

progress_bar.close()

# Guard against there having been no texts to modify at all
num_texts_processed = len(to_be_ai_modified_texts)
failed_fraction = num_skips / num_texts_processed if num_texts_processed else 0.0
print(f"Finished with {failed_fraction:.2%} failed")

# Only list the failure types when something actually failed
if reasons_for_skipping_rows:
    print_failures()

print("Done")

Print the data

In [None]:
ai_modified_texts

Finally, we save the data to files.  
I will only be saving those that were successfully modified.

In [None]:
save_dir = pathlib.Path("./data/prepared")

original_texts_dir = save_dir / "unmodified_texts"
ai_modified_texts_dir = save_dir / "ai_modified_texts"

# Start from empty directories so that files from a previous run cannot linger
for _output_dir in (original_texts_dir, ai_modified_texts_dir):
    if _output_dir.exists():
        shutil.rmtree(_output_dir)
    # parents=True so that a missing "data/prepared" is created as well
    _output_dir.mkdir(parents=True)

# Save the original texts
for i, (_, row) in enumerate(unmodified_texts.iterrows()):
    # Make the data a little more usable
    data = {
        "text": row["text"],
        "sentences": row["sentences"],
        # Nothing in these texts was touched by the AI
        "ai_modified": [0 for _ in range(len(row["sentences"]))]
    }

    file = original_texts_dir / f"Text {i}.json"
    with file.open("w") as f:
        f.write(json.dumps(data))

# Save the AI modified texts
# Only the successful ones are written, numbered consecutively from 0
successful_rows = (row for _, row in ai_modified_texts.iterrows() if row["success"])
for i, row in enumerate(successful_rows):
    data = {
        "text": row["modified_text"],
        "sentences": row["modified_sentences"],
        # NOTE(review): these flags are bools and are written as true/false,
        # whereas the unmodified texts above use the integer 0 - confirm readers accept both
        "ai_modified": row["span_ai_modified"]
    }

    file = ai_modified_texts_dir / f"Text {i}.json"
    with file.open("w") as f:
        f.write(json.dumps(data))

print("Done")