# Preparing Data

Here we grab the data used from the Human vs LLM Text Corpus, extract a subset we want to use, modify some of it and then save it to a series of files.  
Not necessarily in that order, but you get the idea.


In [None]:
import json
import pathlib
import random
import shutil
from typing import Literal

import nltk
import openai
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Hook tqdm into pandas so that the `progress_apply` calls used further down are available
tqdm.pandas()

# Signals that this cell has finished running
print("Done")

Load the csv file.

In [None]:
# Read the complete Human vs LLM Text Corpus into a data frame
corpus_csv_path = pathlib.Path("data/original/Human vs LLM Text Corpus/data.csv")
df = pd.read_csv(corpus_csv_path)

print("Done")

Extracting the data we want.

* Remove all columns but the ones we care about.
* Select only Human texts.
* Cleaning the text a little.

In [None]:
# Keep only the human written rows, and of those only the text and source columns.
# The explicit copy makes this an independent frame, so assigning columns to it
# (below and in later cells) does not trigger pandas' chained-assignment warning.
human_sources = df.loc[df["source"] == "Human", ["text", "source"]].copy()

# Strip the leading and trailing whitespace from the texts
human_sources["text"] = human_sources["text"].str.strip()

print("Done")

Adding new columns holding the sentence spans and the sentences of each text, and another for the number of sentences in each text.  
This takes a bit because of the number of human texts.

In [None]:
# Punkt sentence tokenizer, also used later on to re-tokenize the modified texts
tokenizer = nltk.tokenize.PunktTokenizer()


def _modify(_row: pd.Series) -> pd.Series:
    """
    Split the text of a single row into sentences.

    :param _row: Row holding the text to split under the key "text"
    :return: Series with "sentence_spans", the (start, stop) offsets of each sentence within the text,
             and "sentences", the slices of the text that belong to those spans
    """

    _text = _row["text"]
    _sentence_spans = list(tokenizer.span_tokenize(_text))
    _sentences = [_text[start:stop] for start, stop in _sentence_spans]

    return pd.Series({
        "sentence_spans": _sentence_spans,
        "sentences": _sentences
    })


# Create columns for the spans of the sentences and the actual sentences
tqdm.pandas(desc="Tokenizing texts to sentences")
human_sources[["sentence_spans", "sentences"]] = human_sources.progress_apply(
    _modify,
    axis=1
)

# And another column for the number of sentences
# progress_apply is used so that the progress bar configured on the next line is actually shown
tqdm.pandas(desc="Counting the number of sentences in each text")
human_sources["num_sentences"] = human_sources["sentences"].progress_apply(len)

print("Done")

Filter to the texts that we want.  
That is, texts with at least 40 but fewer than 60 sentences:
50 texts for each number of sentences from 40 to 60,
inclusive of 40, but not 60.  
Why this?
Well, because I wanted some evenly distributed, longish data.

In [None]:
# Texts are picked by their number of sentences, from MIN_NUM_SENTENCES up to, but excluding, MAX_NUM_SENTENCES
MIN_NUM_SENTENCES = 40
MAX_NUM_SENTENCES = 60

# Draw 50 texts for every number of sentences, with a fixed seed so the draw is repeatable
samples_per_num_sentences = []
for num in range(MIN_NUM_SENTENCES, MAX_NUM_SENTENCES):
    has_num_sentences = human_sources["num_sentences"] == num
    samples_per_num_sentences.append(
        human_sources[has_num_sentences].sample(n=50, random_state=42)
    )
picked_sentences = pd.concat(samples_per_num_sentences)

print("Done")

Now we select 50% of the data to be modified in some way by an AI.

In [None]:
# For every number of sentences, one half of the picked texts stays as is and the other half goes to the AI
unmodified_parts = []
to_be_ai_modified_parts = []

for num in range(MIN_NUM_SENTENCES, MAX_NUM_SENTENCES):
    same_length_texts = picked_sentences[picked_sentences["num_sentences"] == num]

    # The seeded 50% sample is left untouched, whatever remains is handed to the AI
    kept_as_is = same_length_texts.sample(frac=0.5, random_state=42)
    unmodified_parts.append(kept_as_is)
    to_be_ai_modified_parts.append(same_length_texts.drop(kept_as_is.index))

unmodified_texts = pd.concat(unmodified_parts)
to_be_ai_modified_texts = pd.concat(to_be_ai_modified_parts)

print("Done")

Due to working with an LLM, some of the responses may not be in the requested format.  
This is a simple data structure to track the types of failures that occur during modification.

In [None]:
# Total number of skip reasons logged so far
num_skips = 0
# How many times each skip reason has been logged
reasons_for_skipping_rows: dict[str, int] = {}
# The reason most recently passed to `log_row_skip`
last_skip_reason: str = ""


def log_row_skip(_reason_for_skip: str):
    """
    Record that a row was skipped for the given reason.

    :param _reason_for_skip: Short description of why the row was skipped
    """
    global num_skips, last_skip_reason

    reasons_for_skipping_rows[_reason_for_skip] = reasons_for_skipping_rows.get(_reason_for_skip, 0) + 1

    num_skips += 1
    last_skip_reason = _reason_for_skip


def print_failures():
    """
    Print every logged skip reason alongside the number of times it was logged.

    Only the header line is printed when nothing has been logged.
    """
    print("Types of failures:")
    # `default` stops max() from raising a ValueError when no failure has been logged
    padding = max((len(_key) for _key in reasons_for_skipping_rows), default=0)

    for _key, _value in reasons_for_skipping_rows.items():
        print(f"\t{_key:<{padding}} : {_value}")


print("Done")

Helper function to select sentences to be modified.

Either you can select sentences randomly, or in a cluster.
This function simply selects `n` items from the given list.

In [None]:
def select_sentences(_sentences: list, n: int, select_by: Literal["group", "random"]) -> list:
    """
    Pick `n` items out of the given list.

    :param _sentences: List of sentences to select from
    :param n: Number of sentences to be selected
    :param select_by: 'random' picks the items with `random.sample`,
                      'group' picks a single run of `n` neighbouring items
    :return: List of selected sentences
    :raises ValueError: If `select_by` is neither 'group' nor 'random'
    """

    if select_by not in ("group", "random"):
        raise ValueError(f"Unknown select_by value: '{select_by}'")

    if select_by == "random":
        return random.sample(_sentences, n)

    # Every start index that leaves room for `n` items is a valid beginning of the run
    _first = random.randint(0, len(_sentences) - n)
    return _sentences[_first:_first + n]


print("Done")

Storage for the modified texts alongside a helper function for adding a row of data.

In [None]:
ai_modified_texts = pd.DataFrame(columns=[
    # The text before and after modification
    "text",
    "modified_text",
    # The number of sentences before and after modification
    "num_sentences",
    "modified_num_sentences"
    # Sentences before and after modification
    "sentences",
    "modified_sentences",
    # The spans of the sentences before and after modification
    "sentence_spans",
    "modified_sentence_spans",
    # If the span in the corresponding index is AI modified
    # With reference to "modified_sentence_spans" only
    "span_ai_modified",
    # Is false if an error occurred
    "success"
])


def add_row_to_ai_modified_texts(_original_row: pd.Series,
                                 _modified_text: str | None,
                                 _modified_num_sentences: int | None,
                                 _modified_sentences: list | None,
                                 _modified_sentence_spans: list | None,
                                 _span_ai_modified: list | None,
                                 _success: bool):
    global ai_modified_texts

    _new_row = pd.DataFrame(data={
        "text": _original_row["text"],
        "modified_text": _modified_text,
        "num_sentences": _original_row["num_sentences"],
        "modified_num_sentences": _modified_num_sentences,
        "sentences": [_original_row["sentences"]],
        "modified_sentences": [_modified_sentences],
        "sentence_spans": [_original_row["sentence_spans"]],
        "modified_sentence_spans": [_modified_sentence_spans],
        "span_ai_modified": [_span_ai_modified],
        "success": _success
    })

    ai_modified_texts = pd.concat([ai_modified_texts, _new_row])


print("Done")

A collection of definitions and helper functions with relation to the OpenAI API.

In [None]:
# The API key is read from a separate text file rather than being written into the notebook
api_key = pathlib.Path("api_key.txt").read_text().strip()

# The chat model that the requests are sent to
MODEL = "gpt-4o-mini"
# MODEL = "gpt-3.5-turbo"
CLIENT = OpenAI(api_key=api_key)

# Delimiter tokens used in the prompt built by `create_modification_prompt`:
# one is placed after the full text, the other after each sentence that is to be modified
END_TEXT_TAG = "<<42 END TEXT 42>>"
END_SENTENCE_TAG = "<<42 END SENTENCE 42>>"
# System message for the modification request, asks for the modified sentences as a JSON array of strings
# NOTE(review): "Each sentence if followed by" below looks like a typo for "is followed by".
# It is left as is here, because this text is part of what is sent to the model.
MODIFICATION_ROLE = f"""
I will give you text.
Then a token {END_TEXT_TAG}.
Followed by a series of sentences found in that text.
Each sentence if followed by a token {END_SENTENCE_TAG}.

You are to modify only those sentences, keeping the core meaning and intent the same.
However, make more significant changes beyond simple word substitutions:
    Rephrase sentences in a way to alter their structure or expression while maintaining the meaning.
    You may change the sentence's style, using different phrasing, or by reordering information.
    The formatting/whitespace for each sentence should be kept similar. For example if the sentence consists of a title section then a content section with two newlines inbetween, your response should contain the same.
You are only to create one sentence for each sentence requested.

Please provide the modified sentences in a JSON format:
    ["sentence1", "sentence2", ...]
in the same order they were received.
Make sure to provide a valid JSON response.
Return **only** this array with no additional formatting (e.g., no extra arrays, headers, or comments).
""".strip()

# System message for the follow-up request that is sent when the first response fails to decode as JSON
JSON_FIXER_ROLE = """
I will give you text that is supposed to be a valid JSON array of strings.
However, the text may contain formatting errors such as:
- Multiple arrays instead of one
- Extra commas or brackets
- Missing or extra quotation marks

Your job is to fix these issues and return the text as a valid JSON array.
Please ensure the response is strictly in the format:
    ["sentence1", "sentence2", ...]
and contains no errors.
""".strip()


def create_modification_prompt(_text: str, _sentences_to_modify: list[str]) -> str:
    """
    Build the user message asking for some sentences of a text to be modified.

    The message consists of the text, a line holding `END_TEXT_TAG` and then each sentence to modify,
    every one of them followed by a line holding `END_SENTENCE_TAG`.

    :param _text: The complete text that the sentences were taken from
    :param _sentences_to_modify: The sentences to be modified, a list or tuple of strings.
                                 If it is empty, the prompt ends right after the end of text tag
    :return: The prompt to be sent to the chat model
    """

    _prompt_parts = [_text, f"\n{END_TEXT_TAG}\n"]
    if _sentences_to_modify:
        # The tag goes inbetween the sentences and once more after the last one
        _prompt_parts.append(f"\n{END_SENTENCE_TAG}\n".join(_sentences_to_modify))
        _prompt_parts.append(f"\n{END_SENTENCE_TAG}")

    return "".join(_prompt_parts)


def send_prompt_with_role(_prompt: str, _role: str):
    """
    Send one prompt to the chat model, with the role text as the system message.

    :param _prompt: The prompt message to be sent to the chat model, sent as the user message.
    :param _role: The text for the role to be sent to the chat model, sent as the system message.
    :return: The response from the chat model, what is returned from `CLIENT.chat.completions.create`.
    """

    _messages = [
        {"role": "system", "content": _role},
        {"role": "user", "content": _prompt},
    ]

    return CLIENT.chat.completions.create(model=MODEL, messages=_messages)

Now we have the AI modify the sentences.

I have flattened this loop compared to the first version making this a bit easier to read and understand.

Steps:

1. Grab an entry to work with.
2. Calculate the number of sentences to modify.
3. Extract the sentences to be modified.
4. Form a prompt to ChatGPT and send it.
5. Acquire the response.  
   Attempt to decode it.  
   Check we received the correct number of sentences.  
   If this fails then skip.
6. Reform the text with the modified sentences.
7. Perform some double checks to provide assurance that the modifications were successful.
8. Save the modified text and associated information.

In [None]:
# Maps a number of sentences to the percentage of sentences to modify in the next text of that length
# That is if processing_info[45] is 20, then 20% of the next text with 45 sentences is to be modified
# This value is incremented by 10 after use and reset to 10 after using 50.
processing_info = {
    i: 10
    for i in range(MIN_NUM_SENTENCES, MAX_NUM_SENTENCES)
}

# Seed the RNG for consistency
# At least with selecting the sentences
random.seed(42)

# The number of texts that could not be modified
# Tracked separately from num_skips, which counts logged reasons and a single text can log more than one
num_failed_texts = 0

progress_bar = tqdm(total=len(to_be_ai_modified_texts),
                    desc="Modifying texts",
                    postfix={"skips": num_skips})

for _, row in to_be_ai_modified_texts.iterrows():
    # Calculate the number of sentences to modify
    percentage_to_be_modified = processing_info[row["num_sentences"]]
    percentage_to_be_modified /= 100
    num_sentences_to_modify = round(percentage_to_be_modified * row["num_sentences"])

    # Update the percentage for the next text with this number of sentences
    processing_info[row["num_sentences"]] += 10
    if processing_info[row["num_sentences"]] > 50:
        processing_info[row["num_sentences"]] = 10

    # Collect the sentences to be modified
    sentences_with_spans = list(zip(row["sentences"], row["sentence_spans"]))
    sampled_sentences = select_sentences(sentences_with_spans, num_sentences_to_modify, select_by="group")
    # sampled_sentences = select_sentences(sentences_with_spans, num_sentences_to_modify, select_by="random")
    sentences_to_modify, sentence_spans_to_modify = zip(*sampled_sentences)

    # Create the prompt for ChatGPT
    modification_prompt = create_modification_prompt(row["text"], sentences_to_modify)

    # To our AI Overlord, ChatGPT
    # The flag is set wherever a failure is logged, so that no failure is logged a second time further down
    request_failed = False
    response_list = []
    try:
        modification_response = send_prompt_with_role(_prompt=modification_prompt, _role=MODIFICATION_ROLE)

        # Extract our data from the response
        modification_response_json_text = modification_response.choices[0].message.content
        response_list = json.loads(modification_response_json_text)

    except openai.BadRequestError:
        log_row_skip("openai.BadRequestError")
        request_failed = True
    except json.JSONDecodeError:
        # Attempt to fix this decode error with another request to ChatGPT
        try:
            fix_response = send_prompt_with_role(
                # Name 'modification_response_json_text' can be undefined
                _prompt=modification_response_json_text,  # NOQA
                _role=JSON_FIXER_ROLE)

            # Extract the new data from the response
            fix_response_json_text = fix_response.choices[0].message.content
            response_list = json.loads(fix_response_json_text)

        except openai.BadRequestError:
            log_row_skip("openai.BadRequestError")
            request_failed = True
        except json.JSONDecodeError:
            log_row_skip("json.JSONDecodeError")
            request_failed = True

    # Making sure that a list with the correct number of sentences was received
    # Valid JSON does not have to be an array of strings, anything else can not be placed into the text
    if not request_failed:
        if not (isinstance(response_list, list) and all(isinstance(_item, str) for _item in response_list)):
            log_row_skip("Response is not a JSON array of strings")
            request_failed = True
        elif len(response_list) != num_sentences_to_modify:
            log_row_skip("Wrong number of sentences received")
            request_failed = True

    # Stop here if we have a failed request
    if request_failed:
        add_row_to_ai_modified_texts(_original_row=row,
                                     _modified_text=None,
                                     _modified_num_sentences=None,
                                     _modified_sentences=None,
                                     _modified_sentence_spans=None,
                                     _span_ai_modified=None,
                                     _success=False)
        num_failed_texts += 1
        # Update the progress bar
        progress_bar.update()
        progress_bar.set_postfix(skips=num_skips)
        continue

    # Collect the whitespace inbetween the normal sentences
    # That is the text from the end of each sentence up to the start of the next
    whitespace_between_sentences = [
        row["text"][start:stop]
        for start, stop in zip(
            [a for _, a in row["sentence_spans"][:-1]],
            [b for b, _ in row["sentence_spans"][1:]]
        )
    ]

    # Create a list of new sentences to form our text
    new_sentences = []
    is_sentence_modified = []
    for i, sentence_span in enumerate(row["sentence_spans"]):
        if sentence_span in sentence_spans_to_modify:
            is_sentence_modified.append(True)
            # The responses are in the same order as the spans that were sent
            new_sentences.append(
                response_list[sentence_spans_to_modify.index(sentence_span)]
            )
        else:
            is_sentence_modified.append(False)
            new_sentences.append(row["sentences"][i])

    # Regenerate the text using the modified sentences and the collected whitespace
    new_sentences_and_whitespace = []
    for sentence, whitespace in zip(new_sentences, whitespace_between_sentences):
        new_sentences_and_whitespace.append(sentence)
        new_sentences_and_whitespace.append(whitespace)
    new_sentences_and_whitespace.append(new_sentences[-1])
    modified_text = "".join(new_sentences_and_whitespace)

    # Now we regenerate the sentence spans and individual sentences from those spans
    modified_text_sentence_spans = list(tokenizer.span_tokenize(modified_text))
    modified_text_sentences = [
        modified_text[start:stop] for start, stop in modified_text_sentence_spans
    ]
    modified_text_num_sentences = len(modified_text_sentences)

    # Double-check our sentences after tokenization match the ones before
    # Sorry about the walrus
    if failure_matching_sentences_after_re_tokenization := (new_sentences != modified_text_sentences):
        log_row_skip("New sentences and modified text sentences don't match after re-tokenization")

    # Double check we have the correct number of sentences
    if failure_number_sentences_after_modification := (row["num_sentences"] != modified_text_num_sentences):
        log_row_skip("Number of sentences after modification don't match before modification")

    if failure_matching_sentences_after_re_tokenization or failure_number_sentences_after_modification:
        add_row_to_ai_modified_texts(_original_row=row,
                                     _modified_text=modified_text,
                                     _modified_num_sentences=modified_text_num_sentences,
                                     _modified_sentences=modified_text_sentences,
                                     _modified_sentence_spans=modified_text_sentence_spans,
                                     _span_ai_modified=None,
                                     _success=False)
        # Both checks may have logged a reason, the text itself still only failed once
        num_failed_texts += 1
        # Update Mr. Progress Bar
        progress_bar.update()
        progress_bar.set_postfix(skips=num_skips)
        continue

    # We have a valid new text, add this to the data frame
    add_row_to_ai_modified_texts(_original_row=row,
                                 _modified_text=modified_text,
                                 _modified_num_sentences=modified_text_num_sentences,
                                 _modified_sentences=modified_text_sentences,
                                 _modified_sentence_spans=modified_text_sentence_spans,
                                 _span_ai_modified=is_sentence_modified,
                                 _success=True)

    progress_bar.update()

progress_bar.close()
# Uses the number of failed texts and not num_skips, which would count a text with two logged reasons twice
print(f"Finished with {num_failed_texts / len(to_be_ai_modified_texts):.2%} failed")

print_failures()

print("Done")

Print the data

In [None]:
# Leaving the data frame as the last expression of the cell has the notebook display it
ai_modified_texts

Finally, we save the data to files.  
I will only be saving those that were successfully modified.

In [None]:
save_dir = pathlib.Path("./data/prepared")

original_texts_dir = save_dir / "unmodified_texts"
ai_modified_texts_dir = save_dir / "ai_modified_texts"

# Start from empty directories, so that no files from an earlier run are left behind
for directory in (original_texts_dir, ai_modified_texts_dir):
    if directory.exists():
        shutil.rmtree(directory)
    # parents=True also creates save_dir in case it does not exist yet
    directory.mkdir(parents=True)

# Save the original texts
for i, (_, row) in enumerate(unmodified_texts.iterrows()):
    # Make the data a little more usable
    # None of these sentences were touched by the AI, so every flag is 0
    data = {
        "text": row["text"],
        "sentences": row["sentences"],
        "ai_modified": [0] * len(row["sentences"])
    }

    file = original_texts_dir / f"Text {i}.json"
    with file.open("w") as f:
        json.dump(data, f)

# Save the AI modified texts
# Only the successfully modified texts are counted, so the numbering of the files has no gaps
successful_rows = (row for _, row in ai_modified_texts.iterrows() if row["success"])
for i, row in enumerate(successful_rows):
    # NOTE(review): "ai_modified" holds booleans here but the integer 0 in the unmodified files - confirm
    # that whatever reads these files accepts both
    data = {
        "text": row["modified_text"],
        "sentences": row["modified_sentences"],
        "ai_modified": row["span_ai_modified"]
    }

    file = ai_modified_texts_dir / f"Text {i}.json"
    with file.open("w") as f:
        json.dump(data, f)

print("Done")