# Preparing Data

Here we grab the data used from the Human vs LLM Text Corpus, extract a subset we want to use, modify some of it and then save it to a series of files.  


In [None]:
import json
import pathlib
import random
import shutil

import nltk
import openai
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; it is used further down
# when tokenizing the texts.
tqdm.pandas()

print("Done")

Load the csv file.

In [None]:
# Read the whole corpus CSV into a dataframe.
corpus_csv = pathlib.Path("data/original/Human vs LLM Text Corpus/data.csv")
df = pd.read_csv(corpus_csv)

print("Done")

Extracting the data we want.

* Remove all columns but the ones we care about.
* Select only Human texts
* Remove the source

In [None]:
# Rows whose source is labelled "Human"
is_human = df["source"] == "Human"
# Keep just the text and source columns of those rows
human_sources = df.loc[is_human, ["text", "source"]]

print("Done")

Adding new columns holding each text's sentence spans and sentences, and another for the number of sentences in each text.  
This takes a bit because of the number of human texts.

In [None]:
# Punkt sentence tokenizer, shared by the original-text tokenization below and
# the re-tokenization of the AI modified texts later on.
tokenizer = nltk.tokenize.PunktTokenizer()


def _modify(_row: pd.Series):
    """Split the text of one row into sentences.

    Uses the module-level ``tokenizer`` to find the character span of every
    sentence in ``_row["text"]`` and slices the text with those spans.

    :param _row: A row containing a ``"text"`` entry.
    :return: A series with ``"sentence_spans"`` (list of ``(start, stop)``
        tuples) and ``"sentences"`` (the matching substrings).
    """
    text = _row["text"]
    spans = list(tokenizer.span_tokenize(text))

    return pd.Series({
        "sentence_spans": spans,
        "sentences": [text[begin:end] for begin, end in spans],
    })


# Add columns for the spans of the sentences and for the actual sentences
tqdm.pandas(desc="Tokenizing texts to sentences")
human_sources[["sentence_spans", "sentences"]] = human_sources.progress_apply(
    _modify,
    axis=1
)

# And another column for the number of sentences.
# progress_apply (rather than plain apply) so that the description configured
# on the line below is actually displayed.
tqdm.pandas(desc="Counting the number of sentences in each text")
human_sources["num_sentences"] = human_sources["sentences"].progress_apply(len)

print("Done")

Filter to the texts that we want.  
That is, texts with at least 40 sentences:
50 texts for each number of sentences from 40 to 60,
inclusive of 40, but not 60.  
Why this?
Well, because I wanted some even longish data.

In [None]:
# For every sentence count from 40 up to (but excluding) 60, draw 50 texts
# with a fixed seed, then stack the draws into a single dataframe.
samples_per_length = [
    human_sources.loc[human_sources["num_sentences"] == sentence_count].sample(
        n=50,
        random_state=42
    )
    for sentence_count in range(40, 60)
]
picked_sentences = pd.concat(samples_per_length)

print("Done")

Now we select 50% of the data to be modified in some way by an AI.

In [None]:
# Per sentence count: half of the texts stay untouched, the rest go to the AI.
kept_parts = []
ai_parts = []

for sentence_count in range(40, 60):
    same_length = picked_sentences[picked_sentences["num_sentences"] == sentence_count]

    # Seeded draw of the half that stays as-is
    kept = same_length.sample(frac=0.5, random_state=42)

    kept_parts.append(kept)
    # Whatever was not drawn is handed over for modification
    ai_parts.append(same_length.drop(kept.index))

unmodified_texts = pd.concat(kept_parts)
to_be_ai_modified_texts = pd.concat(ai_parts)

print("Done")

Now we modify the AI sentences.

So steps in English as this is a doozy:

1. Grab all texts with a specific number of sentences
2. Separate those sentences into 5 subsets  
   Each subset has a different amount modified
3. Find the number of sentences to modify, let's say n
4. Randomly select n number of sentences to modify
5. Form a prompt to ChatGPT asking it to modify the n chosen sentences
6. Extract the modified sentences from ChatGPT  
   Skip if there is an error or mismatch in the received number of sentences
7. Recreate the text with the AI modified sentences preserving the whitespace
8. Add an entry for this text into a dataframe for later use

In [None]:
# The OpenAI key is read from api_key.txt
api_key = pathlib.Path("api_key.txt").read_text().strip()

# Alternative model: "gpt-4o-mini"
MODEL = "gpt-3.5-turbo"
CLIENT = OpenAI(api_key=api_key)
# System prompt sent with every request
ROLE = """I will give you text.
Then a token <<42 END TEXT 42>>.
Followed by a series of sentences found in that text.
Each sentence if followed by a token <<42 END SENTENCE 42>>.
The formatting/whitespace for each sentence should be kept similar.

You are to modify only those sentences, keeping the content the same.
You are only to create one sentence for each sentence requested.

Please provide the modified sentences in a json format:
    ["sentence1", "sentence2", ...]
in the same order they were received.
With no other formatting such as ```json ... ```
"""

# One row per processed text, whether or not the modification worked
ai_modified_texts = pd.DataFrame(columns=[
    "text",
    "num_sentences",
    "sentences",
    "sentence_spans",
    "span_ai_modified",  # If the span in the corresponding index is AI modified
    "success",  # Is false if an error occurred
])

# Fixed seed for the sentence sampling done with the random module
random.seed(42)

# Running counters
fails = successes = iterations = 0

progress_bar = tqdm(
    total=len(to_be_ai_modified_texts),
    desc="Modifying texts",
    postfix=dict(fails=fails, successes=successes),
)

# Failure bookkeeping: the most recent failure and a count per failure type
last_failure = ""
types_of_failures = {}


def add_failure(failure: str):
    """Count one occurrence of ``failure`` and remember it as the latest one.

    Increments the entry for ``failure`` in the module-level
    ``types_of_failures`` dict (creating it when first seen) and stores the
    name in the module-level ``last_failure``.

    :param failure: Name of the failure type that occurred.
    """
    global last_failure

    types_of_failures[failure] = types_of_failures.get(failure, 0) + 1
    last_failure = failure


def _record_result(entry: dict, *, success: bool) -> None:
    """Append one processed text to ``ai_modified_texts`` and update the counters.

    Every outcome, successful or not, goes through here so that each processed
    text ends up as exactly one row in the dataframe and one tick on the
    progress bar.

    :param entry: Values for the ``text``, ``num_sentences``, ``sentences``,
        ``sentence_spans`` and ``span_ai_modified`` columns. List-valued cells
        must be wrapped in a one-element list so they land in a single row.
    :param success: Whether the text was modified successfully.
    """
    global ai_modified_texts, fails, successes

    result_row = pd.DataFrame(data={**entry, "success": success})
    ai_modified_texts = pd.concat([ai_modified_texts, result_row])

    if success:
        successes += 1
    else:
        fails += 1
    progress_bar.update()
    progress_bar.set_postfix(fails=fails, successes=successes)


for num_sentences in range(40, 60):
    # Grab the subset we want
    subset = to_be_ai_modified_texts[to_be_ai_modified_texts["num_sentences"] == num_sentences]

    # Split this into 5 equal distinct sets to be modified
    # Index 0 is to have 10% modified, index 1 is 20% and so on
    sample_sets = []
    num_per_sample = len(subset) // 5
    for _ in range(5):
        chosen = subset.sample(n=num_per_sample, random_state=42)
        sample_sets.append(chosen)
        # Remove the chosen texts so the next draw cannot pick them again
        subset = subset.drop(chosen.index)

    # Now we need to modify the texts
    # The first entry of sample_sets is the one to be 10% modified, the second 20% and so on
    for working_set, percentage_modification in zip(sample_sets, range(10, 60, 10)):
        # Turn the percentage into the number of sentences to sample
        num_sentences_to_modify = round(percentage_modification / 100 * num_sentences)

        for _, row in working_set.iterrows():
            iterations += 1

            # Select a percentage of the sentences to modify
            sentences_with_spans = list(zip(row["sentences"], row["sentence_spans"]))
            sampled_sentences_with_spans = random.sample(sentences_with_spans, k=num_sentences_to_modify)
            # Unzip the sampled pairs back into two parallel tuples
            sentences_to_modify, sentence_spans_to_modify = zip(*sampled_sentences_with_spans)

            # Create the prompt for ChatGPT:
            # the text, an end-of-text token, then every sentence followed by an end-of-sentence token
            prompt = (
                row["text"]
                + "\n<<42 END TEXT 42>>\n"
                + "\n<<42 END SENTENCE 42>>\n".join(sentences_to_modify)
                + "\n<<42 END SENTENCE 42>>"
            )

            # Send to ChatGPT
            # response_list stays None unless a usable answer was received
            response_list = None
            try:
                response = CLIENT.chat.completions.create(
                    model=MODEL,
                    messages=[
                        {"role": "system", "content": ROLE},
                        {"role": "user", "content": prompt}
                    ]
                )

                # Grab the content of the response
                # A missing content is parsed as "" so it is reported as a JSON error instead of a TypeError
                response_json_text = response.choices[0].message.content or ""
                parsed_response = json.loads(response_json_text)
            except openai.BadRequestError:
                add_failure("openai.BadRequestError")
            except json.JSONDecodeError:
                add_failure("json.JSONDecodeError")
            else:
                # Valid JSON is not enough, it has to be a list with one string per requested sentence
                if not isinstance(parsed_response, list) or not all(
                        isinstance(item, str) for item in parsed_response
                ):
                    add_failure("Response is not a list of strings")
                elif len(parsed_response) != len(sentences_to_modify):
                    add_failure("Mismatch in number of sentences in response")
                else:
                    response_list = parsed_response

            # Handling a failed request
            if response_list is None:
                # Leave in the old data
                _record_result({
                    "text": row["text"],
                    "num_sentences": row["num_sentences"],
                    "sentences": [row["sentences"]],
                    "sentence_spans": [row["sentence_spans"]],
                    "span_ai_modified": [[False for _ in range(row["num_sentences"])]],
                }, success=False)
                continue

            # Now we modify the original text to include the AI modified sentences
            # I wish to preserve the whitespace between the sentences
            # Hence why the following is unnecessarily complicated

            # Extract the text between the end of one span and the start of the next
            the_stuff_in_between_spans = [
                row["text"][previous_stop:next_start]
                for (_, previous_stop), (next_start, _) in zip(
                    row["sentence_spans"][:-1],
                    row["sentence_spans"][1:]
                )
            ]

            # Now create a new list of sentences, replacing the modified sentences
            # The responses come back in the order the sentences were sent
            replacement_for_span = dict(zip(sentence_spans_to_modify, response_list))
            new_sentences = []
            is_sentence_modified = []
            for old_sentence, sentence_span in zip(row["sentences"], row["sentence_spans"]):
                if sentence_span in replacement_for_span:
                    is_sentence_modified.append(True)
                    new_sentences.append(replacement_for_span[sentence_span])
                else:
                    is_sentence_modified.append(False)
                    new_sentences.append(old_sentence)

            # Finally we zip the new sentences together with the old whitespace
            # There is one gap less than there are sentences, so the last sentence is added separately
            modified_text = "".join(
                sentence + gap
                for sentence, gap in zip(new_sentences, the_stuff_in_between_spans)
            ) + new_sentences[-1]

            # After all that, we have received our modified text
            # Now regenerate the extra data
            modified_text_sentence_spans = list(tokenizer.span_tokenize(modified_text))
            modified_text_sentences = [
                modified_text[start:stop] for start, stop in modified_text_sentence_spans
            ]
            modified_text_num_sentences = len(modified_text_sentences)

            # The labels in span_ai_modified are only valid if tokenizing the modified text
            # gives back exactly the sentences it was built from.
            # The count is checked first, otherwise a changed count would always be
            # reported as the more general combining mismatch.
            if row["num_sentences"] != modified_text_num_sentences:
                tokenizing_failure = "Mismatch in number of sentences in modified text"
            elif new_sentences != modified_text_sentences:
                tokenizing_failure = "Mismatch from combining then tokenizing"
            else:
                tokenizing_failure = None

            if tokenizing_failure is not None:
                add_failure(tokenizing_failure)
                _record_result({
                    "text": row["text"],
                    "num_sentences": modified_text_num_sentences,
                    "sentences": [modified_text_sentences],
                    "sentence_spans": [modified_text_sentence_spans],
                    "span_ai_modified": [None],
                }, success=False)
                continue

            # And add to the dataframe
            _record_result({
                "text": modified_text,
                "num_sentences": modified_text_num_sentences,
                "sentences": [modified_text_sentences],
                "sentence_spans": [modified_text_sentence_spans],
                "span_ai_modified": [is_sentence_modified],
            }, success=True)

# Wrap up: close the bar and summarise how the run went.
progress_bar.close()

failure_rate = fails / len(to_be_ai_modified_texts)
print(f"Finished with {failure_rate:.2%} failed")
print(f"Iterations: {iterations}")

# Break the failures down by type
print("Types of failures:")
for failure_name, occurrences in types_of_failures.items():
    print(f"\t{failure_name}: {occurrences}")

print("Done")

Print the data

In [None]:
# Show the collected rows; this holds failed attempts as well (success == False)
ai_modified_texts

Finally, we save the data to files.  
I will only be saving those that were successfully modified.

In [None]:
save_dir = pathlib.Path("./data/prepared")

original_texts_dir = save_dir / "unmodified_texts"
ai_modified_texts_dir = save_dir / "ai_modified_texts"

for output_dir in (original_texts_dir, ai_modified_texts_dir):
    # Clear the directory if it exists
    if output_dir.exists():
        shutil.rmtree(output_dir)
    # Recreate it; parents=True so this also works when data/prepared does not exist yet
    output_dir.mkdir(parents=True)

# Save the original texts
for i, (_, row) in enumerate(unmodified_texts.iterrows()):
    # Make the data a little more usable
    sentences = nltk.sent_tokenize(row["text"])
    data = {
        "text": row["text"],
        "sentences": sentences,
        # Nothing in these texts was touched by the AI
        "ai_modified": [0 for _ in range(len(sentences))]
    }

    file = original_texts_dir / f"Text {i}.json"
    with file.open("w", encoding="utf-8") as f:
        json.dump(data, f)

# Save the AI modified texts
# Only the successfully modified ones are written, numbered consecutively from 0
successful_rows = (
    row for _, row in ai_modified_texts.iterrows() if row["success"]
)
for i, row in enumerate(successful_rows):
    data = {
        "text": row["text"],
        "sentences": row["sentences"],
        "ai_modified": row["span_ai_modified"]
    }

    file = ai_modified_texts_dir / f"Text {i}.json"
    with file.open("w", encoding="utf-8") as f:
        json.dump(data, f)

print("Done")