In [1]:
import numpy as np
import pandas as pd
import pickle as pkl

import re
import unicodedata

In [2]:
def clean_essay(text):
    # normalize text
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    # remove citations
    m = re.search(r"\n(Work Cited|Works Cited)", text, flags=re.IGNORECASE)
    if m:
        text = text[: m.start()]
    return text.strip()

In [3]:
prompt_mapping = {
    1: "Phones and driving",
    2: "Car-free cities",
    3: "Summer projects",
    4: '"A Cowboy Who Rode the Waves"',
    5: "Mandatory extracurricular activities",
    6: "Exploring Venus",
    7: "Facial action coding system",
    8: "The Face on Mars",
    9: "Community service",
    10: "Grades for extracurricular activities",
    11: "Driverless cars",
    12: "Does the electoral college work?",
    13: "Cell phones at school",
    14: "Distance learning",
    15: "Seeking multiple opinions",
}

#### Import Datasets


In [4]:
# Persuade corpus
persuade = pd.read_csv("persuade_final_cleaned.csv")
persuade = persuade[["text", "generated", "prompt"]]

# DAIGT public train set
public_train = pd.read_csv("train_essays.csv")

# Feedback Prize OUTFOX dataset
feedback_train_human = pkl.load(
    open("../feedback_OUTFOX/common/train/train_humans.pkl", "rb")
)
feedback_test_human = pkl.load(
    open("../feedback_OUTFOX/common/test/test_humans.pkl", "rb")
)
feedback_valid_human = pkl.load(
    open("../feedback_OUTFOX/common/valid/valid_humans.pkl", "rb")
)
feedback_train_chatgpt = pkl.load(
    open("../feedback_OUTFOX/chatgpt/train/train_lms.pkl", "rb")
)
feedback_test_chatgpt = pkl.load(
    open("../feedback_OUTFOX/chatgpt/test/test_lms.pkl", "rb")
)
feedback_valid_chatgpt = pkl.load(
    open("../feedback_OUTFOX/chatgpt/valid/valid_lms.pkl", "rb")
)
feedback_human = pd.DataFrame(
    [*feedback_train_human, *feedback_test_human, *feedback_valid_human]
)
feedback_chatgpt = pd.DataFrame(
    [*feedback_train_chatgpt, *feedback_test_chatgpt, *feedback_valid_chatgpt]
)
feedback = pd.concat([feedback_human, feedback_chatgpt])
feedback.rename({0: "text"}, axis=1, inplace=True)
feedback["generated"] = np.concatenate(
    [np.zeros(len(feedback_human)), np.ones(len(feedback_chatgpt))]
)
feedback["text"] = feedback["text"].apply(clean_essay)
feedback.reset_index(drop=True, inplace=True)
feedback.to_csv("../feedback.csv", index=False)

# Claude Instant dataset
claude = pd.read_csv("../claude_instant.csv")
claude = pd.DataFrame(
    pd.concat(
        [
            claude["essay_text"],
            pd.Series(1, index=claude.index),
            claude["prompt_id"].apply(lambda x: prompt_mapping[x]),
        ],
        axis=1,
    )
)
claude.rename(
    {"essay_text": "text", 1: "generated", "prompt_id": "prompt"}, axis=1, inplace=True
)
claude["text"] = claude["text"].apply(clean_essay)

# Llama 70B and Falcon 180B dataset
llama_falcon = pd.read_csv("../llama_falcon/llama_falcon_v3.csv")
llama_falcon = llama_falcon[["text", "generated", "prompt_name"]]
llama_falcon.rename({"prompt_name": "prompt"}, axis=1, inplace=True)
llama_falcon["text"] = llama_falcon["text"].apply(clean_essay)

# Llama 13B
llama_a = pd.read_csv("../llama_13b/essays_a.csv")
llama_b = pd.read_csv("../llama_13b/essays_b.csv")
llama_13b = pd.concat([llama_a, llama_b])[["text", "generated", "prompt_name"]]
llama_13b.rename({"prompt_name": "prompt"}, axis=1, inplace=True)
llama_13b["prompt"] = llama_13b["prompt"].str.replace(
    "A Cowboy Who Rode the Waves", '"A Cowboy Who Rode the Waves"'
)
llama_13b["text"] = llama_13b["text"].apply(clean_essay)

In [5]:
# Combine all datasets
combined_df = pd.concat([persuade, claude, llama_falcon, llama_13b])
print(combined_df.shape)
print("\n", combined_df["prompt"].value_counts())
print("\n", combined_df["generated"].value_counts())

(35144, 4)

 prompt
Facial action coding system              3317
Does the electoral college work?         3203
Car-free cities                          3093
Driverless cars                          3035
Exploring Venus                          2995
The Face on Mars                         2726
"A Cowboy Who Rode the Waves"            2495
Distance learning                        2308
Summer projects                          1890
Mandatory extracurricular activities     1832
Cell phones at school                    1793
Grades for extracurricular activities    1767
Seeking multiple opinions                1699
Community service                        1669
Phones and driving                       1322
Name: count, dtype: int64

 generated
0.0    25644
1.0     8500
Name: count, dtype: int64


In [6]:
combined_df.to_csv("persuade_combined.csv", index=False)