### Data Collection and Compilation

In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "all" configuration of the Hello-SimpleAI/HC3 dataset, take its
# "train" split, and materialise that split as a pandas DataFrame for the
# reshaping steps that follow.
dataset = load_dataset("Hello-SimpleAI/HC3", name="all")
train_data = dataset["train"]
df = train_data.to_pandas()

In [3]:
# Reshape the frame to long format: each original row becomes three rows
# (one per value column), tagged in "label" with the column it came from and
# carrying that column's content in "text".
data = pd.melt(
    df,
    id_vars=["id", "source"],
    value_vars=["question", "human_answers", "chatgpt_answers"],
    var_name="label",
    value_name="text",
)

# Build the question mask once and reuse it for both halves of the split.
is_question = data["label"] == "question"

# Question rows are set aside in their own frame ...
questions = data[is_question]

# ... and removed from `data`, leaving only the answer rows. Rebinding to a
# filtered copy (instead of drop(..., inplace=True)) keeps the same rows and
# index labels without mutating a frame in place.
data = data[~is_question].copy()

In [4]:
# Reduce the question rows to (id, question): discard the bookkeeping columns
# and give the generic "text" column its real name.
questions = (
    questions
    .drop(columns=["source", "label"])
    .rename(columns={"text": "question"})
)
questions.head(10)

Unnamed: 0,id,question
0,0,"Why is every book I hear about a "" NY Times # ..."
1,1,"If salt is so bad for cars , why do we use it ..."
2,2,Why do we still have SD TV channels when HD lo...
3,3,Why has nobody assassinated Kim Jong - un He i...
4,4,How was airplane technology able to advance so...
5,5,Why do humans have different colored eyes ? Wh...
6,6,Why I can not fabricate a religion that preven...
7,7,What has changed that we frequently now throw ...
8,8,magic the gathering What is it . how popular i...
9,9,What are prions and are they a big deal ? My A...


In [5]:
# The "source" column is not needed for the answer rows; keep id, label, text.
data = data.drop("source", axis="columns")
data.head(10)

Unnamed: 0,id,label,text
24322,0,human_answers,"[Basically there are many categories of "" Best..."
24323,1,human_answers,[salt is good for not dying in car crashes and...
24324,2,human_answers,[The way it works is that old TV stations got ...
24325,3,human_answers,[You ca n't just go around assassinating the l...
24326,4,human_answers,[Wanting to kill the shit out of Germans drive...
24327,5,human_answers,[Melanin ! Many of the the first known humans ...
24328,6,human_answers,[Because you 're a minor and your parents get ...
24329,7,human_answers,[It 's three fold : * Stuff is cheaper to mass...
24330,8,human_answers,"[EDIT , Nov 21 : By popular demand , now inclu..."
24331,9,human_answers,"[Like viruses , they are ( groups of ) molecul..."


In [6]:
# Separate the remaining long-format rows by which answer column they came from.
human_answers = data.loc[data["label"].eq("human_answers")]
chatgpt_answers = data.loc[data["label"].eq("chatgpt_answers")]


In [7]:
# Reduce the human rows to (id, human_answer): the "label" column is constant
# within this frame, and "text" gets a descriptive name.
human_answers = (
    human_answers
    .drop(columns=["label"])
    .rename(columns={"text": "human_answer"})
)
human_answers.head(10)

Unnamed: 0,id,human_answer
24322,0,"[Basically there are many categories of "" Best..."
24323,1,[salt is good for not dying in car crashes and...
24324,2,[The way it works is that old TV stations got ...
24325,3,[You ca n't just go around assassinating the l...
24326,4,[Wanting to kill the shit out of Germans drive...
24327,5,[Melanin ! Many of the the first known humans ...
24328,6,[Because you 're a minor and your parents get ...
24329,7,[It 's three fold : * Stuff is cheaper to mass...
24330,8,"[EDIT , Nov 21 : By popular demand , now inclu..."
24331,9,"[Like viruses , they are ( groups of ) molecul..."


In [8]:
# Reduce the ChatGPT rows to (id, chatgpt_answers): the "label" column is
# constant within this frame, and "text" gets a descriptive name.
# NOTE(review): this column is named in the plural ("chatgpt_answers") while
# the human counterpart is renamed to the singular "human_answer"; left as-is
# because the names end up in the exported file.
chatgpt_answers = (
    chatgpt_answers
    .drop(columns=["label"])
    .rename(columns={"text": "chatgpt_answers"})
)
chatgpt_answers.head(10)

Unnamed: 0,id,chatgpt_answers
48644,0,[There are many different best seller lists th...
48645,1,[Salt is used on roads to help melt ice and sn...
48646,2,[There are a few reasons why we still have SD ...
48647,3,[It is generally not acceptable or ethical to ...
48648,4,[After the Wright Brothers made the first powe...
48649,5,[The color of your eyes is determined by the a...
48650,6,[The First Amendment to the United States Cons...
48651,7,[There are a few different factors that can co...
48652,8,[Magic: The Gathering is a collectible card ga...
48653,9,[Prions are tiny particles that can cause seri...


In [9]:
# Join the three per-id frames into one wide table with the columns
# id, question, human_answer, chatgpt_answers.
#
# validate="one_to_one" makes pandas raise MergeError if an id occurs more
# than once on either side of a join. Without it, a repeated id would silently
# multiply rows (every question row paired with every matching answer row).
final_df = (
    questions
    .merge(human_answers, on="id", validate="one_to_one")
    .merge(chatgpt_answers, on="id", validate="one_to_one")
)

# Row count should equal the number of questions when ids are unique.
print(final_df.shape)
final_df.head(10)

(24322, 4)


Unnamed: 0,id,question,human_answer,chatgpt_answers
0,0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...
1,1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...
2,2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...
3,3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...
4,4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...
5,5,Why do humans have different colored eyes ? Wh...,[Melanin ! Many of the the first known humans ...,[The color of your eyes is determined by the a...
6,6,Why I can not fabricate a religion that preven...,[Because you 're a minor and your parents get ...,[The First Amendment to the United States Cons...
7,7,What has changed that we frequently now throw ...,[It 's three fold : * Stuff is cheaper to mass...,[There are a few different factors that can co...
8,8,magic the gathering What is it . how popular i...,"[EDIT , Nov 21 : By popular demand , now inclu...",[Magic: The Gathering is a collectible card ga...
9,9,What are prions and are they a big deal ? My A...,"[Like viruses , they are ( groups of ) molecul...",[Prions are tiny particles that can cause seri...


In [10]:
# Write the merged table to a CSV file in the current working directory.
# NOTE(review): to_csv defaults to index=True, so the row index is written as
# an extra unnamed first column; pass index=False if no consumer relies on it.
# NOTE(review): the answer columns appear to hold list-like values (see the
# bracketed previews above); CSV stores only their string form, so confirm
# how downstream code parses them back.
final_df.to_csv(path_or_buf="HC3_Data.csv")