In [76]:
import json

# Training split: read the HC3 finance subset, write the flattened answers file.
input_file, output_file = "HC3/finance.jsonl", "answers_train.json"

# Function to extract human and ChatGPT answers from each line of JSONL
def extract_data_from_line(line):
    """Split one JSONL record into labelled human and ChatGPT answers.

    Args:
        line: A single line of JSONL text holding an object with the keys
            "human_answers" and "chatgpt_answers".

    Returns:
        A ``(human_answers, chatgpt_answers)`` tuple. Each element is a list of
        ``{"text": ..., "label": ...}`` dicts, with label 0 for human answers
        and label 1 for ChatGPT answers.
    """
    record = json.loads(line)

    def _labelled(texts, label):
        # Wrap every raw answer together with its class label
        return [{"text": text, "label": label} for text in texts]

    return _labelled(record["human_answers"], 0), _labelled(record["chatgpt_answers"], 1)

# Function to create a new JSON file with only the answers
def create_new_json(input_file, output_file):
    """Flatten a JSONL file of Q&A records into one JSON list of labelled answers.

    Every non-blank line of ``input_file`` is passed to
    ``extract_data_from_line``. For each record the human answers are added
    first, then the ChatGPT answers, preserving the order of the file. The
    combined list is written to ``output_file`` as an indented JSON array.

    Args:
        input_file: Path to the source JSONL file.
        output_file: Path of the JSON file to write (opened in "w" mode, so an
            existing file is overwritten).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected answer fields.
    """
    new_data = []  # Accumulates every labelled answer across all records

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a stray newline at the end of the file)
            if not line.strip():
                continue
            human_answers, chatgpt_answers = extract_data_from_line(line)
            new_data.extend(human_answers)
            new_data.extend(chatgpt_answers)

    # Write the collected answers as a single JSON array
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(new_data, f, indent=4)

# Build the training answers file from the paths configured above.
create_new_json(input_file=input_file, output_file=output_file)


In [77]:
# Evaluation split: point the same path variables at the HC3 open_qa subset.
input_file, output_file = "HC3/open_qa.jsonl", "answers_test.json"

# Reuse the conversion routine to build the test answers file.
create_new_json(input_file=input_file, output_file=output_file)

In [78]:
# Importing the load_dataset function from the datasets library
from datasets import load_dataset

# Read both answer files with the 'json' loader; the data_files keys
# ('train' / 'test') name the splits of the resulting dataset.
dataset = load_dataset(
    "json",
    data_files={"train": "answers_train.json", "test": "answers_test.json"},
)

# Importing the AutoTokenizer class from the transformers library
from transformers import AutoTokenizer

# Load the tokenizer of the pre-trained DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased"
)


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [89]:
# Creating a preprocess_function 
def preprocess_function(data):
    """Tokenize the "text" field of a batch of examples.

    Args:
        data: Mapping that holds the raw answer text under the "text" key
            (a batch of examples when used with ``map(..., batched=True)``).

    Returns:
        The tokenizer's output for that text.
    """
    # truncation=True caps every sequence at the tokenizer's maximum model
    # input length; without it, very long answers are passed through at full
    # length and exceed what the model can accept.
    return tokenizer(data["text"], truncation=True)

# Tokenize the "text" field of every split. With batched=True,
# preprocess_function receives many examples per call instead of one.
tokenized_answers = dataset.map(function=preprocess_function, batched=True)


Map:   0%|          | 0/8436 [00:00<?, ? examples/s]

Map:   0%|          | 0/4748 [00:00<?, ? examples/s]

In [81]:
# Importing the DataCollatorWithPadding class from the transformers library
from transformers import DataCollatorWithPadding

# Collator that pads the sequences of a batch to a common length,
# using the tokenizer to do the padding.
data_collator = DataCollatorWithPadding(tokenizer)

In [82]:
# Importing the evaluate module
import evaluate

# Fetch the accuracy metric that compute_metrics uses to score predictions.
accuracy = evaluate.load(path="accuracy")


In [83]:
# Importing the numpy library as np
import numpy as np

# Defining a function compute_metrics 
def compute_metrics(eval_pred):
    """Score a batch of model outputs with the accuracy metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where ``predictions``
            holds one row of per-class scores for each example.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the column with the highest score in each row
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


In [84]:
# Class index -> display name for the two answer sources
id2label = {0: "HUMAN", 1: "CHATGPT"}

# Inverse lookup (display name -> class index), derived from id2label
label2id = {name: index for index, name in id2label.items()}


In [85]:
# Importing necessary classes from the transformers library
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pre-trained DistilBERT with a sequence-classification head for the two
# classes (HUMAN / CHATGPT); the label maps give the class indices readable names.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
# Importing classes from the transformers library
from transformers import TrainingArguments, Trainer

# Hyper-parameters and checkpointing policy for the fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints and evaluation results are written
    output_dir="My_first_model",
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save a checkpoint once per epoch, then reload the best one
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the tokenized splits and the helper objects into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # Fit on the 'train' split, evaluate on the 'test' split
    train_dataset=tokenized_answers["train"],
    eval_dataset=tokenized_answers["test"],
    # Batch preparation and metric reporting
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop
trainer.train()


  0%|          | 0/1056 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Upload the trainer's model to the Hugging Face Hub.
# NOTE(review): presumably requires prior Hub authentication — confirm before running.
trainer.push_to_hub()