In [7]:
#if running on Google Colab, mounting the drive might be convenient
#(the mount point used here is /content/drive)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import urllib.request

#fetch all.jsonl from the HC3 dataset URL below and store it under a relative
#file name, i.e. in the current working directory
url = "https://huggingface.co/datasets/Hello-SimpleAI/HC3/resolve/main/all.jsonl"
file_name = "all.jsonl"

urllib.request.urlretrieve(url=url, filename=file_name)

print("File downloaded successfully.")


File downloaded successfully.


In [18]:
import os

#read all.jsonl from the current working directory;
#json_list ends up holding one raw (still undecoded) line of text per element
current_directory = os.getcwd()
file_path = os.path.join(current_directory, 'all.jsonl')

with open(file_path, mode='r', encoding='utf-8') as json_file:
    json_list = json_file.readlines()

In [21]:
#the data is extracted into two dictionaries keyed by a sequential number index:
#  data_ai    -> the record's 'chatgpt_answers' field (later saved as data_ai.json)
#  data_human -> the record's 'human_answers' field   (later saved as data_human.json)
#note that the dictionaries are 1-indexed
#e.g. data_human[1] is the human-written answer data of the first record and
#     data_ai[1] is the ChatGPT-written answer data of that same record
#NOTE(review): each value is stored exactly as found in the record; confirm whether
#it is a single string or a list of answers before feeding it to the feature extractor

import json

data_ai = {}
data_human = {}

#not used in this cell; kept in case another cell reads them
counter_ai = 0
counter_human = 0

counter = 0
for json_str in json_list:
    result = json.loads(json_str)
    counter = counter + 1
    #the original assignment was swapped (AI dict received the human answers and vice versa)
    data_ai[counter] = result['chatgpt_answers']
    data_human[counter] = result['human_answers']

In [None]:
#write both dictionaries to the working directory as data_ai.json and data_human.json
import json

for target_name, payload in (("data_ai.json", data_ai), ("data_human.json", data_human)):
    with open(target_name, "w") as write_file:
        json.dump(payload, write_file)

In [36]:
import json
import os

import pandas as pd

#this datapath is from Google drive mount, change as you need
data_path = "/content/drive"

#loading the data again from the saved file (not really needed if you run the whole thing in a single run)
#os.path.join inserts the path separator that plain string concatenation left out
#("/content/drive" + "data_ai.json" pointed at "/content/drivedata_ai.json")
#note: after a JSON round trip the dictionary keys are strings ("1", "2", ...), not ints
with open(os.path.join(data_path, "data_ai.json"), "r") as read_file:
  data_ai = json.load(read_file)

with open(os.path.join(data_path, "data_human.json"), "r") as read_file:
  data_human = json.load(read_file)

In [2]:
#packages required for lexical feature extraction
#(lexicalrichness, readability and nltk are imported by the next code cell)
!pip install lexicalrichness
!pip install readability
!pip install nltk

Installing collected packages: readability
Successfully installed readability-0.3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
#importing packages (you may need to install more libraries, as these were imported in a Google Colab environment)
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
import re
import statistics
import numpy as np
import string 

from lexicalrichness import LexicalRichness
import readability

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#special punctuation characters, passed separately to the special_punc_* functions
#(one single-character string per entry, 32 entries in total)
special_puncts = list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

In [5]:
#functions to extract stylometric characteristics from a piece of text
#to do - check whether word count, sentence count, and paragraph count are actually detrimental to the study or not
def word_count(document):
    """Return how many `word_tokenize` tokens of `document` contain an ASCII letter or digit.

    Tokens made up only of other characters (e.g. pure punctuation) are not counted.
    """
    has_alnum = re.compile('.*[A-Za-z0-9].*')
    return sum(1 for token in word_tokenize(document) if has_alnum.match(token))

def sentence_count(document):
    """Return how many `sent_tokenize` sentences of `document` contain an ASCII letter or digit.

    The filter uses `search` with a bare character class instead of `match`
    with '.*[A-Za-z0-9].*': `.` does not match a newline, so the anchored
    pattern rejects any string whose first line holds no letter or digit,
    even when later lines of the same string do.
    """
    has_alnum = re.compile('[A-Za-z0-9]')
    return sum(1 for sentence in sent_tokenize(document) if has_alnum.search(sentence))

def paragraph_count(document):
    """Return how many lines of `document` contain an ASCII letter or digit.

    A "paragraph" here is one line as produced by `str.splitlines()`; blank
    lines and lines without any letter or digit are not counted.
    """
    has_alnum = re.compile('.*[A-Za-z0-9].*')
    return sum(1 for line in document.splitlines() if has_alnum.match(line))

def word_count_sent(document):
    """Return (mean, sample standard deviation) of the word count per sentence.

    Sentences come from `sent_tokenize`; only sentences containing an ASCII
    letter or digit are considered, and each one is measured with `word_count`.
    Returns (0, 0) when no sentence qualifies; the standard deviation is 0
    when fewer than two sentences qualify.

    The filter uses `search` with a bare character class instead of `match`
    with '.*[A-Za-z0-9].*': `.` does not match a newline, so the anchored
    pattern rejects any string whose first line holds no letter or digit.
    """
    has_alnum = re.compile('[A-Za-z0-9]')  # must contain a letter or digit
    word_counts = [
        word_count(sentence)
        for sentence in sent_tokenize(document)
        if has_alnum.search(sentence)
    ]
    if not word_counts:
        return 0, 0
    mean = sum(word_counts) / len(word_counts)
    # statistics.stdev needs at least two data points
    stdev = statistics.stdev(word_counts) if len(word_counts) >= 2 else 0
    return mean, stdev

def word_count_para(document):
    """Return (mean, sample standard deviation) of the word count per paragraph.

    Paragraphs are the lines of `document` (via `splitlines`) that contain an
    ASCII letter or digit; each is measured with `word_count`. Returns (0, 0)
    when there is no such line; the deviation is 0 for a single paragraph.
    """
    has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    per_paragraph = [
        word_count(line) for line in document.splitlines() if has_alnum.match(line)
    ]
    if not per_paragraph:
        return 0, 0
    average = sum(per_paragraph) / len(per_paragraph)
    spread = statistics.stdev(per_paragraph) if len(per_paragraph) >= 2 else 0
    return average, spread


def sent_count_para(document):
    """Return (mean, sample standard deviation) of the sentence count per paragraph.

    Paragraphs are the lines of `document` (via `splitlines`) that contain an
    ASCII letter or digit; each is measured with `sentence_count`. Returns
    (0, 0) when there is no such line; the deviation is 0 for a single paragraph.
    """
    has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    per_paragraph = [
        sentence_count(line) for line in document.splitlines() if has_alnum.match(line)
    ]
    if not per_paragraph:
        return 0, 0
    average = sum(per_paragraph) / len(per_paragraph)
    spread = statistics.stdev(per_paragraph) if len(per_paragraph) >= 2 else 0
    return average, spread

#to do - check if total count is helpful to the classification or not
def total_punc_count(document):
    """Return the number of characters of `document` that are in `string.punctuation`."""
    return sum(1 for ch in document if ch in string.punctuation)

def special_punc_count(document, special_puncts):
    """Return the share of punctuation characters that are "special".

    Only characters in `string.punctuation` are considered; the result is the
    fraction of those that also appear in `special_puncts`. Returns 0 when
    `document` contains no punctuation at all.
    """
    punct_chars = [ch for ch in document if ch in string.punctuation]
    if not punct_chars:
        return 0
    special_hits = sum(1 for ch in punct_chars if ch in special_puncts)
    return float(special_hits) / len(punct_chars)

def special_punc_count_sent(document, special_puncts):
    """Return the average number of special punctuation characters per sentence.

    Sentences come from `sent_tokenize`; only sentences containing an ASCII
    letter or digit are considered. A character counts when it is in both
    `string.punctuation` and `special_puncts`. Returns 0 when no sentence
    qualifies.

    The filter uses `search` with a bare character class instead of `match`
    with '.*[A-Za-z0-9].*': `.` does not match a newline, so the anchored
    pattern rejects any string whose first line holds no letter or digit.
    """
    has_alnum = re.compile('[A-Za-z0-9]')  # must contain a letter or digit
    sentences = [s for s in sent_tokenize(document) if has_alnum.search(s)]
    if not sentences:
        return 0
    punct_count = sum(
        1
        for sentence in sentences
        for char in sentence
        if char in string.punctuation and char in special_puncts
    )
    return float(punct_count) / len(sentences)

def special_punc_count_para(document, special_puncts):
    """Return the average number of special punctuation characters per paragraph.

    Paragraphs are the lines of `document` (via `splitlines`) that contain an
    ASCII letter or digit. A character counts when it is in both
    `string.punctuation` and `special_puncts`. Returns 0 when there is no
    such line.
    """
    has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    paragraphs = [line for line in document.splitlines() if has_alnum.match(line)]
    if not paragraphs:
        return 0
    hits = sum(
        1
        for paragraph in paragraphs
        for ch in paragraph
        if ch in string.punctuation and ch in special_puncts
    )
    return float(hits) / len(paragraphs)

#relies on stylometric basics for readability and lexical richness
def readability_score(document):
    """Return (Kincaid, FleschReadingEase, ARI) readability grades for `document`.

    The values are read from the 'readability grades' section returned by
    `readability.getmeasures(document, lang='en')`. This is best-effort: if
    computing or reading the measures raises an error, (0, 0, 0) is returned.

    Only `Exception` is caught (the original used a bare `except:`), so
    KeyboardInterrupt and SystemExit still stop a long extraction run.
    """
    try:
        grades = readability.getmeasures(document, lang='en')['readability grades']
        return grades['Kincaid'], grades['FleschReadingEase'], grades['ARI']
    except Exception:
        return 0, 0, 0

def lexical_richness(document):
    """Return [MATTR, MTLD] lexical-richness measures for `document`.

    MATTR uses a window of 25 when the document has more than 45
    whitespace-separated words; otherwise the window is a third of the word
    count (at least 1, and never more than the word count itself).
    MTLD uses a threshold of 0.72.

    Both measures are best-effort: if either computation raises, 0 is stored
    in its place, so the returned list always has exactly two entries.
    (The original guarded only MATTR and carried two unused locals.)
    """
    lex = LexicalRichness(document)
    word_total = len(document.split())

    if word_total > 45:
        window_size = 25
    else:
        # min() caps the window at the word count (matters only for an empty document)
        window_size = min(max(1, word_total // 3), word_total)

    try:
        mattr_value = lex.mattr(window_size=window_size)
    except Exception:
        mattr_value = 0

    try:
        mtld_value = lex.mtld(threshold=0.72)
    except Exception:
        mtld_value = 0

    return [mattr_value, mtld_value]

Still to do: implement perplexity as described at https://huggingface.co/docs/transformers/perplexity
Perplexity is considered the most common metric, so it may be worth implementing as an additional feature.

In [6]:
#function call to call all the above feature extractors in a single go
#set verbose = True to understand what each number corresponds to
#to do - consider reporting words/sentence per sentence, and change network architecture to be able to process that
def extract_features(document, special_puncts, verbose=False):
    """Run every feature extractor on `document` and return the 18 values in a fixed order.

    Order of the returned values:
        word count, sentence count, paragraph count,
        words per sentence (mean, stdev), words per paragraph (mean, stdev),
        sentences per paragraph (mean, stdev),
        total punctuation count, special punctuation ratio,
        special punctuation per sentence, special punctuation per paragraph,
        Kincaid grade, Flesch reading ease, ARI, MATTR, MTLD.

    Args:
        document: the text to analyse.
        special_puncts: collection of punctuation characters treated as "special".
        verbose: when True, return a list of (value, description) pairs
            instead of the bare list of values.

    Returns:
        A list of feature values, or a list of (value, description) tuples
        when `verbose` is True.
    """
    results = [
        word_count(document),
        sentence_count(document),
        paragraph_count(document),
    ]

    # each of these returns a (mean, stdev) pair
    results.extend(word_count_sent(document))
    results.extend(word_count_para(document))
    results.extend(sent_count_para(document))

    results.append(total_punc_count(document))
    results.append(special_punc_count(document, special_puncts))
    results.append(special_punc_count_sent(document, special_puncts))
    results.append(special_punc_count_para(document, special_puncts))

    results.extend(readability_score(document))
    results.extend(lexical_richness(document))

    if verbose:
        # one description per entry of `results`, in the same order;
        # the two paragraph-level stdev labels previously carried a
        # "Word Count per Sentence" prefix by mistake
        verbose_results = [
            "Word Count: Number of words in the document",
            "Sentence Count: Number of sentences in the document",
            "Paragraph Count: Number of paragraphs in the document",
            "Word Count per Sentence: Average number of words per sentence",
            "Word Count per Sentence: Standard deviation of number of words per sentence",
            "Word Count per Paragraph: Average number of words per paragraph",
            "Word Count per Paragraph: Standard deviation of number of words per paragraph",
            "Sentence Count per Paragraph: Average number of sentences per paragraph",
            "Sentence Count per Paragraph: Standard deviation of number of sentences per paragraph",
            "Total Punctuation Count: Number of punctuation marks in the document",
            "Averaged Special Punctuation Count: Average number of special punctuation marks per total punctuation marks",
            "Averaged Special Punctuation Count per Sentence: Average number of special punctuation marks per sentence",
            "Averaged Special Punctuation Count per Paragraph: Average number of special punctuation marks per paragraph",
            "Flesch-Kincaid Reading Grade: Readability grade based on the Flesch-Kincaid formula",
            "Flesch Reading Ease Score: Readability score based on the Flesch Reading Ease formula",
            "Automated Readability Index: Readability score based on the Automated Readability Index formula",
            "Lexical Richness (MATTR): Measure of lexical richness based on the Moving-Average Type-Token Ratio",
            "Lexical Richness (MTLD): Measure of lexical richness based on the Measure of Textual Lexical Diversity",
        ]
        return list(zip(results, verbose_results))

    return results

In [47]:
#calling the extract_features per every entry and appending it to the 'features' array
#keeping a separate labels array, 1s for ai and 0s for human (this is the standard way for MLPClassifier module)

def _entry_text(entry):
    """Return the text of one dataset entry.

    A list/tuple entry contributes its first element; anything else is
    returned unchanged.
    NOTE(review): the original carried a commented-out `[0]`, which suggests
    entries may be lists of answers - confirm that using only the first
    answer is the intended behaviour.
    """
    if isinstance(entry, (list, tuple)):
        return entry[0]  # an empty list raises IndexError; the caller counts it as skipped
    return entry


def _append_features(dataset, label, features, labels):
    """Extract features for every entry of `dataset`, appending rows and `label`s in place.

    Entries shorter than 10 whitespace-separated words are ignored. Entries
    whose extraction raises are skipped (best-effort) and counted.
    Returns the number of entries skipped because of an error.
    """
    failed = 0
    # iterate over the values directly: this visits every entry (the original
    # range(1, len(...)) stopped one short of the last 1-indexed key) and works
    # for both int keys (same-session dicts) and str keys (dicts reloaded from JSON)
    for entry in dataset.values():
        try:
            text = _entry_text(entry)
            if len(text.split()) < 10:
                continue
            row = extract_features(text, special_puncts)
        except Exception:
            failed += 1
            continue
        features.append(row)
        labels.append(label)
    return failed


features = []
labels = []

failed_ai = _append_features(data_ai, 1, features, labels)
failed_human = _append_features(data_human, 0, features, labels)

#make silent failures visible instead of ending up with a mysteriously small dataset
if failed_ai or failed_human:
    print("Skipped entries due to errors - ai: {}, human: {}".format(failed_ai, failed_human))

In [48]:
#converting the collected labels and feature rows to numpy arrays, then saving the labels

labels = np.array(labels)
features = np.array(features)

#np.savetxt('/content/extracted_features.txt', features)
np.savetxt(fname='/content/extracted_labels.txt', X=labels)

In [51]:
#saving the min-max scaled features (this may or may not be a good idea considering that
#fitting the scaler on this dataset would limit the performance to the dataset)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(features)
normalized_features = scaler.transform(features)

np.savetxt(fname='/content/normalized_features.txt', X=normalized_features)