In [1]:
import pandas as pd
import os
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import nltk
nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to /home/rpole/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Embedding the dataset from the paper  **ChatGPT Generated Text Detection**

In [11]:
! git clone https://github.com/rexshijaku/chatgpt-generated-text-detection-corpus.git

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning into 'chatgpt-generated-text-detection-corpus'...
remote: Enumerating objects: 280, done.[K
remote: Counting objects: 100% (280/280), done.[K
remote: Compressing objects: 100% (280/280), done.[K
remote: Total 280 (delta 6), reused 258 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (280/280), 241.07 KiB | 8.31 MiB/s, done.
Resolving deltas: 100% (6/6), done.


Extract titles of the essays

In [2]:
with open('chatgpt-generated-text-detection-corpus/full_texts/questions.txt') as f:
    titles = [line.rstrip('\n') for line in f]

In [4]:
titles[:5]

['2) Why do you think people attend college or university? Use specific reasons and examples to support your answer.',
 '3) Do you agree or disagree with the following statement? Parents are the best teachers. Use specific reasons and examples to support your answer.',
 '5) It has been said, “Not everything that is learned is contained in books.” Compare and contrast knowledge gained from experience with knowledge gained from books. In your opinion, which source is more important? Why?',
 '6) A company has announced that it wishes to build a large factory near your community. Discuss the advantages and disadvantages of this new influence on your community. Do you support or oppose the factory? Explain your position.',
 '7) If you could change one important thing about your hometown, what would you change? Use reasons and specific examples to support your answer.']

Convert to dataframe while regex-ing the index

In [5]:
# extract indices from the beginning of each title
indices = [int(title.split(' ')[0][:-1]) for title in titles]
titles = [title.split(' ', 1)[1] for title in titles]

# create a dataframe
title_df = pd.DataFrame({'essay_id': indices, 'title': titles})

In [6]:
title_df.head()

Unnamed: 0,essay_id,title
0,2,Why do you think people attend college or univ...
1,3,Do you agree or disagree with the following st...
2,5,"It has been said, “Not everything that is lear..."
3,6,A company has announced that it wishes to buil...
4,7,If you could change one important thing about ...


In [7]:
for file in os.listdir('chatgpt-generated-text-detection-corpus/full_texts/human/'):
    with open(f'chatgpt-generated-text-detection-corpus/full_texts/human/{file}') as f:
        a = f.read()
    break

Extracting human written essays

In [8]:
def get_essay(author):
    """
    Function to get the essays of a given author.
    
    Parameters:
    author (str): the author of the essays
    
    Returns:
    file_ids (list): the ids of the essays
    essays (list): the essays of the author tokenized into sentences
    """
    file_ids = []
    essays = []
    for file in os.listdir(f'chatgpt-generated-text-detection-corpus/full_texts/{author}/'):
        with open(f'chatgpt-generated-text-detection-corpus/full_texts/{author}/{file}') as f:
            file_ids.append(int(file.split('.')[0]))
            sentences = nltk.tokenize.sent_tokenize(f.read().replace('\n', ' '))
            essays.append(sentences)
    
    return file_ids, essays

In [9]:
# get the essays of the human author
human_ids, human_essays = get_essay('human')
# get the essays of the chatgpt
machine_ids, machine_essays = get_essay('chatgpt')
# create the corresponding dataframes
human_df = pd.DataFrame({'essay_id': human_ids, 'sentence': human_essays, 'sentence_id': [[*range(len(essay))] for essay in human_essays]}).explode(['sentence', 'sentence_id'])
machine_df = pd.DataFrame({'essay_id': machine_ids, 'sentence': machine_essays, 'sentence_id': [[*range(len(essay))] for essay in machine_essays]}).explode(['sentence', 'sentence_id'])
# set the corresponding author columns
human_df['author'] = 'human'
machine_df['author'] = 'chatgpt'
# merge the two dataframes
df = pd.concat([human_df, machine_df]).reset_index(drop=True)
# set the embedding_id column based on the author, essay_id and sentence_id
df['embedding_id'] = df.groupby(['author', 'essay_id', 'sentence_id'], sort=False).ngroup()
# add the correspoding topic of the essay
df = df.merge(title_df, on='essay_id', how='left')

In [10]:
df.head()

Unnamed: 0,essay_id,sentence,sentence_id,author,embedding_id,title
0,70,This question of whether or not to give the st...,0,human,0,Many teachers assign homework to students ever...
1,70,"Of course, doing homework will scare our leisu...",1,human,1,Many teachers assign homework to students ever...
2,70,"So personally, I would prefer to do homework e...",2,human,2,Many teachers assign homework to students ever...
3,70,There are numerous reasons why I think that da...,3,human,3,Many teachers assign homework to students ever...
4,70,The main reason is daily homework can help stu...,4,human,4,Many teachers assign homework to students ever...


Little sanity check

In [11]:
print('Unique authors:', df['author'].unique())
print('Number of unique essay id:', len(df['essay_id'].unique()))
print('Number of essays per author: \n', df.groupby('author')['essay_id'].nunique())
print('Number of sentences:', len(df))
print('Number of titles:', len(df['title'].unique()))
print('Number of sentences per author: \n', df['author'].value_counts())

Unique authors: ['human' 'chatgpt']
Number of unique essay id: 126
Number of essays per author: 
 author
chatgpt    126
human      126
Name: essay_id, dtype: int64
Number of sentences: 4424
Number of titles: 126
Number of sentences per author: 
 author
human      2582
chatgpt    1842
Name: count, dtype: int64


In [13]:
# Save the dataframe
df.to_csv('essay_dataset.csv', index=False)

Sentences are embedded in the script ``get_embeddings.py``

In [2]:
# Load the obtained embeddings
sentence_embeddings = np.load('sentence_embeddings.npy')

In [4]:
sentence_embeddings.shape

(4424, 4096)