In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import opendatasets as od

import os
import warnings

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, TextVectorization, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Seed every random number generator the notebook relies on. NumPy's global
# stream feeds pandas sampling; TensorFlow's global seed feeds weight
# initialisation, dropout masks and the shuffling done by `model.fit`.
SEED = 42
np.random.seed(SEED)

# Drop any Keras state left over from a previous run of this kernel, then seed
# TensorFlow so the seed applies to the fresh state.
tf.keras.backend.clear_session()
tf.random.set_seed(SEED)

print("GPUs Available: ", tf.config.list_physical_devices('GPU'))

GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Directories used by the notebook, relative to the notebook's working
# directory. Parents are listed before their children.
data_path_dict = {
    'data_path': '../data',
    'model_data': '../data/model_data',
    'data_subset': '../data/model_data/data_subset',
    'models': '../data/models'
}

# Create whichever directories are missing, reporting each one that had to be
# created.
for directory in data_path_dict.values():
    if not os.path.exists(directory):
        print(f'Path does not Exist: {directory}')

        # exist_ok=True closes the gap between the existence check above and
        # the creation here: if something else creates the directory in
        # between, this no longer raises FileExistsError.
        os.makedirs(directory, exist_ok=True)

In [3]:
# `od.download` unpacks the Kaggle dataset under its hyphenated slug; the rest
# of the notebook reads it from the underscore-named directory.
downloaded_dir = '../data/human-vs-llm-text-corpus'
dataset_dir = '../data/human_vs_llm_text_corpus'

if not os.path.exists(dataset_dir):
    # Download only when no earlier (possibly interrupted) run already fetched
    # the files ...
    if not os.path.exists(downloaded_dir):
        od.download(dataset_id_or_url="https://www.kaggle.com/datasets/starblasters8/human-vs-llm-text-corpus", data_dir='../data/')
    # ... but always finish the rename, so a download that exists under the
    # hyphenated name is not left where later cells cannot find it.
    os.rename(downloaded_dir, dataset_dir)

In [4]:
# File-system locations used by the following cells, all rooted at the shared
# data directory.
_DATA_ROOT = '../data'

# Parquet file holding the downloaded human-vs-LLM corpus.
raw_data_path = f'{_DATA_ROOT}/human_vs_llm_text_corpus/data.parquet'
# Directory (with trailing slash) for data subsets.
data_subset_path = f'{_DATA_ROOT}/model_data/data_subset/'

In [5]:
# Load the corpus and keep only the two sources this classifier separates.
raw_data = pd.read_parquet(raw_data_path)
raw_data = raw_data[raw_data['source'].isin(['Human', 'GPT-3.5'])]

# Balance the classes by drawing 26,000 rows from each source.
# `groupby(...).sample` is the built-in form of the per-group draw: it keeps the
# `source` column in the result without relying on `apply` handing the grouping
# column to the callback, and the explicit `random_state` gives the same rows
# every time the cell is run, however much of NumPy's global random stream has
# already been consumed.
raw_data = (
    raw_data
    .groupby('source')
    .sample(n=26000, random_state=42)
    .reset_index(drop=True)
)

# Binary target: 0 for human-written text, 1 for GPT-3.5 text.
raw_data['source'] = np.where(raw_data['source'] == 'Human', 0, 1)

# Hold out 2% of the rows for testing, then 2% of the remainder for validation,
# keeping the class ratio identical in every split.
X_train, X_test, y_train, y_test = train_test_split(raw_data['text'], raw_data['source'], test_size=0.02, stratify=raw_data['source'], random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.02, stratify=y_train, random_state=42)

# Convert the text Series to NumPy arrays (what `build_model_cnn` is annotated
# to take); the label Series are left as they are.
X_train = X_train.values
X_test = X_test.values
X_val = X_val.values

X_train.shape, X_test.shape, X_val.shape

((49940,), (1040,), (1020,))

In [6]:
def build_model_cnn(X_train: np.ndarray, sequence_length: int, max_features: int):
    """Assemble and compile the 1-D convolutional text classifier.

    A `TextVectorization` layer is adapted on the training texts and placed
    first in the model. It is followed by an embedding, five `Conv1D` layers
    with batch normalisation (plus dropout and max-pooling at selected points),
    and a dense head ending in a single sigmoid unit.

    Args:
        X_train: Training texts the vectorization layer is adapted on.
        sequence_length: Passed as `output_sequence_length` to the vectorizer
            and as `input_length` to the embedding.
        max_features: Passed as `max_tokens` to the vectorizer and as
            `input_dim` to the embedding.

    Returns:
        The `Sequential` model, compiled with the Adam optimizer, binary
        cross-entropy loss and the accuracy metric.
    """
    # The vectorizer is adapted before the model is assembled.
    text_to_ids = TextVectorization(max_tokens=max_features, output_sequence_length=sequence_length)
    text_to_ids.adapt(X_train)

    network = Sequential([
        text_to_ids,
        Embedding(input_dim=max_features, output_dim=1024, input_length=sequence_length),
        # Convolutional stack: filter count shrinks 2048 -> 1024 -> 512 -> 128 -> 64.
        Conv1D(filters=2048, kernel_size=4, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Conv1D(filters=1024, kernel_size=4, activation='relu'),
        BatchNormalization(),
        Conv1D(filters=512, kernel_size=4, activation='relu'),
        MaxPooling1D(pool_size=2),
        BatchNormalization(),
        Conv1D(filters=128, kernel_size=4, activation='relu'),
        BatchNormalization(),
        Conv1D(filters=64, kernel_size=4, activation='relu'),
        MaxPooling1D(pool_size=2),
        BatchNormalization(),
        Dropout(0.3),
        # Classification head.
        Flatten(),
        Dense(100, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return network

In [7]:
model = build_model_cnn(X_train=X_train, sequence_length=1000, max_features=5000)

# `summary()` prints the layer table itself and returns None, so wrapping it in
# `print` only appended a stray "None" line to the output.
model.summary()

# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint_callback = ModelCheckpoint(
    '../data/models/cnn_model_checkpoint/ai_detection_cnn_model_v1.ckpt',
    monitor='val_accuracy',
    verbose=0,
    save_best_only=True,
    mode='max',
    save_weights_only=True,
)
# Write per-epoch metrics to a CSV log, overwriting any log from a previous run.
logger_callback = CSVLogger('../data/models/ai_detection_model_v1.log', separator=',', append=False)

model_history = model.fit(
    X_train, y_train, epochs=10, verbose=1, batch_size=64, callbacks=[checkpoint_callback, logger_callback],
    validation_data=(X_val, y_val))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 1000)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 1000, 1024)        5120000   
                                                                 
 conv1d (Conv1D)             (None, 997, 2048)         8390656   
                                                                 
 batch_normalization (BatchN  (None, 997, 2048)        8192      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 997, 2048)         0         
                                                                 
 conv1d_1 (Conv1D)           (None, 994, 1024)         8

KeyboardInterrupt: 

In [None]:
# Rebuild the network with the same settings used for training, restore the
# checkpointed weights, and score the held-out test split.
checkpoint_path = '../data/models/cnn_model_checkpoint/ai_detection_cnn_model_v1.ckpt'

model = build_model_cnn(
    X_train=X_train,
    sequence_length=1000,
    max_features=5000,
)
model.load_weights(checkpoint_path)
model.evaluate(x=X_test, y=y_test)