In [30]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
import keras
from keras import layers
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron

In [31]:
# Source of the training corpus: the Kaggle dataset
# "Augmented data for LLM - Detect AI Generated Text" by jdragonxherrera.
df = pd.read_csv(filepath_or_buffer='../Data/training.csv')

In [32]:
# Hyperparameters shared by the cells below.
max_features = 20_000   # vocabulary cap: TextVectorization max_tokens, Embedding input size, TF-IDF max_features
embedding_dim = 64      # width of each learned token embedding
sequence_length = 500   # output_sequence_length of the text vectoriser

In [33]:
# Shuffle every row with a fixed seed so the subsample below is repeatable.
shuffled_df = df.sample(frac=1, random_state=1818)

# Drop the first 40% of the shuffled rows and keep the remaining 60%.
# NOTE(review): this comment used to say 80%, but the code uses 0.4 —
# confirm which fraction is actually intended.
drop_fraction = 0.4
num_samples_to_remove = int(drop_fraction * len(shuffled_df))
remaining_df = shuffled_df.iloc[num_samples_to_remove:]

# 80/20 train/validation split of the retained rows, using the same seed
# as the shuffle.
train_df, val_df = train_test_split(remaining_df, train_size=0.8, random_state=1818)

def custom_standardization(input_data):
    """Normalise raw text before tokenisation.

    Lower-cases the input, replaces every literal ``<br />`` with a single
    space, and finally deletes all characters listed in
    ``string.punctuation``.

    Args:
        input_data: String tensor (or tensor-like) of raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

# Maps raw strings to integer token-id sequences.
vectorize_layer = keras.layers.TextVectorization(
    # Vocabulary is capped at `max_features` entries.
    max_tokens=max_features,
    # Text is cleaned by `custom_standardization` before it is split into tokens.
    standardize=custom_standardization,
    # Each example is emitted as `sequence_length` integer ids.
    output_sequence_length=sequence_length,
    output_mode="int",
)

# Learn the vocabulary from the training split only. Adapting on the full
# `df` would let validation rows (and the rows discarded during subsampling)
# influence the token set, leaking information into the validation metrics.
ds = tf.data.Dataset.from_tensor_slices(train_df['text'].values)
# Adapt in batches of 512 texts.
ds = ds.batch(512)
vectorize_layer.adapt(ds)

# Turn each split's text into a NumPy array of token ids, and pull out the
# matching label arrays. The training split is always processed first.
train_text, val_text = (
    np.array(vectorize_layer(split['text'])) for split in (train_df, val_df)
)

train_labels, val_labels = (
    np.array(split['label']) for split in (train_df, val_df)
)

In [None]:
# Seed Python, NumPy and the Keras backend so weight initialisation and
# dropout are repeatable between runs. 1818 matches the seed used for the
# data shuffle and train/validation split.
keras.utils.set_random_seed(1818)

# An integer input for vocab indices (variable-length sequences).
inputs = keras.Input(shape=(None,), dtype="int64")

# Map each token id to a dense `embedding_dim`-sized vector.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(64, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(64, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# Add a vanilla hidden layer:
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Output layer: a single sigmoid unit, i.e. one score in [0, 1] per input.
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

epochs = 3

# Fit on the training arrays and report metrics on the validation arrays
# after every epoch.
model.fit(train_text, train_labels, validation_data=(val_text, val_labels), epochs=epochs)

In [35]:
# TF-IDF features for the perceptron, capped at the same vocabulary size as
# the Keras vectoriser.
vectorizer = TfidfVectorizer(max_features=max_features)
# Fit the vocabulary and IDF weights on the training split only, so that
# validation rows and the rows discarded during subsampling do not leak into
# the features the perceptron is trained on.
vectorizer.fit(train_df['text'])


In [36]:
# Project the training texts into the fitted TF-IDF space.
train_documents = train_df['text']
train_vectors = vectorizer.transform(train_documents)

In [37]:
# Perceptron shuffles the training data each epoch by default, so pin its
# random_state to make the fit reproducible. 1818 is the seed already used
# for the data shuffle and train/validation split.
perceptron_model = Perceptron(max_iter=1000, random_state=1818)
perceptron_model.fit(train_vectors, train_labels)

In [38]:
# Load the combined essay set and normalise it to a boolean 'label' column
# and a string 'text' column.
combined_data = (
    pd.read_csv('../Data/Detect AI Generated Text/combined_essays.csv')
    # The id, prompt_id, text_len and model columns are not needed.
    .drop(columns=['id', 'prompt_id', 'text_len', 'model'])
    # 'generated' is renamed to 'label'.
    .rename(columns={'generated': 'label'})
    # Labels become booleans, essays become plain strings.
    .astype({'label': bool, 'text': str})
)

# Token-id matrix and label vector for the combined essays.
cd_labels = np.array(combined_data['label'])
cd_text = np.array(vectorize_layer(combined_data['text']))

In [9]:
class Classifier():
    """Two-model ensemble that averages a Keras model with a perceptron.

    ``predict`` feeds the same raw texts to both models, each through its own
    vectoriser, and returns the element-wise mean of the two outputs. The
    perceptron contributes the output of its ``predict`` method cast to
    float, while the Keras model contributes its flattened ``predict``
    output.

    Args:
        tensor_model: Model whose ``predict`` accepts the array produced by
            the text vectoriser and yields one value per input text.
        perceptron_model: Model whose ``predict`` accepts the TF-IDF matrix
            and yields one value per input text.
        text_vectorizer: Optional callable turning raw texts into the Keras
            model's input. When ``None`` (the default) the notebook-level
            ``vectorize_layer`` is looked up at prediction time.
        tfidf_vectorizer: Optional object with a ``transform`` method that
            turns raw texts into the perceptron's input. When ``None`` (the
            default) the notebook-level ``vectorizer`` is looked up at
            prediction time.
    """

    def __init__(self, tensor_model, perceptron_model,
                 text_vectorizer=None, tfidf_vectorizer=None):
        self.tensor_model = tensor_model
        self.perceptron_model = perceptron_model
        # Passing the vectorisers explicitly removes the hidden dependency on
        # notebook globals; None keeps the original global-lookup behaviour.
        self.text_vectorizer = text_vectorizer
        self.tfidf_vectorizer = tfidf_vectorizer

    def predict(self, X):
        """Return the averaged prediction of both models for each text in X.

        Args:
            X: Collection of raw text strings (e.g. a pandas Series).

        Returns:
            1-D NumPy float array with one averaged value per input text.
        """
        # Resolve the vectorisers late so that, when none were injected, the
        # current notebook globals are used exactly as before.
        text_vectorizer = (
            self.text_vectorizer if self.text_vectorizer is not None else vectorize_layer
        )
        tfidf_vectorizer = (
            self.tfidf_vectorizer if self.tfidf_vectorizer is not None else vectorizer
        )

        # Convert the dataset to the expected format for Keras model
        tensor_X = np.array(text_vectorizer(X))
        # Make Keras model predictions and collapse them to a 1-D array
        tensor_predictions = self.tensor_model.predict(tensor_X)
        tensor_predictions = np.array(tensor_predictions).flatten()

        # Convert the dataset to the expected format for perceptron model
        perceptron_X = tfidf_vectorizer.transform(X)
        # Make perceptron model predictions
        perceptron_predictions = self.perceptron_model.predict(perceptron_X)
        # Turn the perceptron predictions into a 1D array of floats
        perceptron_predictions = np.array(perceptron_predictions).flatten().astype(float)

        # Average the two prediction vectors element-wise
        predictions = np.mean([tensor_predictions, perceptron_predictions], axis=0)

        return predictions
    
# Ensemble built from the Keras model and the perceptron defined above.
classifier = Classifier(model, perceptron_model)

In [10]:
# Score the test essays with the ensemble.
test_essays = pd.read_csv('../Data/Detect AI Generated Text/test_essays.csv')
predictions = classifier.predict(test_essays['text'])

# Attach the scores to the frame, then export only the id and score columns.
test_essays['generated'] = predictions
submission = test_essays.loc[:, ['id', 'generated']]
submission.to_csv('../Data/Detect AI Generated Text/submission.csv', index=False)

