In [1]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import string
import spacy
from tensorflow.keras.optimizers import Adam


#test set
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
#submission format
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
#import data train
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train = pd.read_csv("/kaggle/input/datasetllm/train_v2_drcat_02.csv", sep=',')


2024-06-07 23:21:59.926092: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-07 23:21:59.926220: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-07 23:22:00.096595: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
nlp = spacy.load("en_core_web_sm")
def clean_essay_spacy(essay):
    # Remove punctuation and non-alphanumeric characters
    essay = ''.join([char for char in essay if char not in string.punctuation and not char.isdigit()])

    # Convert to lowercase and process with spaCy
    doc = nlp(essay.lower())

    # Lemmatization and removing stopwords
    lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove extra spaces
    cleaned_essay = ' '.join(lemmatized_words).strip()

    return cleaned_essay

In [3]:
# Apply the cleaning function and store the results directly
train['cleaned_essay'] = train['text'].apply(clean_essay_spacy)
test['cleaned_essay'] = test['text'].apply(clean_essay_spacy)
org_train['cleaned_essay'] = org_train['text'].apply(clean_essay_spacy)

# Initialize the tokenizer with an OOV token for unknown words
tokenizer = Tokenizer(oov_token='<OOV>')

# Prepare all texts from cleaned essays
all_texts = train['cleaned_essay'].tolist() + test['cleaned_essay'].tolist() + org_train['cleaned_essay'].tolist()
tokenizer.fit_on_texts(all_texts)

# Vocabulary size (1 is added for the OOV token)
vocabulary_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocabulary_size}")

Vocabulary size: 77605


In [4]:
# Convert texts to sequences of integers
train_sequences = tokenizer.texts_to_sequences(train['cleaned_essay'])
test_sequences = tokenizer.texts_to_sequences(test['cleaned_essay'])
org_train_sequences = tokenizer.texts_to_sequences(org_train['cleaned_essay'])

# Find the maximum length of sequences to set uniform input size
max_length = max(max(len(seq) for seq in train_sequences), max(len(seq) for seq in test_sequences), max(len(seq) for seq in org_train_sequences))

# Pad sequences to the same length
train_padded = pad_sequences(train_sequences, maxlen=max_length)
test_padded = pad_sequences(test_sequences, maxlen=max_length)
org_train_padded = pad_sequences(org_train_sequences, maxlen=max_length)


In [5]:
train_labels = train['label'].values

# Set up the model
model = Sequential([
    Embedding(input_dim=vocabulary_size, output_dim=100, input_length=max_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Conv1D(filters=64, kernel_size=5, activation='relu'),  # Second convolutional layer
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_padded, train_labels, epochs=5, batch_size=32, validation_split=0.1)                                



Epoch 1/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 235ms/step - accuracy: 0.9027 - loss: 0.2011 - val_accuracy: 0.8723 - val_loss: 0.3420
Epoch 2/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 235ms/step - accuracy: 0.9932 - loss: 0.0214 - val_accuracy: 0.9532 - val_loss: 0.1477
Epoch 3/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 232ms/step - accuracy: 0.9972 - loss: 0.0089 - val_accuracy: 0.9443 - val_loss: 0.1873
Epoch 4/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 232ms/step - accuracy: 0.9981 - loss: 0.0065 - val_accuracy: 0.7976 - val_loss: 0.9749
Epoch 5/5
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 232ms/step - accuracy: 0.9992 - loss: 0.0031 - val_accuracy: 0.8101 - val_loss: 0.8285


<keras.src.callbacks.history.History at 0x7afd3fe22110>

In [6]:
# Predict on the test set
test_probabilities = model.predict(test_padded)

# Assuming your test DataFrame has an 'id' column that you need to include in the submission
submission_df = pd.DataFrame({
    'id': test['id'],  # Replace 'id' with the appropriate column name for IDs in your test DataFrame
    'probability': test_probabilities.flatten()  # Flatten to convert predictions from 2D to 1D if necessary
})

# Save the DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

print("Submission saved to 'submission.csv'.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
Submission saved to 'submission.csv'.
