In [1]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import string
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization


2024-06-06 23:36:43.053275: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-06 23:36:43.053511: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-06 23:36:43.242850: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
#test set
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
#submission format
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
#import data train
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train = pd.read_csv("/kaggle/input/datasetllm/train_v2_drcat_02.csv", sep=',')

In [3]:
nlp = spacy.load("en_core_web_sm")
def clean_essay_spacy(essay):
    # Remove punctuation and non-alphanumeric characters
    essay = ''.join([char for char in essay if char not in string.punctuation and not char.isdigit()])

    # Convert to lowercase and process with spaCy
    doc = nlp(essay.lower())

    # Lemmatization and removing stopwords
    lemmatized_words = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove extra spaces
    cleaned_essay = ' '.join(lemmatized_words).strip()

    return cleaned_essay

In [4]:
cleaned_train_essays = []
cleaned_test_essays = []
cleaned_org_train_essays = []

# Apply the cleaning function and accumulate cleaned essays
train['cleaned_essay'] = train['text'].apply(clean_essay_spacy)
cleaned_train_essays.extend(train['cleaned_essay'].tolist())

test['cleaned_essay'] = test['text'].apply(clean_essay_spacy)
cleaned_test_essays.extend(test['cleaned_essay'].tolist())

org_train['cleaned_essay'] = org_train['text'].apply(clean_essay_spacy)
cleaned_org_train_essays.extend(org_train['cleaned_essay'].tolist())

# Build vocabulary of unique words from all cleaned essays
all_cleaned_essays = cleaned_train_essays + cleaned_test_essays + cleaned_org_train_essays
vocabulary = set(word for essay in all_cleaned_essays for word in essay.split())

# Print the size of the vocabulary
print(f"Vocabulary size: {len(vocabulary)}")

Vocabulary size: 77604


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(all_cleaned_essays)

# Now, if you need to transform the documents separately after fitting to the whole corpus
train_tfidf = tfidf_vectorizer.transform(cleaned_train_essays)
test_tfidf = tfidf_vectorizer.transform(cleaned_test_essays)
org_train_tfidf = tfidf_vectorizer.transform(cleaned_org_train_essays)

# Print the shape of the matrices
print("Train TF-IDF shape:", train_tfidf.shape)
print("Test TF-IDF shape:", test_tfidf.shape)
print("Original Train TF-IDF shape:", org_train_tfidf.shape)

Train TF-IDF shape: (44868, 77578)
Test TF-IDF shape: (3, 77578)
Original Train TF-IDF shape: (1378, 77578)


In [6]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam

# Assuming `train_tfidf` and `test_tfidf` are your training and testing TF-IDF matrices
# Use actual labels from your datasets
y_train = train['label'].values

# Build the model
model = Sequential([
    Input(shape=(train_tfidf.shape[1],)),  # Set input shape to the number of features in TF-IDF
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='tanh'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_tfidf, y_train, epochs=10, batch_size=32, validation_split=0.1)



Epoch 1/10
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 184ms/step - accuracy: 0.9368 - loss: 0.1573 - val_accuracy: 0.9002 - val_loss: 0.2523
Epoch 2/10
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 185ms/step - accuracy: 0.9846 - loss: 0.0484 - val_accuracy: 0.8320 - val_loss: 0.6028
Epoch 3/10
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 188ms/step - accuracy: 0.9928 - loss: 0.0245 - val_accuracy: 0.9445 - val_loss: 0.1641
Epoch 4/10
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m235s[0m 186ms/step - accuracy: 0.9918 - loss: 0.0265 - val_accuracy: 0.8830 - val_loss: 0.4357
Epoch 5/10
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 187ms/step - accuracy: 0.9936 - loss: 0.0204 - val_accuracy: 0.9505 - val_loss: 0.1861
Epoch 6/10
[1m1262/1262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 189ms/step - accuracy: 0.9931 - loss: 0.0228 - val_accuracy: 0.8819 - val_loss:

In [7]:
import pandas as pd

# Predict on the test set
test_predictions = model.predict(test_tfidf).flatten()

output_df = pd.DataFrame({
    'id': test['id'],  # Ensure this matches the column name for the IDs in your test DataFrame
    'probability': test_predictions  # This will be your model's predictions
})

# Save the updated DataFrame to a CSV file, ready for submission
output_df.to_csv('submission.csv', index=False)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218ms/step
