In [None]:
import numpy as np
import re
import pandas as pd
import spacy
import string
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
import nltk

In [None]:
# Add an extra directory to NLTK's resource search path
# (presumably an attached Kaggle dataset holding pre-downloaded corpora).
nltk_data_dir = "/kaggle/input/nltk-data/nltk-data"
nltk.data.path.append(nltk_data_dir)

In [None]:
# Load the labelled training essays and take a first look at them.
train_essays_df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
display(train_essays_df)

# Class balance of the target column. Only the *last* expression of a cell is
# rendered automatically, so intermediate results must go through display().
display(train_essays_df['generated'].value_counts())

# Rows labelled as generated (shown as the cell's output).
train_essays_df[train_essays_df['generated']==1]

In [None]:
# Load the small English spaCy pipeline used for tokenisation and lemmatisation.
# Named-entity recognition is never used in this notebook, so it is disabled to
# avoid paying for it on every essay. The parser is deliberately left enabled so
# that the lemmas produced are unchanged.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# Filters consulted by clean_text(): spaCy's default English stop-word set and
# the ASCII punctuation characters from the standard library.
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation
print(stop_words)
print(punctuations)

In [None]:
def clean_text(text):
    """Normalise an essay into a space-separated string of lemmas.

    The text is lower-cased, stripped of surrounding whitespace and run through
    the module-level spaCy pipeline ``nlp``. Tokens whose text is in
    ``stop_words`` or that consist solely of characters from ``punctuations``
    are dropped; the lemma of every remaining token is kept.

    Args:
        text: Raw essay text. Non-string values (e.g. NaN from a missing CSV
            cell) are treated as an empty document.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    # Missing values arrive as float NaN from pandas and have no .lower().
    if not isinstance(text, str):
        return ""

    doc = nlp(text.lower().strip())
    tokens = [
        word.lemma_
        for word in doc
        if word.text not in stop_words
        # `punctuations` is a str, so a plain `in` would be a substring test
        # and let tokens such as "..." or "--" through; check per character.
        and not all(ch in punctuations for ch in word.text)
    ]
    return " ".join(tokens)


In [None]:
# Run every training essay through clean_text() and keep the result
# next to the raw text in a new column.
raw_train_texts = train_essays_df['text']
train_essays_df['cleaned_text'] = raw_train_texts.apply(clean_text)

In [None]:
# Model inputs: cleaned essay text as features, the `generated` flag as target.
X = train_essays_df['cleaned_text']
y = train_essays_df['generated']

# Unigram + bigram TF-IDF features, capped at 5000 terms; terms must appear in
# at least 3 documents and in at most 70% of them.
# NOTE(review): the vectorizer is fitted on all training rows, so the
# cross-validation folds used later share vocabulary/IDF statistics.
tfidf_settings = dict(
    max_features=5000,
    min_df=3,
    max_df=0.7,
    ngram_range=(1, 2),
)
tfidf = TfidfVectorizer(**tfidf_settings)
X_tfidf = tfidf.fit_transform(X)

In [None]:
# Shuffle features and labels together (same permutation for both) with a
# fixed seed so the row order is reproducible.
shuffled_pair = shuffle(X_tfidf, y, random_state=42)
X_tfidf_shuffled, y_shuffled = shuffled_pair

In [None]:
# probability=True is needed because the submission cell calls predict_proba.
# The probability calibration draws random numbers internally, so the estimator
# is seeded (same seed as the shuffle cell) to make results repeatable across
# kernel restarts.
model = SVC(probability=True, random_state=42)

# Small grid over kernel type and regularisation strength, scored with the
# estimator's default metric using 5-fold cross-validation.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
clf = GridSearchCV(model, parameters, cv=5)

# Fit and Tune Model
clf.fit(X_tfidf_shuffled, y_shuffled)
print("Best Parameters:", clf.best_params_)

In [None]:
# Re-estimate accuracy of the tuned configuration with stratified 5-fold CV.
skf = StratifiedKFold(n_splits=5)

# The tuned hyper-parameters are the same for every fold, so apply them once.
model.set_params(**clf.best_params_)

accuracies = []
for train_index, test_index in skf.split(X_tfidf_shuffled, y_shuffled):
    # Sparse matrix rows are selected positionally; the label Series needs
    # .iloc because its index was permuted by the earlier shuffle.
    X_train, X_test = X_tfidf_shuffled[train_index], X_tfidf_shuffled[test_index]
    y_train, y_test = y_shuffled.iloc[train_index], y_shuffled.iloc[test_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

print("Average accuracy:", np.mean(accuracies))

# After the loop `model` would only have seen the last fold's training split
# (4/5 of the data). Refit on the full training set so that the later
# model.predict_proba call for the submission uses every labelled essay.
model.fit(X_tfidf_shuffled, y_shuffled)

In [None]:
# Load the test essays and push them through the same cleaning step and the
# already-fitted TF-IDF vectorizer (transform only, no refit).
test_df = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
raw_test_texts = test_df['text']
test_df['cleaned_text'] = raw_test_texts.apply(clean_text)
cleaned_test_texts = test_df['cleaned_text']
test_tfidf = tfidf.transform(cleaned_test_texts)

In [None]:
# Start from the provided submission template and fill in the predictions.
submit = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

# predict_proba returns one column per class; take the second column.
# NOTE(review): this assumes the second column corresponds to generated == 1
# and that the template rows are in the same order as test_df - confirm.
class_probabilities = model.predict_proba(test_tfidf)
submit['generated'] = class_probabilities[:,1]

submit.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission table as a final sanity check.
submit.head(n=5)