In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_03.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_02.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_01.csv


Trying a simple approach for text classification which is to convert text passages into vectors and then use standard ML algorithms such as logistic regression or tree-based models

Instead of using Deep learning methods we can use statistical methods like tf-idf + machine learning algorithms

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

This time will consider the LLM-Detect AI Generated Text as test data and daigt as train data

In [3]:
train = pd.read_csv('/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv')
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')


In [4]:
train.head()

Unnamed: 0,essay_id,text,label,source,prompt,fold
0,E897534557AF,"In recent years, technology has had a profoun...",1,mistral7binstruct_v2,\nTask: Write an essay discussing the positive...,1
1,DFBA34FFE11D,Should students participate in an extracurricu...,0,persuade_corpus,,2
2,af37ecf5,The electoral college is a symbol of mockery a...,0,train_essays,,5
3,5EC2696BAD78,This is why I think the principle should allow...,0,persuade_corpus,,8
4,llama_70b_v1843,I strongly believe that meditation and mindful...,1,llama_70b_v1,Some schools have implemented meditation and m...,0


In [5]:
print("train:",train.label.value_counts())
print("unique essays:",train['text'].nunique())

train: label
0    29792
1    14414
Name: count, dtype: int64
unique essays: 44155


In [6]:
train.rename(columns = {'essay_id': 'id',
                        'label' : 'generated',
                        'prompt': 'prompt_id'}, inplace=True)
train['prompt_id'] = pd.factorize(train['prompt_id'])[0]

In [7]:
train = train[['id', 'prompt_id', 'text', 'generated']]
train

Unnamed: 0,id,prompt_id,text,generated
0,E897534557AF,0,"In recent years, technology has had a profoun...",1
1,DFBA34FFE11D,-1,Should students participate in an extracurricu...,0
2,af37ecf5,-1,The electoral college is a symbol of mockery a...,0
3,5EC2696BAD78,-1,This is why I think the principle should allow...,0
4,llama_70b_v1843,1,I strongly believe that meditation and mindful...,1
...,...,...,...,...
44201,F7341069C4A4,-1,"""Oh man I didn't make the soccer team!"", yelle...",0
44202,AFE6E553DAC2,-1,I believe that using this technology could be ...,0
44203,falcon_180b_v1_600,92,The Face on Mars is a fascinating phenomenon t...,1
44204,A5F84C104693,-1,Texting & Driving\n\nUsing your phone while dr...,0


RDizzl3_seven is a boolean indicating whether the essays were written in response to one of the seven essay prompts for the competition.

We need prompts that correspond to the two prompts of the competition 'Car-Free Cities' 

The train and test files need to have the same columns for ease of training

In [8]:
train.head()

Unnamed: 0,id,prompt_id,text,generated
0,E897534557AF,0,"In recent years, technology has had a profoun...",1
1,DFBA34FFE11D,-1,Should students participate in an extracurricu...,0
2,af37ecf5,-1,The electoral college is a symbol of mockery a...,0
3,5EC2696BAD78,-1,This is why I think the principle should allow...,0
4,llama_70b_v1843,1,I strongly believe that meditation and mindful...,1


TF-IDF

In [9]:
df = pd.concat([train['text'], test['text']], axis=0)

vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
X = vectorizer.fit_transform(df)

Ensemble learning with Logistic Regression, XGBoost

XGBoost

In [10]:
xgb_model = XGBClassifier()
cv = StratifiedKFold(n_splits=5, shuffle=True)
auc_scores = []

# Split the data into training and validation for each fold
for train_idx, val_idx in cv.split(X[:train.shape[0]], train['generated']):
    X_train, X_val = X[:train.shape[0]][train_idx], X[:train.shape[0]][val_idx]
    y_train, y_val = train['generated'].iloc[train_idx], train['generated'].iloc[val_idx]

    # Train the model on the training data
    xgb_model.fit(X_train, y_train)
    
    # Predict probabilities for the positive class on the validation data
    preds_val_xgb = xgb_model.predict_proba(X_val)[:, 1]
    
    # Calculate ROC AUC score for the validation set
    auc_score = roc_auc_score(y_val, preds_val_xgb)
    auc_scores.append(auc_score)

# Print the scores for each fold
for i, score in enumerate(auc_scores, 1):
    print(f'ROC AUC for fold {i}: {score:.4f}')

print('Average ROC AUC:', round(sum(auc_scores)/len(auc_scores), 4))
print('Standard deviation:', round((sum([(x - sum(auc_scores)/len(auc_scores))**2 for x in auc_scores])/len(auc_scores))**0.5, 4))

ROC AUC for fold 1: 0.9980
ROC AUC for fold 2: 0.9985
ROC AUC for fold 3: 0.9984
ROC AUC for fold 4: 0.9981
ROC AUC for fold 5: 0.9984
Average ROC AUC: 0.9983
Standard deviation: 0.0002


Logistic Regression

In [11]:
lr_model = LogisticRegression()
cv = StratifiedKFold(n_splits=5, shuffle=True)
auc_scores = []

# Split the data into training and validation for each fold
for train_idx, val_idx in cv.split(X[:train.shape[0]], train['generated']):
    X_train, X_val = X[:train.shape[0]][train_idx], X[:train.shape[0]][val_idx]
    y_train, y_val = train['generated'].iloc[train_idx], train['generated'].iloc[val_idx]

    # Train the model on the training data
    lr_model.fit(X_train, y_train)
    
    # Predict probabilities for the positive class on the validation data
    preds_val_lr = lr_model.predict_proba(X_val)[:, 1]
    
    # Calculate ROC AUC score for the validation set
    auc_score = roc_auc_score(y_val, preds_val_lr)
    auc_scores.append(auc_score)

# Print the scores for each fold
for i, score in enumerate(auc_scores, 1):
    print(f'ROC AUC for fold {i}: {score:.4f}')

print('Average ROC AUC:', round(sum(auc_scores)/len(auc_scores), 4))
print('Standard deviation:', round((sum([(x - sum(auc_scores)/len(auc_scores))**2 for x in auc_scores])/len(auc_scores))**0.5, 4))

ROC AUC for fold 1: 0.9980
ROC AUC for fold 2: 0.9969
ROC AUC for fold 3: 0.9982
ROC AUC for fold 4: 0.9973
ROC AUC for fold 5: 0.9977
Average ROC AUC: 0.9976
Standard deviation: 0.0005


CatBoost Classifier

Learning rate for CatBoost is taken from here - https://www.kaggle.com/code/batprem/llm-daigt-cv-0-9983-lb-0-960?scriptVersionId=153835105

In [12]:
cat=CatBoostClassifier(iterations=1,
                           verbose=0,
                           random_seed=6543,
                           learning_rate=0.005599066836106983,
                           subsample = 0.35,
                           allow_const_label=True,loss_function = 'CrossEntropy')
for train_idx, val_idx in cv.split(X[:train.shape[0]], train['generated']):
    X_train, X_val = X[:train.shape[0]][train_idx], X[:train.shape[0]][val_idx]
    y_train, y_val = train['generated'].iloc[train_idx], train['generated'].iloc[val_idx]

    # Train the model on the training data
    cat.fit(X_train, y_train)
    
    # Predict probabilities for the positive class on the validation data
    preds_val_lr = cat.predict_proba(X_val)[:, 1]
    
    # Calculate ROC AUC score for the validation set
    auc_score = roc_auc_score(y_val, preds_val_lr)
    auc_scores.append(auc_score)

# Print the scores for each fold
for i, score in enumerate(auc_scores, 1):
    print(f'ROC AUC for fold {i}: {score:.4f}')

print('Average ROC AUC:', round(sum(auc_scores)/len(auc_scores), 4))
print('Standard deviation:', round((sum([(x - sum(auc_scores)/len(auc_scores))**2 for x in auc_scores])/len(auc_scores))**0.5, 4))

ROC AUC for fold 1: 0.9980
ROC AUC for fold 2: 0.9969
ROC AUC for fold 3: 0.9982
ROC AUC for fold 4: 0.9973
ROC AUC for fold 5: 0.9977
ROC AUC for fold 6: 0.8769
ROC AUC for fold 7: 0.8723
ROC AUC for fold 8: 0.8600
ROC AUC for fold 9: 0.8717
ROC AUC for fold 10: 0.8667
Average ROC AUC: 0.9336
Standard deviation: 0.0642


In [13]:
preds_train = cat.predict_proba(X[:train.shape[0]])[:,1]
preds_test = cat.predict_proba(X[train.shape[0]:])[:,1]
print('ROC AUC train:', roc_auc_score(train['generated'], preds_train))

ROC AUC train: 0.8715381433933801


In [14]:
# Create the ensemble model
ensemble = VotingClassifier(estimators=[('lr', lr_model), ('xgb', xgb_model), ('cat', cat)], voting='soft')

ensemble.fit(X_train, y_train)

# Predict on the validation set
y_pred = ensemble.predict(X_val)

# Print the classification report
print(classification_report(y_val, y_pred))

# Print the accuracy score
print(f'Accuracy: {roc_auc_score(y_val, y_pred)}\n')

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5959
           1       0.99      0.97      0.98      2882

    accuracy                           0.99      8841
   macro avg       0.99      0.99      0.99      8841
weighted avg       0.99      0.99      0.99      8841

Accuracy: 0.9854892365934744



In [15]:
preds_train = ensemble.predict_proba(X[:train.shape[0]])[:,1]
preds_test = ensemble.predict_proba(X[train.shape[0]:])[:,1]
print('ROC AUC train:', roc_auc_score(train['generated'], preds_train))

ROC AUC train: 0.9997814398785374


In [16]:
pd.DataFrame({'id':test["id"],'generated':preds_test}).to_csv('submission.csv', index=False)