# Credit
Inspired by VLADIMIR DEMIDOV's work : <br>
https://www.kaggle.com/code/yekenot/llm-detect-by-regression

# Importing library

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# importing files

In [2]:
train = pd.read_csv('/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv')
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

In [3]:
train.rename(columns = {'essay_id': 'id',
                        'label' : 'generated',
                        'prompt': 'prompt_id'}, inplace=True)
train['prompt_id'] = pd.factorize(train['prompt_id'])[0]

In [4]:
train = train[['id', 'prompt_id', 'text', 'generated']]
train

Unnamed: 0,id,prompt_id,text,generated
0,E897534557AF,0,"In recent years, technology has had a profoun...",1
1,DFBA34FFE11D,-1,Should students participate in an extracurricu...,0
2,af37ecf5,-1,The electoral college is a symbol of mockery a...,0
3,5EC2696BAD78,-1,This is why I think the principle should allow...,0
4,llama_70b_v1843,1,I strongly believe that meditation and mindful...,1
...,...,...,...,...
44201,F7341069C4A4,-1,"""Oh man I didn't make the soccer team!"", yelle...",0
44202,AFE6E553DAC2,-1,I believe that using this technology could be ...,0
44203,falcon_180b_v1_600,92,The Face on Mars is a fascinating phenomenon t...,1
44204,A5F84C104693,-1,Texting & Driving\n\nUsing your phone while dr...,0


# Logistic Regression

In [5]:
df = pd.concat([train['text'], test['text']], axis=0)

vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
X = vectorizer.fit_transform(df)

In [6]:
lr_model = LogisticRegression()
cv = StratifiedKFold(n_splits=5, shuffle=True)
auc_scores = []

# Split the data into training and validation for each fold
for train_idx, val_idx in cv.split(X[:train.shape[0]], train['generated']):
    X_train, X_val = X[:train.shape[0]][train_idx], X[:train.shape[0]][val_idx]
    y_train, y_val = train['generated'].iloc[train_idx], train['generated'].iloc[val_idx]

    # Train the model on the training data
    lr_model.fit(X_train, y_train)
    
    # Predict probabilities for the positive class on the validation data
    preds_val_lr = lr_model.predict_proba(X_val)[:, 1]
    
    # Calculate ROC AUC score for the validation set
    auc_score = roc_auc_score(y_val, preds_val_lr)
    auc_scores.append(auc_score)

# Print the scores for each fold
for i, score in enumerate(auc_scores, 1):
    print(f'ROC AUC for fold {i}: {score:.4f}')

print('Average ROC AUC:', round(sum(auc_scores)/len(auc_scores), 4))
print('Standard deviation:', round((sum([(x - sum(auc_scores)/len(auc_scores))**2 for x in auc_scores])/len(auc_scores))**0.5, 4))

ROC AUC for fold 1: 0.9975
ROC AUC for fold 2: 0.9974
ROC AUC for fold 3: 0.9979
ROC AUC for fold 4: 0.9975
ROC AUC for fold 5: 0.9975
Average ROC AUC: 0.9976
Standard deviation: 0.0002


# XGBoost

In [7]:
xgb_model = XGBClassifier()
cv = StratifiedKFold(n_splits=5, shuffle=True)
auc_scores = []

# Split the data into training and validation for each fold
for train_idx, val_idx in cv.split(X[:train.shape[0]], train['generated']):
    X_train, X_val = X[:train.shape[0]][train_idx], X[:train.shape[0]][val_idx]
    y_train, y_val = train['generated'].iloc[train_idx], train['generated'].iloc[val_idx]

    # Train the model on the training data
    xgb_model.fit(X_train, y_train)
    
    # Predict probabilities for the positive class on the validation data
    preds_val_xgb = xgb_model.predict_proba(X_val)[:, 1]
    
    # Calculate ROC AUC score for the validation set
    auc_score = roc_auc_score(y_val, preds_val_xgb)
    auc_scores.append(auc_score)

# Print the scores for each fold
for i, score in enumerate(auc_scores, 1):
    print(f'ROC AUC for fold {i}: {score:.4f}')

print('Average ROC AUC:', round(sum(auc_scores)/len(auc_scores), 4))
print('Standard deviation:', round((sum([(x - sum(auc_scores)/len(auc_scores))**2 for x in auc_scores])/len(auc_scores))**0.5, 4))

ROC AUC for fold 1: 0.9986
ROC AUC for fold 2: 0.9981
ROC AUC for fold 3: 0.9984
ROC AUC for fold 4: 0.9985
ROC AUC for fold 5: 0.9982
Average ROC AUC: 0.9984
Standard deviation: 0.0002


# final model (change the name of the model variable as needed)

In [8]:
# Create the ensemble model
ensemble = VotingClassifier(estimators=[('lr', lr_model), ('xgb', xgb_model)], voting='soft')

ensemble.fit(X_train, y_train)

# Predict on the validation set
y_pred = ensemble.predict(X_val)

# Print the classification report
print(classification_report(y_val, y_pred))

# Print the accuracy score
print(f'Accuracy: {roc_auc_score(y_val, y_pred)}\n')

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5959
           1       0.99      0.97      0.98      2882

    accuracy                           0.99      8841
   macro avg       0.99      0.98      0.98      8841
weighted avg       0.99      0.99      0.99      8841

Accuracy: 0.9813141069573382



In [9]:
preds_train = ensemble.predict_proba(X[:train.shape[0]])[:,1]
preds_test = ensemble.predict_proba(X[train.shape[0]:])[:,1]
print('ROC AUC train:', roc_auc_score(train['generated'], preds_train))

ROC AUC train: 0.9996478090562537


In [10]:
pd.DataFrame({'id':test["id"],'generated':preds_test}).to_csv('submission.csv', index=False)