<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
    Importing modules
    </div>

In [8]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import optuna

<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
    Loading data.
    </div>

In [None]:
train_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv', low_memory=True)
test_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
train_prompts = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv')


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
    Merging train essays and train prompts
    </div>

In [9]:
train_data = pd.merge(train_essays, train_prompts, on='prompt_id', how='left')

<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
    Splitting the data into training and validation sets
    </div>

In [10]:
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Feature engineering using TF-IDF    </div>

In [11]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])
X_val_tfidf = tfidf_vectorizer.transform(val_data['text'])

<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Defining optuna based hyperparam optimization for random forest    </div>

In [12]:
def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
    }

    model = RandomForestClassifier(**params, random_state=42)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(model, X_train_tfidf, train_data['generated'], cv=kfold, scoring='accuracy').mean()

    return score


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Defining optuna based hyperparam optimization for gradient boosting    </div>

In [13]:
def objective_gb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    }

    model = GradientBoostingClassifier(**params, random_state=42)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(model, X_train_tfidf, train_data['generated'], cv=kfold, scoring='accuracy').mean()

    return score

<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Defining optuna based hyperparam optimization for extra trees   </div>

In [14]:
def objective_et(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
    }

    model = ExtraTreesClassifier(**params, random_state=42)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(model, X_train_tfidf, train_data['generated'], cv=kfold, scoring='accuracy').mean()

    return score


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Defining and training base mdels    </div>

In [None]:
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=100)

study_gb = optuna.create_study(direction='maximize')
study_gb.optimize(objective_gb, n_trials=100)

study_et = optuna.create_study(direction='maximize')
study_et.optimize(objective_et, n_trials=100)

[I 2023-11-22 11:51:33,576] A new study created in memory with name: no-name-d1543f71-e035-4d4a-abf3-715ec21cd3ce
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  warn(
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  warn(
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype)

<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Fetching the best hyperparameters</div>

In [16]:
best_params_rf = study_rf.best_params
best_params_gb = study_gb.best_params
best_params_et = study_et.best_params


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Training base models on best params   </div>

In [17]:
best_rf_clf = RandomForestClassifier(**best_params_rf, random_state=42)
best_gb_clf = GradientBoostingClassifier(**best_params_gb, random_state=42)
best_et_clf = ExtraTreesClassifier(**best_params_et, random_state=42)


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Creating voting classifier with soft voting    </div>

In [18]:
soft_voting_clf = VotingClassifier(
    estimators=[
        ('rf', best_rf_clf),
        ('gb', best_gb_clf),
        ('et', best_et_clf),
    ],
    voting='soft'
)


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Creating stacking classifier with logistic regression as meta classifier   </div>

In [19]:
stacking_clf = StackingClassifier(
    estimators=[('rf', best_rf_clf), ('gb', best_gb_clf), ('et', best_et_clf)],
    final_estimator=LogisticRegression(),
    stack_method='auto',  
    n_jobs=-1,  


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Training soft voting and stacking classifier    </div>

In [20]:
soft_voting_clf.fit(X_train_tfidf, train_data['generated'])
stacking_clf.fit(X_train_tfidf, train_data['generated'])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Making predictions on the val set for soft voting</div>

In [21]:
val_predictions_soft = soft_voting_clf.predict(X_val_tfidf)


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Evaluating soft voting model    </div>

In [22]:
accuracy_soft = accuracy_score(val_data['generated'], val_predictions_soft)
print(f'Soft Voting Model Accuracy: {accuracy_soft:.2f}')


Soft Voting Model Accuracy: 1.00


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Making predictions on the val set for stacking model   </div>

In [23]:
val_predictions_stacking = stacking_clf.predict(X_val_tfidf)


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Evaluating stacking model    </div>

In [24]:
accuracy_stacking = accuracy_score(val_data['generated'], val_predictions_stacking)
print(f'Stacking Model Accuracy: {accuracy_stacking:.2f}')


Stacking Model Accuracy: 1.00


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Making predictions on the test set for soft voting    </div>

In [25]:
X_test_tfidf = tfidf_vectorizer.transform(test_essays['text'])
test_predictions_soft = soft_voting_clf.predict_proba(X_test_tfidf)[:, 1]


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Creating submission file for soft voting   </div>

In [26]:
submission_df_soft = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions_soft})
submission_df_soft.to_csv('submission_soft_voting.csv', index=False)


<div style="background-color:#F0E3D2; color:#19180F; font-size:15px; font-family:Verdana; padding:10px; border: 2px solid #19180F; border-radius:10px"> 
📌
Making preds on test set for stacking and generating submission file    </div>

In [27]:
test_predictions_stacking = stacking_clf.predict_proba(X_test_tfidf)[:, 1]

In [28]:
submission_df_stacking = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions_stacking})
submission_df_stacking.to_csv('submission.csv', index=False)


In [29]:
submission_df_soft

Unnamed: 0,id,generated
0,0000aaaa,0.00051
1,1111bbbb,0.00051
2,2222cccc,0.00051


In [6]:
submission_df_stacking


Unnamed: 0,id,generated
0,0000aaaa,0.001815
1,1111bbbb,0.001815
2,2222cccc,0.001815
