## Importing Libraries


In [1]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier



## Distilroberta for prediction

In [2]:
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
import os
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from transformers import AutoTokenizer
model_checkpoint = "/kaggle/input/detect-llm-models/distilroberta-finetuned_v5/checkpoint-49654"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def preprocess_function(examples):
    return tokenizer(examples['text'], max_length = 512 , padding=True, truncation=True)
def tf_lower_and_split_punct(text):
    text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
    text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
    text = tf.strings.strip(text)
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Move your model and data to the GPU
model.to(device);
trainer = Trainer(
    model,
    tokenizer=tokenizer,
)
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
test_ds = Dataset.from_pandas(test)
test_ds_enc = test_ds.map(preprocess_function, batched=True)
test_preds = trainer.predict(test_ds_enc)
logits = test_preds.predictions
probs = (np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True))[:,0]
sub = pd.DataFrame()
sub['id'] = test['id']
sub['generated'] = probs
sub.to_csv('submission_old.csv', index=False)
sub.head()

  if _pandas_api.is_sparse(col):


  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Unnamed: 0,id,generated
0,0000aaaa,0.001225
1,1111bbbb,0.001146
2,2222cccc,0.001189


## Loading Data

In [3]:
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

In [4]:
train = train.drop_duplicates(subset=['text'])
train.reset_index(drop=True, inplace=True)

In [5]:
train['text']

0        Phones\n\nModern humans today are always on th...
1        This essay will explain if drivers should or s...
2        Driving while the use of cellular devices\n\nT...
3        Phones & Driving\n\nDrivers should not be able...
4        Cell Phone Operation While Driving\n\nThe abil...
                               ...                        
44863    Dear Senator,\n\nI am writing to you today to ...
44864    Dear Senator,\n\nI am writing to you today to ...
44865    Dear Senator,\n\nI am writing to you today to ...
44866    Dear Senator,\n\nI am writing to you today to ...
44867    Dear Senator,\n\nI am writing to you today to ...
Name: text, Length: 44868, dtype: object

## Levenshtein distance for error correction

In [6]:
from leven_search import LevenSearch, EditCost, EditCostConfig, GranularEditCostConfig
import pickle
from tqdm.auto import tqdm

with open('/kaggle/usr/lib/install_levenshtein_search_library/leven_search.pkl', 'rb') as file:
    lev_search = pickle.load(file)

In [7]:
from collections import defaultdict
import re

def fix_text(text):
    wrong_words = []
    correct_words = dict()
    word_list = re.findall(r'\b\w+\b|[.,\s]', text)

    for t in word_list:
        correct_word = t

        if len(t) > 2:
            result = lev_search.find_dist(t, max_distance=0)
            result = list(result.__dict__['words'].values())

            if len(result) == 0:
                result = lev_search.find_dist(t, max_distance=1)
                result = list(result.__dict__['words'].values())
                if len(result):
                    correct_word = result[0].word
                    wrong_words.append((t, result))

        correct_words[t] = correct_word

    dict_freq = defaultdict(lambda: 0)
    for wrong_word in wrong_words:
        _, result = wrong_word

        for res in result:
            updates = res.updates
            from_char = updates[0].l1
            to_char = updates[0].l2
            if from_char != "" or to_char != "":
                dict_freq[(from_char, to_char)] += 1

    if len(dict_freq):
        max_key = max(dict_freq, key=dict_freq.get)
        count = dict_freq[max_key]
    else:
        count = 0

    if count > 0.06 * len(text.split()):
        gec = GranularEditCostConfig(default_cost=10, edit_costs=[EditCost(max_key[0], max_key[1], 1)])

        for wrong_word in wrong_words:
            word, _ = wrong_word
            result = lev_search.find_dist(word, max_distance=9, edit_cost_config=gec)
            result = list(result.__dict__['words'].values())
            if len(result):
                correct_words[word] = result[0].word
            else:
                correct_word = word

    correct_sentence = []
    for t in word_list:
        correct_sentence.append(correct_words[t])

    return "".join(correct_sentence)

In [8]:
from concurrent.futures import ProcessPoolExecutor
import re
with ProcessPoolExecutor() as executor:
    train['text'] = list(executor.map(fix_text, train['text']))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
with ProcessPoolExecutor() as executor:
    test['text'] = list(executor.map(fix_text, test['text']))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## Removing Duplicate Rows

In [10]:
train = train.drop_duplicates(subset=['text'])
train.reset_index(drop=True, inplace=True)

## Configuration Parameters

In [11]:
LOWERCASE = False
VOCAB_SIZE = 1000000

## Byte-Pair Encoding Tokenizer Training

In [12]:
# Creating Byte-Pair Encoding tokenizer
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
dataset = Dataset.from_pandas(test[['text']])
def train_corp_iter(): 
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)






  if _pandas_api.is_sparse(col):


In [13]:

tokenized_texts_test = []

for text in tqdm(test['text'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

tokenized_texts_train = []

for text in tqdm(train['text'].tolist()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

## TF-IDF Vectorization

In [14]:
def dummy(text):
    return text
vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, analyzer = 'word',
    tokenizer = dummy,
    preprocessor = dummy,
    token_pattern = None, strip_accents='unicode')

vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

print(vocab)

vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, vocabulary=vocab,
                            analyzer = 'word',
                            tokenizer = dummy,
                            preprocessor = dummy,
                            token_pattern = None, strip_accents='unicode'
                            )

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġidd': 2, 'Ġccc Ġidd .': 7, 'ĠBbb Ġccc Ġidd .': 3, 'ĠCCC Ġidd Ġeee': 4, 'Ġidd Ġeee .': 8, 'ĠCCC Ġidd Ġeee .': 5}


23

In [15]:
y_train = train['label'].values

## Model Training and Prediction

In [16]:
if len(test.text.values) <= 5:
    sub.to_csv('submission_new.csv', index=False)
else:
    clf = MultinomialNB(alpha=0.0225)
    
    sgd_model = SGDClassifier(
        max_iter     = 9000, 
        tol          = 1e-4, 
        random_state = 6743,
        loss         = "modified_huber"
    ) 
    
    p={
        'verbose'          : -1,
        'n_iter'           : 3000,
        'colsample_bytree' : 0.7800,
        'colsample_bynode' : 0.8000, 
        'random_state'     : 6743,
        'metric'           : 'auc',
        'objective'        : 'cross_entropy',
        'learning_rate'    : 0.00581909898961407, 
      }
    lgb=LGBMClassifier(**p)
    
    
    cat = CatBoostClassifier(
        iterations        = 3000,
        verbose           = 0,
        subsample         = 0.35,
        random_seed       = 6543,
        allow_const_label = True,
        loss_function     = 'CrossEntropy',
        learning_rate     = 0.005599066836106983,
    )
    
    
    ensemble = VotingClassifier(
        estimators = [('mnb', clf),
                      ('sgd', sgd_model),
                      ('lgb', lgb), 
                      ('cat', cat)],
        weights    = [0.1, 0.31, 0.28, 0.67], 
        voting     = 'soft', 
        n_jobs     = -1
    )
    
    ensemble.fit(tf_train, y_train_label)
    gc.collect()
    
    final_preds = ensemble.predict_proba(tf_test)[:,1]
    sub['generated'] = final_preds
    sub.to_csv('submission_new.csv', index=False)
    sub

## Model Ensemble

In [17]:
submission1 = pd.read_csv('submission_old.csv')
submission2 = pd.read_csv('submission_new.csv')

In [18]:
submission = submission1.copy()
submission['generated'] = submission1['generated'].rank(pct=True)*0.25+submission2['generated'].rank(pct=True)*0.75
submission.to_csv('submission.csv', index=False)