In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
import matplotlib.pyplot as plt
from scipy.sparse import  hstack, csr_matrix, vstack
import random
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Attention, Input, Dense, concatenate, MaxPooling1D, Activation, Add, Flatten, Conv1D, Conv2D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import tensorflow_text as tf_text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Reshape , GlobalAveragePooling1D
from tensorflow.keras import Model, Input

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from transformers import PreTrainedTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
import sys
import gc

In [2]:
# Load the external DAIGT v2 training set and reduce it to the two columns used
# downstream (`text`, `generated`), built as one chain so no frame is mutated in place.
# The optional `train_extra[train_extra.RDizzl3_seven]` filter is intentionally disabled.
train_extra = (
    pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')
    # Exclude rows whose source is the competition's own 'train_essays'.
    .loc[lambda frame: frame['source'] != 'train_essays']
    .drop(columns=['source', 'RDizzl3_seven', 'prompt_name'])
    .rename(columns={'label': 'generated'})
    .reset_index(drop=True)
)
# Both names refer to the same frame, as before.
df_train = train_extra

In [3]:
# Submission template; its `generated` column is overwritten with predictions later on.
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
# Essays to score.
df_test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

In [4]:
# Preview the training frame (raw `text` plus the binary `generated` label) before cleaning.
df_train

Unnamed: 0,text,generated
0,Phones\n\nModern humans today are always on th...,0
1,This essay will explain if drivers should or s...,0
2,Driving while the use of cellular devices\n\nT...,0
3,Phones & Driving\n\nDrivers should not be able...,0
4,Cell Phone Operation While Driving\n\nThe abil...,0
...,...,...
43485,"Dear Senator,\n\nI am writing to you today to ...",1
43486,"Dear Senator,\n\nI am writing to you today to ...",1
43487,"Dear Senator,\n\nI am writing to you today to ...",1
43488,"Dear Senator,\n\nI am writing to you today to ...",1


In [5]:
# Number of training rows; used later to split the stacked train+test predictions.
train_size = len(df_train)
# Target vector for the final ensemble.
train_labels = df_train['generated'].to_numpy()

In [6]:
# Local directory of the pretrained tokenizer files passed to BertTokenizer.from_pretrained.
tok_path = '/kaggle/input/huggingface-bert-variants/bert-base-cased/bert-base-cased'

In [7]:
def clean_text(text):
    """Normalise an essay to lowercase words separated by single spaces.

    Punctuation is replaced by a space, digit runs are deleted, every run of
    whitespace (including newlines and carriage returns) is collapsed to one
    space, and the result is stripped and lowercased.

    Digits are removed *before* whitespace is collapsed. Doing it afterwards
    (as this function previously did) re-introduced double spaces
    ("have 3 cats" -> "have  cats") and leading/trailing blanks when a number
    sat at either end of the text.

    NOTE(review): only whitespace differs from the previous output, which
    presumably does not affect a whitespace-splitting tokenizer such as BERT's
    - confirm if a saved model depends on the exact old strings.

    Args:
        text: Raw essay text (str).

    Returns:
        The cleaned string; "" for empty or punctuation/digit-only input.
    """
    # \p{P} (any Unicode punctuation, apostrophes included) needs the `regex`
    # module, which this file imports under the name `re`.
    text = re.sub(r"\p{P}", " ", text)

    # Delete digit runs while the surrounding spaces can still be merged below.
    text = re.sub(r"\d+", "", text)

    # \s also matches "\n" and "\r", so no separate newline replacement is needed.
    text = re.sub(r"\s+", " ", text)

    return text.strip().lower()

# Normalise the essays of both frames; the test frame is processed first, then training.
for essays in (df_test, df_train):
    essays['text'] = essays['text'].apply(clean_text)

In [8]:
# Contraction -> expansion table (regex pattern -> replacement), applied in insertion order.
contractions = {
    r'\b(can\'t)\b': 'cannot',
    r'\b(don\'t)\b': 'do not',
    r'\b(won\'t)\b': 'will not',
}

# NOTE(review): clean_text has already replaced every punctuation character
# (apostrophes included) with a space and lowercased the text, so by the time
# this runs none of these patterns can match - confirm whether the expansion
# was meant to happen before cleaning.
for pattern, replacement in contractions.items():
    matcher = re.compile(pattern, flags=re.IGNORECASE)
    # Same order as before: test frame first, then the training frame.
    for frame in (df_test, df_train):
        frame['text'] = frame['text'].apply(lambda essay: matcher.sub(replacement, essay))

In [9]:
# Stack train texts followed by test texts; later code relies on this order to
# split the predictions at `train_size`.
text_data = pd.concat([df_train['text'], df_test['text']]).to_numpy()

In [10]:
# Load the BERT WordPiece tokenizer from the local files at `tok_path`.
# NOTE(review): the path names a *cased* checkpoint while do_lower_case=True is
# requested; presumably this mirrors how the saved model's inputs were built - confirm.
tokenizer = BertTokenizer.from_pretrained(tok_path, do_lower_case = True)

In [11]:
# Tokenise every essay (train rows first, then test rows), one token list per essay.
tokenized_texts = list(map(tokenizer.tokenize, text_data))

In [12]:
# Map tokens to vocabulary ids, then cut/pad every sequence at the end to exactly
# 512 ids so the whole corpus forms one rectangular integer array.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=512,
    dtype="long",
    truncating="post",
    padding="post",
)

In [13]:
# Drop the token lists, the raw text array and the BERT tokenizer; only `input_ids`
# is used from here on (the name `tokenizer` is rebound later to a BPE tokenizer).
del tokenized_texts, text_data, tokenizer

In [14]:
class TransformerBlock(tf.keras.layers.Layer):
    """Self-attention block: multi-head attention and a two-layer feed-forward
    network, each followed by dropout, a residual add and layer normalisation.

    Args:
        embed_dim: Size of the last axis of the input; also used as the
            attention `key_dim` and as the feed-forward output width so the
            residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Standard Keras `Layer` arguments (`name`, `dtype`, ...),
            forwarded to the base class so the layer can be rebuilt from a config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        # Sub-layer attribute names and creation order are unchanged.
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        Args:
            inputs: Sequence tensor whose last axis has size `embed_dim`
                (required by the residual additions).
            training: Forwarded to the dropout layers. Defaults to None so the
                layer can be called without it; Keras then decides the mode.
        """
        # Self-attention: the sequence serves as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so Keras can serialise and rebuild the layer."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [15]:
# Register the custom layer under its class name while the saved model is being loaded.
custom_layers = {'TransformerBlock': TransformerBlock}
with tf.keras.utils.custom_object_scope(custom_layers):
    model = tf.keras.models.load_model('/kaggle/input/inference-v2-8/8-eps-v2-train')

In [16]:
# Truncate the loaded network at the layer named 'concatenate': the resulting model
# maps the same inputs to that layer's activations, which serve as features later on.
layer_name = 'concatenate'
feature_layer = model.get_layer(layer_name)
Embedding_model = Model(inputs=model.input, outputs=feature_layer.output)

In [17]:
# Print the layer-by-layer structure of the truncated feature-extraction model.
Embedding_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 512)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 512, 64)              4800000   ['input_1[0][0]']             
                                                                                                  
 bidirectional (Bidirection  (None, 512, 64)              24832     ['embedding[0][0]']           
 al)                                                                                              
                                                                                                  
 transformer_block (Transfo  (None, 512, 64)              37664     ['bidirectional[0][0]']   

In [18]:
# Run the feature extractor over every padded sequence (train rows first, then test rows).
preds = Embedding_model.predict(input_ids)



In [19]:
# The first `train_size` rows belong to the training essays, the remainder to the test essays.
train_features, test_features = preds[:train_size], preds[train_size:]

In [20]:
# Drop the names that are no longer needed.
# NOTE(review): train_features/test_features were sliced from `preds` and
# Embedding_model was built from `model`'s layers, so this likely releases
# little memory while those objects are alive - verify if memory is tight.
del preds, model

In [21]:
# Whether the BPE tokenizer's normalizer should lowercase its input.
LOWERCASE = False
# Vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 50000

In [22]:
# Build a byte-level Byte-Pair Encoding tokenizer whose merges are learned from
# the test essays only, then tokenize both the test and the training essays with it.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# The conditional must be parenthesised: without the parentheses the expression
# parses as `([NFC] + [Lowercase]) if LOWERCASE else []`, which silently dropped
# NFC normalisation whenever LOWERCASE was False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
dataset = Dataset.from_pandas(df_test[['text']])


def train_corp_iter():
    """Yield the test essays in batches of 1000 texts for tokenizer training."""
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]


raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
# Wrap the trained tokenizer so it exposes the transformers `tokenize` API.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# One list of BPE tokens per essay, in the frames' row order.
tokenized_texts_test = [tokenizer.tokenize(text) for text in tqdm(df_test['text'].tolist())]
tokenized_texts_train = [tokenizer.tokenize(text) for text in tqdm(df_train['text'].tolist())]






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/43490 [00:00<?, ?it/s]

In [23]:
def dummy(text):
    """Identity hook: the documents are already token lists, so the vectorizer's
    preprocessing and tokenizing steps must pass them through untouched."""
    return text


# Settings shared by both vectorizers: word-level 3- to 5-grams over the
# pre-tokenized essays with sublinear term-frequency scaling.
tfidf_options = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit on the test essays only, to collect the n-grams that occur there.
vectorizer = TfidfVectorizer(**tfidf_options)
vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

# Pass 2: restrict the features to that vocabulary, fit the weights on the
# training essays, and transform both sets.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_options)
tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

8955

In [24]:
# Final design matrices: sparse TF-IDF columns followed by the neural-network
# features (converted to sparse so both parts can be stacked column-wise).
X_train = hstack((tf_train, csr_matrix(train_features)))
X_test = hstack((tf_test, csr_matrix(test_features)))

In [25]:
# The separate feature parts are no longer needed once X_train / X_test are built.
del tf_train, tf_test, train_features, test_features

In [26]:
# Train and predict only when the test set has more than 5 essays; with 5 or fewer
# (presumably the public placeholder file) the sample submission is written unchanged.
if len(df_test) > 5:
    clf = MultinomialNB(alpha=0.02)
    sgd_model = SGDClassifier(loss="modified_huber", max_iter=8000, tol=1e-4)

    # LightGBM hyper-parameters.
    p6 = dict(
        n_iter=3000,
        verbose=-1,
        objective='cross_entropy',
        metric='auc',
        learning_rate=0.00581909898961407,
        colsample_bytree=0.78,
        colsample_bynode=0.8,
        lambda_l1=4.562963348932286,
        lambda_l2=2.97485,
        min_data_in_leaf=115,
        max_depth=23,
        max_bin=898,
    )
    lgb = LGBMClassifier(**p6)

    cat = CatBoostClassifier(
        iterations=3000,
        verbose=0,
        l2_leaf_reg=6.6591278779517808,
        learning_rate=0.005599066836106983,
        subsample=0.4,
        allow_const_label=True,
        loss_function='CrossEntropy',
    )

    # Soft-voting weights, in the same order as the estimators below.
    weights = [0.07, 0.31, 0.31, 0.31]
    # NOTE(review): MultinomialNB requires non-negative features; X_train also
    # contains the neural-network activations - verify they cannot be negative.
    ensemble = VotingClassifier(
        estimators=[
            ('mnb', clf),
            ('sgd', sgd_model),
            ('lgb', lgb),
            ('cat', cat),
        ],
        weights=weights,
        voting='soft',
    )
    ensemble.fit(X_train, train_labels)

    # Probability of the positive class (`generated` == 1) for every test essay.
    preds = ensemble.predict_proba(X_test)[:, 1]
    sub['generated'] = preds
    gc.collect()

sub.to_csv('submission.csv', index=False)