In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
from sklearn.metrics.pairwise import linear_kernel
import matplotlib.pyplot as plt
from scipy.sparse import  vstack
import random
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Attention, Input, Dense, concatenate, MaxPooling1D, Activation, Add, Flatten, Conv1D, Conv2D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import tensorflow_text as tf_text

from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Reshape , GlobalAveragePooling2D
from tensorflow.keras import Model, Input


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [3]:
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
df_test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
train_extra = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

In [4]:
tok_path = '/kaggle/input/huggingface-bert-variants/bert-base-cased/bert-base-cased'

In [5]:
train_extra = train_extra[train_extra['source'] != 'train_essays']
train_extra = train_extra[train_extra.RDizzl3_seven]
train_extra.drop(columns=['source','RDizzl3_seven'],inplace=True)
train_extra.rename(columns={'label' : 'generated'}, inplace=True)
df_train = train_extra
df_train.reset_index(inplace=True,drop=True)
set(train_extra.prompt_name.tolist())

{'"A Cowboy Who Rode the Waves"',
 'Car-free cities',
 'Does the electoral college work?',
 'Driverless cars',
 'Exploring Venus',
 'Facial action coding system',
 'The Face on Mars'}

In [6]:
label_encoder = LabelEncoder()
df_train['prompt_name'] = label_encoder.fit_transform(df_train['prompt_name'])

In [7]:
df_train

Unnamed: 0,text,generated,prompt_name
0,Cars have been around for awhile and they have...,0,1
1,Have you ever thought what it would be like no...,0,1
2,What you are about to read is going to give yo...,0,1
3,cars have many flaws nd and in this day and ag...,0,1
4,There are many advantages of limiting car usag...,0,1
...,...,...,...
19067,"Dear Senator,\n\nI am writing to you today to ...",1,2
19068,"Dear Senator,\n\nI am writing to you today to ...",1,2
19069,"Dear Senator,\n\nI am writing to you today to ...",1,2
19070,"Dear Senator,\n\nI am writing to you today to ...",1,2


In [8]:
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("\nLabel Mapping:")
print(label_mapping)


Label Mapping:
{'"A Cowboy Who Rode the Waves"': 0, 'Car-free cities': 1, 'Does the electoral college work?': 2, 'Driverless cars': 3, 'Exploring Venus': 4, 'Facial action coding system': 5, 'The Face on Mars': 6}


In [9]:
train_size = df_train.shape[0]

In [10]:
def clean_text(text):
    # Replace actual newline and carriage return characters with whitespace
    text = text.replace("\n", " ")
    text = text.replace("\r", " ")
    
    # Drop punctuation
    text = re.sub(r"\p{P}", " ", text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text)
    
    # Remove leading and trailing whitespace
    text = text.strip()
    
    # Lower text
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    
    return text

df_train['text'] =  df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

In [11]:
# Change contractions
contractions = {
    r'\b(can\'t)\b': 'cannot',
    r'\b(don\'t)\b': 'do not',
    r'\b(won\'t)\b': 'will not',
}

# Iterate through contractions and apply replacements to the entire DataFrame column
for pattern, replacement in contractions.items():
    df_train['text'] =  df_train['text'].apply(lambda x: re.sub(pattern, replacement, x, flags=re.IGNORECASE))
    df_test['text'] =  df_test['text'].apply(lambda x: re.sub(pattern, replacement, x, flags=re.IGNORECASE))

In [12]:
text_data = pd.concat([df_train.text,df_test.text])
text_data

0        cars have been around for awhile and they have...
1        have you ever thought what it would be like no...
2        what you are about to read is going to give yo...
3        cars have many flaws nd and in this day and ag...
4        there are many advantages of limiting car usag...
                               ...                        
19070    dear senator i am writing to you today to expr...
19071    dear senator i am writing to you today to expr...
0                                              aaa bbb ccc
1                                              bbb ccc ddd
2                                              ccc ddd eee
Name: text, Length: 19075, dtype: object

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(3, 5), sublinear_tf=True)
tf_idf = vectorizer.fit_transform(text_data.values)

In [14]:
tf_idf_train = tf_idf[:train_size]
tf_idf_test = tf_idf[train_size:]

In [15]:
del tf_idf

In [16]:
def get_sorted_inds(x):
    
    inds = np.zeros((x.shape[0],2),dtype=int)
    for i in range(x.shape[0]):
        temp = x[i]
        if(temp[temp.argmax()] > 0.99):
            inds[i] = temp.argsort()[-3:-1]
        else : 
            inds[i] = temp.argsort()[-2:]
    return inds

In [17]:
def get_train_indices(matrix,df):
    df_indices = df.index
    tf_idf_matrix = vstack([row for idx, row in enumerate(matrix) if idx in df_indices])
    
    temp = df[df['generated'] == 1]
    flag_1 = temp.empty
    if(not flag_1):
        index_list = temp.index
        mask = [i in index_list for i in df_indices]
        tf_idf_1 = [row for idx, row in enumerate(tf_idf_matrix) if mask[idx]]
        tf_idf_1 = vstack(tf_idf_1)
    
    temp = df[df['generated'] == 0]
    flag_2 = temp.empty
    if(not flag_2):
        index_list = temp.index
        mask = [i in index_list for i in df_indices]
        tf_idf_0 = [row for idx, row in enumerate(tf_idf_matrix) if mask[idx]]
        tf_idf_0 = vstack(tf_idf_0)
    
    cs_1 = linear_kernel(tf_idf_matrix, tf_idf_1)
    cs_0 = linear_kernel(tf_idf_matrix, tf_idf_0)
    
    inds_1 = get_sorted_inds(cs_1)
    inds_0 = get_sorted_inds(cs_0)
    
    return inds_1,inds_0

In [18]:
def make_dataset(df,i1,i0):
    texts = df.text.values
    t1 = df[df['generated']==1].text.values
    t0 = df[df['generated']==0].text.values
    df_labels = df.generated.values
    labels = []
    text_1 = []
    text_2 = []
    for i in range(df.shape[0]):
        text_1.extend([texts[i],texts[i]])
#         print(t1[i1[i][0]])
        text_2.extend([t1[i1[i][0]],t1[i1[i][1]]])
        if(df_labels[i] == 1):
            labels.extend([1, 1])
        else:
            labels.extend([0, 0])
            
        text_1.extend([texts[i],texts[i]])
        text_2.extend([t0[i0[i][0]],t0[i0[i][1]]])
        if(df_labels[i] == 1):
            labels.extend([0, 0])
        else:
            labels.extend([1, 1])
    
    return text_1,text_2,labels

In [19]:
sim_data = pd.DataFrame({'text_1' : [] , 'text_2' : [] , 'similar' : []})

In [20]:
for i in range(7):
    print(i)
    i1,i0 = get_train_indices(tf_idf_train,df_train[df_train['prompt_name']==i])
    x,y,z = make_dataset(df_train[df_train['prompt_name']==i],i1,i0)
    temp = pd.DataFrame({'text_1' : x , 'text_2' : y , 'similar' : z})
    sim_data = pd.concat([sim_data,temp])

0
1
2
3
4
5
6


In [21]:
sim_data

Unnamed: 0,text_1,text_2,similar
0,luke the seagoing cowboy i would love to do th...,introduction have you ever wanted to go on an ...,0.0
1,luke the seagoing cowboy i would love to do th...,dear friends have you ever wanted to go on an ...,0.0
2,luke the seagoing cowboy i would love to do th...,is a seagoing cowboy a real thing people might...,1.0
3,luke the seagoing cowboy i would love to do th...,this persuasive essay is about why you should ...,1.0
4,people should join the seagoing cowboys progra...,i m glad you asked me to write an essay from t...,0.0
...,...,...,...
7567,i remember the first time i saw the face on ma...,some people think that the face on mars was cr...,0.0
7568,i remember the first time i saw the face on ma...,i remember the first time i saw the face on ma...,1.0
7569,i remember the first time i saw the face on ma...,hey listen up i m here to convince you that th...,1.0
7570,i remember the first time i saw the face on ma...,this article is mostly about nasa and it s dis...,0.0


In [22]:
tokenizer = BertTokenizer.from_pretrained(tok_path, do_lower_case = True)

In [None]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in text_data]

In [None]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=512, dtype="long", truncating="post", padding="post")

In [None]:
train_input_ids = input_ids[:train_size]
test_input_ids = input_ids[train_size:]

In [None]:
train_input_ids.shape

In [None]:
t1 = df_train.text.values
t1_ids = np.zeros((sim_data.shape[0],512),dtype=int)
i = 0
for ids in train_input_ids:
    for j in range(4):
        t1_ids[i+j] = ids
    i += 4

In [None]:
t1_ids.shape

In [None]:
t2_inds = []
t2_ids = np.zeros((sim_data.shape[0],512),dtype=int)
t2_texts = sim_data.text_2.values
for i in range(sim_data.shape[0]):
    idx = df_train[df_train['text'] == t2_texts[i]].index[0]
    t2_inds.append(idx*4)
    
    if(i % 1000 == 0):
        print(i)

In [None]:
train_array = np.zeros((2,sim_data.shape[0],512))

In [None]:
train_array[0] = t1_ids
train_array[1] = t2_ids

In [None]:
train_array.shape

In [None]:
del text_data , vectorizer , input_ids, t1_ids , t2_ids, t2_inds

In [None]:
def get_similar_data(tfidf,tfidf_test):
    
    cosine_similarities = linear_kernel(tfidf_test, tfidf)
    related_docs_indices = cosine_similarities.argsort()[:,-4:]

    
    return related_docs_indices

In [None]:
def get_test_indices(df,tfidf_train,tfidf_test):   
    
    temp = df[df['generated'] == 1]
    flag_1 = temp.empty
    if(not flag_1):
        index_list = temp.index
        mask = [i in index_list for i in df.index]
        tf_idf_1 = [row for idx, row in enumerate(tfidf_train) if mask[idx]]
        tf_idf_1 = vstack(tf_idf_1)
    
    temp = df[df['generated'] == 0]
    flag_2 = temp.empty
    if(not flag_2):
        index_list = temp.index
        mask = [i in index_list for i in df.index]
        tf_idf_0 = [row for idx, row in enumerate(tfidf_train) if mask[idx]]
        tf_idf_0 = vstack(tf_idf_0)
    
    cs_1 = linear_kernel(tfidf_test, tf_idf_1)
    cs_0 = linear_kernel(tfidf_test, tf_idf_0)
    
    return cs_1, cs_0

In [None]:
ids1,ids0 = get_test_indices(df_train,tf_idf_train,tf_idf_test) 

In [None]:
test_array = np.zeros((2,df_test.shape[0]*8,512))
def make_test_data(df,ids1,ids0):
    
    df_1 = df[df['generated'] == 1].index.tolist()
    df_0 = df[df['generated'] == 0].index.tolist()
    
    in_1 = train_input_ids[df_1]
    in_0 = train_input_ids[df_0]
    
    ids1 = ids1.argsort()[:,-4:]
    ids0 = ids0.argsort()[:,-4:]
    
    for i in range(df_test.shape[0]):
        temp = ids1[i]
        for j in range(4):
            test_array[0,j] = test_input_ids[i]
            test_array[1,j] = train_input_ids[temp[j]]
            
        temp = ids0[i]
        for j in range(4,8):
            test_array[0,j] = test_input_ids[i]
            test_array[1,j] = train_input_ids[temp[j-4]]

In [None]:
make_test_data(df_train,ids1,ids0)

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
max_features = 75000
embedding_dim = 64
sequence_length = 512

In [None]:
def createModel(embedding,bidirectional,transformer_block,conv_1d,globalmaxpool,inputs):
    x = embedding(inputs)
    x = bidirectional(x)
    x = transformer_block(x)
    x = conv_1d(x)
    x = globalmaxpool(x)
    model = Model(inputs= inputs, outputs=x)
    return model

In [None]:
def concat_model():
    
    inputs_0 = Input(shape=(sequence_length,), dtype="int64")
    inputs_1 = Input(shape=(sequence_length,), dtype="int64")
    
    embedding = Embedding(max_features, embedding_dim)
    bidirectional = Bidirectional(LSTM(32, return_sequences=True))
    transformer_block = TransformerBlock(embedding_dim, 2, 32)
    conv_1d = Conv1D(128, 3, padding="valid", activation="relu", strides=3)
    globalmaxpool = GlobalMaxPooling1D()

    model1 = createModel(embedding,bidirectional,transformer_block,conv_1d,globalmaxpool,inputs_0)
    model2 = createModel(embedding,bidirectional,transformer_block,conv_1d,globalmaxpool,inputs_1)

    combined = concatenate([model1.output, model2.output])
    
    combined_reshaped = Reshape((2,128, 1))(combined)
    
    conv2d_2 = Conv2D(128, (2, 3), activation="relu")(combined_reshaped)
    avg_pool_2 = GlobalAveragePooling2D()(conv2d_2)
    conv2d_4 = Conv2D(128, (2, 5), activation="relu")(combined_reshaped)
    avg_pool_4 = GlobalAveragePooling2D()(conv2d_4)
    conv2d_5 = Conv2D(128, (2, 7), activation="relu")(combined_reshaped)
    avg_pool_5 = GlobalAveragePooling2D()(conv2d_5)
    
    concatenated_avgpools = concatenate([avg_pool_2, avg_pool_4, avg_pool_5])

    dense = Dense(256, activation="relu")(concatenated_avgpools)
    dense_1 = Dense(128, activation="relu")(dense)
    dense_2 = Dense(32, activation="relu")(dense)

    output = Dense(1, activation="sigmoid")(dense)

    model = Model(inputs=[inputs_0, inputs_1], outputs=output)
    model.summary()
    adam = Adam(learning_rate=1e-5)
    model.compile(optimizer=adam, loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), metrics=['acc', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    return model

In [None]:
model = concat_model()

In [None]:
epochs = 5
model.fit(x = [train_array[0],train_array[1]],y = sim_data.similar.values,
          epochs=epochs, shuffle=True,batch_size = 64 )

In [None]:
preds = model.predict(test_array)

In [None]:
predictions = []
for i in range(0,preds.shape[0],8):
    
    p0 = preds[i:i+4].mean()
    p1 = preds[i+4 : i+8].mean()
    
    if(p0 > p1):
        predictions.append(p0)
    
    else :
        predictions.append(p1)

In [None]:
subs = pd.DataFrame({'id' : test.id, 'generated' : predictions})

In [None]:
subs.to_csv("submission.csv",index=False)