In [115]:
import numpy as np
import pandas as pd


df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
df.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


# Generative Model:
**An AI model that is trained to generate new data samples that tries to resemble given set of training samples. Genrative models are used for image generation, Natural Language Processing, data augmnetation and many more. 
Here, in Large Language Model System, we use the same principle to generate the text related to the training samples. The main downside of generative model is low variability distribution mean.
For this we exploit this low variability distribution mean and find out the difference between the AI generated text and student written text.
A low variance will have values grouped around the mean (e.g. a narrow bell shape), whereas a high vaiance will have values spread out from the norm.**


In [116]:
from scipy.stats import wasserstein_distance 

# Wasserstein Distance

**Here I need a probability distribution of words written which are first converted to word 2 vector representation then to a numpy array, with all the array sum to 0, then listing down the highest value of wassertein distance between them, and according to that formulating our context, whether the given thing is generated by LLM or written by a student.**
**Wasserstein distance (also known as Earth Mover's Distance or Wasserstein metric) is a measure of the distance between two probability distributions over a metric space. It's used to quantify the amount of "work" needed to transform one distribution into the other. In the context of probability distributions, it is common to compute the Wasserstein distance between empirical distributions (histograms) or probability density functions.**

**The Wasserstein distance is typically a non-negative real number. A higher Wasserstein distance indicates greater dissimilarity between the two distributions.**

**High Wasserstein Distance: If the Wasserstein distance between two distributions is high, it suggests that the two distributions are significantly different. There is more "work" required to transform one into the other.**

**Low Wasserstein Distance: If the Wasserstein distance is low, it suggests that the two distributions are more similar. The amount of "work" needed to transform one into the other is relatively small.**

In [117]:
df.value_counts('generated')

generated
0    1375
1       3
Name: count, dtype: int64

In [118]:
df.isna().sum()

id           0
prompt_id    0
text         0
generated    0
dtype: int64

# '0' generated by a student and '1' generated by a LLM

In [119]:
index_llm = df.loc[df['generated'] == 1].index.tolist()
print(index_llm)

[704, 740, 1262]


In [120]:
index_llm= [index for index in index_llm if index in df.index]

# Drop rows by index if the indices exist
df1 = df.drop(index_llm, errors='ignore')


In [121]:
df1.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [122]:
length = len(df1)
print(length)

1375


# Here we have very low samples for text generated by LLM, so rather than using some pre-trained model, let us dive deep into the way which generative model is tested from the real model

In [123]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [124]:
#from nltk.stem.porter import PorterStemmer

In [125]:
#ps = PorterStemmer()
list_essay = []
for i in range(length+3):
    tr = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    tr = tr.lower()
    tr = tr.split()
    tr = [word for word in tr if word not in stopwords.words('english')]
    tr = ' '.join(tr)
    list_essay.append(tr)

In [126]:
len(list_essay)

1378

In [127]:
list_essay[2]

'america love affair vehicles seems cooling says elisabeth rosenthal understand rosenthal perspective easier suggest america car usage decreasing slowly necessarily bad sense certain positive effects advantages limiting car usage includes increase security health along decrease pollution dependence firstly car usage limited security health likely guaranteed feeling secure highly important individuals everywhere example many people colombia used public transportation car free day leaving streets capital city according andrew selsky eerily devoid traffic jams complications stem traffic jams end feeling confidence plan get point b simple second ago complication personal plans leads become stressed feeling doubt overcomes thoughts car usage limited would control much traffic accumulates thus minimizing chance stress heidrun walter states car always tense much happier way car usage minimize conditions detrimental health also enlarges capacity exercise main purpose car get someone one place 

In [128]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [129]:
import gensim


In [130]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [131]:
# Tokenize each word in the list
#tokenized_words = [word_tokenize(sentence.lower()) for sentence in list_essay]

# Flatten the list of lists into a single list
#essay_llm_1 = [token for sublist in tokenized_words for token in sublist]
#print(tokenized_words)

In [132]:
essay_stu_1 = list_essay[0]
essay_llm_1 = list_essay[704]      #At 704 indexed list, the essay is written by LLM.
essay_llm_2 = list_essay[740]
essay_llm_3 = list_essay[1262]
essay_stu_2 = list_essay[1]
essay_stu_3 = list_essay[2]

In [133]:
list_essay = [str(item) for item in list_essay]


# Here, I take vector size as 1 in word2vec.

In [134]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Assuming list_essay is a list of sentences
tokenized_sentences = [word_tokenize(sentence) for sentence in list_essay]

# Define the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=1, window=5, min_count=1, workers=4)  # Adjust the parameters as needed

# Training the model
model.train(tokenized_sentences, total_examples=len(tokenized_sentences), epochs=10)

def tokenize_word_vec_1(essay):
    tokenized_words = word_tokenize(essay)  # Use the same tokenizer
    list_essay = []
    
    for word in tokenized_words:
        try:
            vector = model.wv[word]
            list_essay.append(vector)
        except KeyError:
            # Handle the case where the word is not in the vocabulary
            # You might want to ignore it, assign a special vector, or handle it differently based on your use case
            pass
    
    float_list = [float(arr[0]) for arr in list_essay]
    return float_list


In [135]:
vec_llm1 = tokenize_word_vec_1(essay_llm_1)
vec_llm2 = tokenize_word_vec_1(essay_llm_2)
vec_llm3 = tokenize_word_vec_1(essay_llm_3)

In [136]:

list_value_1 = []
list_value_2 = []
list_value_3 = []
distribution_p = np.array(vec_llm1) / np.sum(vec_llm1)
distribution_q = np.array(vec_llm2) / np.sum(vec_llm2)
distribution_r = np.array(vec_llm3) / np.sum(vec_llm3)


In [137]:
def apply_was_1(essay_stu_1, dist_p):
    list_1 = []
    vec_stu = tokenize_word_vec_1(essay_stu_1)
    dist_a = np.array(vec_stu) / np.sum(vec_stu)
    wass_val = wasserstein_distance(dist_a, dist_p)
    list_1.append(wass_val)
    return wass_val

In [138]:
len(list_essay)

1378

In [139]:
for i in range(len(list_essay)):
    value = apply_was_1(list_essay[i], distribution_p)
    list_value_1.append(value)

In [140]:
for i in range(len(list_essay)):
    value = apply_was_1(list_essay[i], distribution_q)
    list_value_2.append(value)

In [141]:
for i in range(len(list_essay)):
    value = apply_was_1(list_essay[i], distribution_r)
    list_value_3.append(value)

In [142]:
list_value_2[1262]

0.002121154764471281

In [143]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_value_2))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.8)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 80%):", sorted_values)
#print("Sorted Indices (Top 80%):", sorted_indices)
count = 0
index = [704, 1262]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

Victory


In [144]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_value_3))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.8)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 80%):", sorted_values)
#print("Sorted Indices (Top 80%):", sorted_indices)
count = 0
index = [740, 1262]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

740


In [145]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_value_1))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.8)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 80%):", sorted_values)
#print("Sorted Indices (Top 80%):", sorted_indices)
count = 0
index = [740, 1262]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

Victory


## Here, we take the vector size of word2vec as 100 and take mean as our word2vec representation.
# **We could study our difference from this in a distribution.**

In [146]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Assuming list_essay is a list of sentences
tokenized_sentences = [word_tokenize(sentence) for sentence in list_essay]

# Define the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)  # Adjust the parameters as needed

# Training the model
model.train(tokenized_sentences, total_examples=len(tokenized_sentences), epochs=10)

def mean_vector(words, model):
    vectors = [model.wv[word] for word in words if word in model.wv]
    
    if vectors:
        return sum(vectors) / len(vectors)
    else:
        # Handle the case where none of the words are in the vocabulary
        return None

def tokenize_word_vec(essay, model):
    tokenized_words = word_tokenize(essay)
    mean_vec = mean_vector(tokenized_words, model)

    if mean_vec is not None:
        return mean_vec.tolist()  # Convert NumPy array to a Python list
    else:
        return None

# Example usage:
essay = "Your essay text here."
result = tokenize_word_vec(essay, model)

if result is not None:
    print("Mean Vector:", result)
else:
    print("None of the words are in the vocabulary.")


Mean Vector: [-0.1818123161792755, 0.19892928004264832, -0.0527321919798851, -0.1280875951051712, 0.22281667590141296, 0.3986586630344391, -0.2977621853351593, 0.7825870513916016, 0.22386297583580017, 0.3402906656265259, 0.010022010654211044, -0.5425267815589905, -0.10606946051120758, 0.008115455508232117, -0.22401419281959534, 0.2486446499824524, -0.029243454337120056, -0.22155436873435974, 0.029437944293022156, -0.5022236108779907, -0.05768073350191116, -0.23844251036643982, -0.4593018889427185, -0.46067485213279724, 0.03546224534511566, 0.10084310919046402, 0.13307642936706543, -0.3563069701194763, -0.25381070375442505, -0.04381021112203598, 0.0629967749118805, 0.15012551844120026, -0.11343321949243546, -0.2970835566520691, -0.12153058499097824, 0.015092439949512482, -0.1319258064031601, -0.26318755745887756, -0.028551727533340454, -0.7739123106002808, 0.14890047907829285, -0.4444156885147095, -0.6161304712295532, 0.2778577506542206, 0.17100687325000763, -0.15806074440479279, -0.503

In [147]:
vec_llm1 = tokenize_word_vec(essay_llm_1, model)
vec_llm2 = tokenize_word_vec(essay_llm_2, model)
vec_llm3 = tokenize_word_vec(essay_llm_3, model)

In [148]:

list_value_1 = []
list_value_2 = []
list_value_3 = []
distribution_p = np.array(vec_llm1) / np.sum(vec_llm1)
distribution_q = np.array(vec_llm2) / np.sum(vec_llm2)
distribution_r = np.array(vec_llm3) / np.sum(vec_llm3)

In [149]:
def apply_was(essay_stu_1, dist_p, model):
    list_1 = []
    vec_stu = tokenize_word_vec(essay_stu_1, model)
    dist_a = np.array(vec_stu) / np.sum(vec_stu)
    wass_val = wasserstein_distance(dist_a, dist_p)
    list_1.append(wass_val)
    return wass_val

In [150]:
list_val_mean_1 = []
list_val_mean_2 = []
list_val_mean_3 = []

In [151]:
for i in range(len(list_essay)):
    value = apply_was(list_essay[i], distribution_p, model)
    list_val_mean_1.append(value)

In [152]:
for i in range(len(list_essay)):
    value = apply_was(list_essay[i], distribution_q, model)
    list_val_mean_2.append(value)

In [153]:
for i in range(len(list_essay)):
    value = apply_was(list_essay[i], distribution_r, model)
    list_val_mean_3.append(value)

In [154]:
list_val_mean_2[1262]

0.13209492422537786

In [155]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_val_mean_1))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.8)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 80%):", sorted_values)
#print("Sorted Indices (Top 80%):", sorted_indices)
count = 0
index = [740, 1262]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

1262


In [156]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_val_mean_2))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.8)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 80%):", sorted_values)
#print("Sorted Indices (Top 80%):", sorted_indices)
count = 0
index = [704, 1262]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

1262


In [157]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_val_mean_3))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.8)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 80%):", sorted_values)
#print("Sorted Indices (Top 80%):", sorted_indices)
count = 0
index = [740, 704]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

704


## Let's do it for One hot Encoder

In [158]:
from sklearn.preprocessing import MultiLabelBinarizer

In [160]:
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(list_essay)

In [162]:
vec_llm1 = one_hot_encoded[704]
vec_llm2 = one_hot_encoded[740]
vec_llm3 = one_hot_encoded[1262]

In [163]:
list_value_1 = []
list_value_2 = []
list_value_3 = []
distribution_p = np.array(vec_llm1) / np.sum(vec_llm1)
distribution_q = np.array(vec_llm2) / np.sum(vec_llm2)
distribution_r = np.array(vec_llm3) / np.sum(vec_llm3)

In [170]:
def apply_was_hot(essay_stu_1, dist_p):
    
    dist_a = np.array(essay_stu_1) / np.sum(essay_stu_1)
    wass_val = wasserstein_distance(dist_a, dist_p)
    
    return wass_val

In [171]:
list_one_hot_1 = []
list_one_hot_2 = []
list_one_hot_3 = []

In [173]:
for i in range(len(list_essay)):
    value = apply_was_hot(one_hot_encoded[i], distribution_p)
    list_one_hot_1.append(value)

In [175]:
for i in range(len(list_essay)):
    value = apply_was_hot(one_hot_encoded[i], distribution_p)
    list_one_hot_2.append(value)

In [176]:
for i in range(len(list_essay)):
    value = apply_was_hot(one_hot_encoded[i], distribution_p)
    list_one_hot_3.append(value)

In [186]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_one_hot_1))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.8)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 90%):", sorted_values)
#print("Sorted Indices (Top 90%):", sorted_indices)
count = 0
index = [740, 1262]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

740


In [187]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_one_hot_2))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.9)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 90%):", sorted_values)
#print("Sorted Indices (Top 90%):", sorted_indices)
count = 0
index = [704, 1262]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

Victory


In [188]:


# Create a list of tuples containing (index, value)
indexed_values = list(enumerate(list_one_hot_3))

# Sort the list of tuples based on values
sorted_values_with_index = sorted(indexed_values, key=lambda x: x[1])

# Calculate the number of values to keep (80%)
num_values_to_keep = int(len(sorted_values_with_index) * 0.9)

# Take the top 80% of values and indices
top_values_with_index = sorted_values_with_index[-num_values_to_keep:]

# Extract sorted values and indices from the top values with indices
sorted_values = [value for index, value in top_values_with_index]
sorted_indices = [index for index, value in top_values_with_index]

# Print the original and sorted values with indices
#print("Original Values:", values)
#print("Sorted Values (Top 90%):", sorted_values)
#print("Sorted Indices (Top 90%):", sorted_indices)
count = 0
index = [740, 704]
for i in range(2):
    for j in sorted_indices:
        if index[i] == j:
            count = count+1
            p = index[i]
if count == 0:
    print("Victory")
else:
    print(p)
       

740
