In [1]:
# Import Pandas library
import pandas as pd

# Load the essay dataset into `df`.
# NOTE(review): `dataset1_url` is assigned here but is not passed to `read_csv`;
# the data is read from the local file 'merged_essays.csv', whose name differs
# from the 'final_essays.csv' in the URL. Confirm which source is intended.
dataset1_url ="https://github.com/SevithaJanga03/LLM---Detect-AI-Generated-Text/blob/main/final_essays.csv"   # Not read by this cell
df = pd.read_csv('merged_essays.csv')

# Preview the first rows of the loaded frame (columns shown in the output:
# id, prompt_id, text, generated).
print(df.head())


         id  prompt_id                                               text  \
0  0059830c          0  Cars. Cars have been around since they became ...   
1  005db917          0  Transportation is a large necessity in most co...   
2  008f63e3          0  "America's love affair with it's vehicles seem...   
3    940276          0  How often do you ride in a car? Do you drive a...   
4  00c39458          0  Cars are a wonderful thing. They are perhaps o...   

   generated  
0          0  
1          0  
2          0  
3          0  
4          0  


In [2]:
# Specify the features (X) and target variable (y)
from sklearn.model_selection import train_test_split
# X intentionally keeps every column, including the 'generated' target:
# later cells read row['generated'] and X_train[['text','generated']] from the
# split frames, so dropping the column here would break them.
X = df
y = df['generated']

# Split the dataset into training (70%) and development (30%) sets.
# random_state is fixed so the split is the same on every run.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3, random_state=52)

# Display the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Development set shape:", X_dev.shape, y_dev.shape)

Training set shape: (1041, 4) (1041,)
Development set shape: (447, 4) (447,)


In [3]:
from collections import Counter

# Maps essay id -> that essay's vocabulary dict; filled by the loop below.
my_dict = {}


def build_vocabulary(essay, min_occurrence=5):
    """Return a word -> index mapping for the frequent words of one essay.

    The essay is split on whitespace and lower-cased. Words that occur at
    least `min_occurrence` times are kept and numbered 0, 1, 2, ... in the
    order in which they first appear in the essay.
    """
    frequencies = Counter(token.lower() for token in essay.split())
    frequent_tokens = (
        token for token, seen in frequencies.items() if seen >= min_occurrence
    )
    return {token: position for position, token in enumerate(frequent_tokens)}

# Build one vocabulary per training essay and store it in `my_dict` under the
# essay id. Each stored dict starts with a class marker key ('HUMAN' or 'LLM');
# calculate_conditional_probability identifies the class by reading the FIRST
# key, so the marker must be inserted before the essay's words.
for index, row in X_train.iterrows():
    reverse_index = build_vocabulary(row['text'])
    class_marker = {'HUMAN': 0} if row['generated'] == 0 else {'LLM': 1}
    Class_voc = {**class_marker, **reverse_index}

    essay_id = row['id']
    # The previous try/except called .append() on a dict, which raises
    # AttributeError for a repeated id; fail with an explicit message instead
    # of silently dropping or overwriting an essay.
    if essay_id in my_dict:
        raise ValueError(f"Duplicate essay id in training data: {essay_id!r}")
    # In-place insert; rebuilding the whole dict per row was quadratic.
    my_dict[essay_id] = Class_voc

print(my_dict)

{'9f45f83a': {'HUMAN': 0, 'the': 0, 'electoral': 1, 'college': 2, 'has': 3, 'in': 4, 'it': 5, 'of': 6, 'by': 7, 'a': 8, 'vote': 9, 'and': 10, 'as': 11, 'is': 12, 'electors': 13, 'they': 14, 'for': 15, 'votes': 16, 'are': 17, 'their': 18, 'this': 19, 'citizens': 20, 'to': 21, 'president.': 22, 'elect': 23, 'that': 24, 'states': 25}, '9b7c31e7': {'HUMAN': 0, 'the': 0, 'electoral': 1, 'college': 2, 'by': 3, 'vote': 4, 'is': 5, 'for': 6, 'people': 7, 'to': 8, 'and': 9, 'of': 10, 'we': 11}, 'dc6f7cfd': {'HUMAN': 0, 'cars': 0, 'have': 1, 'a': 2, 'to': 3, 'as': 4, 'of': 5, 'people': 6, 'the': 7, 'world': 8, 'without': 9, 'in': 10, 'our': 11, 'many': 12, 'would': 13, 'be': 14, 'and': 15, 'not': 16, 'car': 17, 'this': 18, 'using': 19}, '489d3798': {'HUMAN': 0, 'to': 0, 'election': 1, 'by': 2, 'popular': 3, 'vote': 4, 'the': 5, 'of': 6, 'it': 7, 'is': 8, 'and': 9, 'a': 10, 'electoral': 11, 'college': 12, 'as': 13, 'they': 14, 'have': 15, 'method': 16, 'in': 17}, 'cf290550': {'HUMAN': 0, 'cars': 

In [4]:
def calculate_probability(word, documents):
    """Return the fraction of documents whose vocabulary contains `word`.

    Parameters:
        word: the key to look up in each document's vocabulary.
        documents: mapping of document id -> vocabulary dict.

    Returns:
        Number of documents containing `word` divided by the number of
        documents passed in, or 0.0 when `documents` is empty.
    """
    if not documents:
        return 0.0
    num_documents_with_word = sum(1 for document in documents.values() if word in document)
    # Normalise by the collection actually passed in rather than by a global,
    # so the function gives the right answer for any `documents` argument.
    return num_documents_with_word / len(documents)

def calculate_conditional_probability(word, documents,class_label):
    """Return the fraction of `class_label` documents that contain `word`.

    A document belongs to `class_label` when the first key of its vocabulary
    dict equals `class_label` (the class marker is stored as the first key).

    Parameters:
        word: the key to look up in each document's vocabulary.
        documents: mapping of document id -> vocabulary dict.
        class_label: class marker to condition on, e.g. 'HUMAN' or 'LLM'.

    Returns:
        Matching documents containing `word` divided by the number of
        documents in the class, or 0.0 when the class has no documents.
    """
    num_documents_in_class = 0
    num_positive_documents = 0
    for document in documents.values():
        # Peek at the first key without materialising the whole item list.
        if next(iter(document), None) != class_label:
            continue
        num_documents_in_class += 1
        if word in document:
            num_positive_documents += 1

    if num_documents_in_class == 0:
        return 0.0
    return num_positive_documents / num_documents_in_class


# Word and class for which the probabilities are calculated
target_word = "the"
class_label = "HUMAN"
# Fraction of training essays whose vocabulary contains the word, and the same
# fraction restricted to essays of `class_label`.
probability = calculate_probability(target_word, my_dict)
# NOTE(review): the variable is named `llm_...` but holds the value for
# `class_label`, which is "HUMAN" here; consider renaming.
llm_conditional_probability = calculate_conditional_probability(target_word, my_dict,class_label)

# Display the result
print(f"Probability of '{target_word}': {probability:.4f}")
print(f"Probability of '{target_word} in {class_label} class ': {llm_conditional_probability:.4f}")

Probability of 'the': 0.9817
Probability of 'the in HUMAN class ': 0.9916


In [24]:
import nltk
import random
from collections import defaultdict

# Download the NLTK 'punkt' package
# (presumably required by nltk.word_tokenize used in tokenize() below; confirm).
nltk.download('punkt')

# Convert the train/dev frames to plain lists of [text, generated] pairs,
# the format consumed by count_words() and evaluate_accuracy().
X_train_data = X_train[['text','generated']].values.tolist()
X_dev_data=X_dev[['text','generated']].values.tolist()

def tokenize(text):
    """Lower-case `text` and split it into tokens with nltk.word_tokenize."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

def calculate_probabilities(word_counts, class_counts,smoothing=0):
    """Estimate P(word | label) from counts, with optional additive smoothing.

    Parameters:
        word_counts: mapping of (word, label) -> occurrence count.
        class_counts: mapping of label -> total count used as the denominator
            for that label.
        smoothing: additive smoothing constant; 0 disables smoothing.

    Returns:
        Mapping of (word, label) -> probability, computed as
        (count + smoothing) / (class_counts[label] + smoothing * vocabulary_size).
        With smoothing == 0 only the pairs present in `word_counts` are
        returned. With smoothing > 0 every vocabulary word gets an entry for
        every label in `class_counts`, so a word seen in only one class no
        longer falls back to probability 0 for the other class.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # order of the added entries is deterministic.
    vocabulary = list(dict.fromkeys(word for word, _ in word_counts))
    vocabulary_size = len(vocabulary)
    probabilities = {}

    for (word, label), count in word_counts.items():
        probabilities[(word, label)] = (count + smoothing) / (class_counts[label] + smoothing * vocabulary_size)

    if smoothing > 0:
        # Give unseen (word, label) pairs their smoothed probability; without
        # this, smoothing never reaches the pairs it exists for.
        for label in list(class_counts):
            denominator = class_counts[label] + smoothing * vocabulary_size
            for word in vocabulary:
                if (word, label) not in probabilities:
                    probabilities[(word, label)] = smoothing / denominator

    return probabilities

def count_words(data):
    """Count tokens per (word, label) pair and per label.

    `data` is an iterable of (text, label) pairs. Each text is tokenized with
    tokenize(). Returns a tuple (frequent_pairs, class_counts):
      * frequent_pairs: dict of (word, label) -> count, keeping only pairs
        counted at least 5 times;
      * class_counts: defaultdict of label -> total number of tokens seen for
        that label (rare tokens included).
    """
    pair_totals = defaultdict(int)
    class_counts = defaultdict(int)

    for text, label in data:
        for token in tokenize(text):
            pair_totals[(token, label)] += 1
            class_counts[label] += 1

    # Drop rare (word, label) pairs; the per-label totals are left untouched.
    frequent_pairs = {}
    for pair, total in pair_totals.items():
        if total >= 5:
            frequent_pairs[pair] = total

    return frequent_pairs, class_counts

def predict_class(text, probabilities, class_probabilities):
    """Predict the label of `text` with a Naive Bayes score computed in log space.

    Parameters:
        text: the essay to classify.
        probabilities: mapping of (word, label) -> P(word | label).
        class_probabilities: mapping of label -> prior weight for labels 0 and 1.

    Returns:
        0 (human) or 1 (LLM): the label with the larger
        log(prior) + sum(log P(word | label)). Words with no positive
        probability under either label are treated as out-of-vocabulary and
        skipped. When the two scores are equal (including both being -inf),
        the label with the larger prior is returned.

    The earlier version started both scores at 0 and multiplied into them, so
    the word evidence was always discarded and only the priors were compared.
    """
    import math  # local import: only this function needs it

    def _safe_log(value):
        # log(0) is undefined; -inf marks "impossible under this label".
        return math.log(value) if value > 0 else float("-inf")

    log_prob_human = _safe_log(class_probabilities[0])
    log_prob_llm = _safe_log(class_probabilities[1])

    for word in tokenize(text):
        prob_human = probabilities.get((word, 0), 0)
        prob_llm = probabilities.get((word, 1), 0)
        if prob_human <= 0 and prob_llm <= 0:
            # Unknown to both classes: carries no evidence either way.
            continue
        log_prob_human += _safe_log(prob_human)
        log_prob_llm += _safe_log(prob_llm)

    if log_prob_human == log_prob_llm:
        return 0 if class_probabilities[0] > class_probabilities[1] else 1
    return 0 if log_prob_human > log_prob_llm else 1

def evaluate_accuracy(dev_data, probabilities, class_probabilities):
    """Return the fraction of `dev_data` examples that predict_class labels correctly.

    Parameters:
        dev_data: sequence of (text, true_label) pairs.
        probabilities: mapping of (word, label) -> P(word | label).
        class_probabilities: mapping of label -> prior weight.

    Returns:
        Correct predictions divided by len(dev_data), or 0.0 when `dev_data`
        is empty (previously a ZeroDivisionError).
    """
    if not dev_data:
        return 0.0

    correct_predictions = sum(
        1
        for text, true_label in dev_data
        if predict_class(text, probabilities, class_probabilities) == true_label
    )
    return correct_predictions / len(dev_data)

# Count (word, label) occurrences and per-label token totals on the training data.
words_count,class_count = (count_words(X_train_data))
print('\nWords count ("word","label" ==> count) : ',words_count)
print('Class Count ("label" ==> count) : ',class_count)


# Class priors are the share of training ESSAYS carrying each label.
# `class_count` holds token totals per label (count_words increments it once
# per token), so dividing it by the number of essays did not give probabilities.
train_labels = [label for _, label in X_train_data]
class_probabilities = {label: train_labels.count(label) / len(train_labels) for label in class_count}
print('\nProbability of Classes : ',class_probabilities)

# Unsmoothed P(word | label) estimates.
probabilities_of_each_word = calculate_probabilities(words_count, class_count)
print('\nProbability of word in that particular class ("word","label" ==> probability) : ',probabilities_of_each_word)

accuracy = evaluate_accuracy(X_dev_data, probabilities_of_each_word, class_probabilities)
print(f"\nAccuracy on dev data: {accuracy}")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Words count ("word","label" ==> count) :  {('the', 0): 33971, ('electoral', 0): 5121, ('college', 0): 4287, ('has', 0): 2021, ('been', 0): 810, ('in', 0): 11242, ('constitution', 0): 138, ('since', 0): 406, ('out', 0): 735, ('founding', 0): 183, ('fathers', 0): 171, ('established', 0): 102, ('it', 0): 6183, ('.', 0): 24001, ('was', 0): 1510, ('an', 0): 1421, ('agreement', 0): 9, ('between', 0): 191, ('election', 0): 1428, ('of', 0): 15277, ('president', 0): 2617, ('by', 0): 2646, ('a', 0): 12972, ('vote', 0): 4302, ('congress', 0): 220, ('and', 0): 11969, ('popular', 0): 1700, ('citizens', 0): 761, ('as', 0): 2962, ('we', 0): 3179, ('all', 0): 2042, ('know', 0): 575, ('is', 0): 8795, ('process', 0): 592, ('which', 0): 1129, ('meeting', 0): 45, ('electors', 0): 1794, ('where', 0): 795, ('they', 0): 3921, ('for', 0): 6558, ('vice', 0): 89, (',', 0): 22237, ('counting', 0): 53, ('votes', 0): 1799, ('there', 0): 2122, ('are', 0): 5093, ('two', 0): 428, ('political', 0): 174, ('parties', 0

In [27]:
def derive_top_words(probabilities, label, top_n=10):
    """Return the `top_n` (word, probability) pairs with the highest probability for `label`.

    `probabilities` maps (word, label) -> probability. Entries for other
    labels are ignored. Ties keep their original dictionary order.
    """
    candidates = []
    for (word, entry_label), prob in probabilities.items():
        if entry_label == label:
            candidates.append((word, prob))

    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


def calculate_probabilities(word_counts, class_counts,smoothing=0):
    """Estimate P(word | label) from counts, print the result and return it.

    Parameters:
        word_counts: mapping of (word, label) -> occurrence count.
        class_counts: mapping of label -> total count used as the denominator
            for that label.
        smoothing: additive smoothing constant; 0 disables smoothing.

    Returns:
        Mapping of (word, label) -> probability, computed as
        (count + smoothing) / (class_counts[label] + smoothing * vocabulary_size).
        With smoothing == 0 only the pairs present in `word_counts` are
        returned. With smoothing > 0 every vocabulary word gets an entry for
        every label in `class_counts`, so a word seen in only one class no
        longer falls back to probability 0 for the other class.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # order of the added entries is deterministic.
    vocabulary = list(dict.fromkeys(word for word, _ in word_counts))
    vocabulary_size = len(vocabulary)
    probabilities = {}

    for (word, label), count in word_counts.items():
        probabilities[(word, label)] = (count + smoothing) / (class_counts[label] + smoothing * vocabulary_size)

    if smoothing > 0:
        # Give unseen (word, label) pairs their smoothed probability; without
        # this, smoothing never reaches the pairs it exists for.
        for label in list(class_counts):
            denominator = class_counts[label] + smoothing * vocabulary_size
            for word in vocabulary:
                if (word, label) not in probabilities:
                    probabilities[(word, label)] = smoothing / denominator

    print(probabilities)
    return probabilities

def apply_smoothing(train_data, dev_data, smoothing_values=(1, 2, 3, 4, 5)):
    """Evaluate dev-set accuracy for several additive-smoothing constants.

    Parameters:
        train_data: sequence of (text, label) pairs used to estimate the model.
        dev_data: sequence of (text, label) pairs used to measure accuracy.
        smoothing_values: smoothing constants to try; defaults to 1 through 5.

    Returns:
        Dict mapping each smoothing value to its dev-set accuracy. The
        accuracy for each value is also printed.
    """
    # The counts and priors do not depend on the smoothing value, so compute
    # them once instead of re-tokenizing the training set on every iteration.
    word_counts, class_counts = count_words(train_data)

    # Priors are the share of training essays per label. `class_counts` holds
    # token totals, so dividing it by the essay count is not a probability.
    train_labels = [label for _, label in train_data]
    class_probabilities = {
        label: train_labels.count(label) / len(train_labels) for label in class_counts
    }

    accuracies = {}
    for smoothing in smoothing_values:
        probabilities = calculate_probabilities(word_counts, class_counts, smoothing)

        accuracy = evaluate_accuracy(dev_data, probabilities, class_probabilities)
        print(f"\nAccuracy on dev data with smoothing {smoothing}: {accuracy}")
        accuracies[smoothing] = accuracy

    return accuracies




# Report dev accuracy for each smoothing value.
apply_smoothing(X_train_data, X_dev_data)

# Highest-probability words per class, taken from the unsmoothed estimates
# computed in the earlier cell.
top_words_human = derive_top_words(probabilities_of_each_word, 0)
top_words_llm = derive_top_words(probabilities_of_each_word, 1)

for heading, ranked_words in (
    ("\nTop 10 words predicting human essays:", top_words_human),
    ("\nTop 10 words predicting LLM-generated essays:", top_words_llm),
):
    print(heading)
    for word, prob in ranked_words:
        print(f"{word}: {prob}")


{('the', 0): 0.05738649224812749, ('electoral', 0): 0.008652231640613123, ('college', 0): 0.007243414540208721, ('has', 0): 0.0034156213153689447, ('been', 0): 0.001369964830249364, ('in', 0): 0.01899200318926461, ('constitution', 0): 0.00023480285006740025, ('since', 0): 0.0006875162588304454, ('out', 0): 0.0012432726449611985, ('founding', 0): 0.0003108181612402996, ('fathers', 0): 0.00029054741159419313, ('established', 0): 0.00017399060112908076, ('it', 0): 0.010446192984293548, ('.', 0): 0.040544877750487346, ('was', 0): 0.002552425226272243, ('an', 0): 0.0024020838330636196, ('agreement', 0): 1.6892291371755415e-05, ('between', 0): 0.00032433199433770394, ('election', 0): 0.0024139084370238487, ('of', 0): 0.02580804275776792, ('president', 0): 0.004422401881125567, ('by', 0): 0.004471389526103658, ('a', 0): 0.021914369596578297, ('vote', 0): 0.0072687529772663545, ('congress', 0): 0.00037331963931579463, ('and', 0): 0.02022007277199123, ('popular', 0): 0.002873378762335596, ('cit