In [1]:
# step1:Load and Prepare the Data
import csv
import random
from collections import defaultdict, Counter

# Function to load data from a CSV file
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        headers = next(reader)
        data = [row for row in reader]
    return headers, data

# Load training data
train_headers, train_data = load_data('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

In [2]:
# Step 2: Preprocess the Text Data
import re
import string

# Function to clean and tokenize text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    return text.split()

# Preprocess essays in the training data
for row in train_data:
    text_index = train_headers.index('text')
    row[text_index] = preprocess_text(row[text_index])

In [3]:
# Step 3: Split the Data into Training and Development Sets bold text
# Split the data manually
def split_data(data, split_ratio=0.8):
    random.shuffle(data)
    split_point = int(len(data) * split_ratio)
    return data[:split_point], data[split_point:]

train_data, dev_data = split_data(train_data)

In [4]:
# Step 4: Build the Vocabulary and the Reverse Index
# Function to build vocabulary and reverse index
def build_vocabulary_and_index(data):
    word_counts = Counter(word for row in data for word in row[train_headers.index('text')])
    vocabulary = {word for word, count in word_counts.items() if count >= 5}
    reverse_index = {word: idx for idx, word in enumerate(vocabulary)}
    return vocabulary, reverse_index

vocabulary, reverse_index = build_vocabulary_and_index(train_data)

In [5]:
# Step 5: Calculate Probabilities
# Function to calculate word occurrence probabilities
def calculate_probabilities(data, vocabulary):
    word_occurrences = defaultdict(int)
    class_word_occurrences = defaultdict(lambda: defaultdict(int))
    class_counts = defaultdict(int)
    
    for row in data:
        label = int(row[train_headers.index('generated')])
        class_counts[label] += 1
        words = set(row[train_headers.index('text')])
        for word in words:
            if word in vocabulary:
                word_occurrences[word] += 1
                class_word_occurrences[label][word] += 1

    total_docs = len(data)
    word_probs = {word: count / total_docs for word, count in word_occurrences.items()}
    word_given_class_probs = {
        label: {word: (count / class_counts[label]) for word, count in word_counts.items()}
        for label, word_counts in class_word_occurrences.items()
    }
    
    return word_probs, word_given_class_probs

word_probs, word_given_class_probs = calculate_probabilities(train_data, vocabulary)


In [6]:
# Step 6: Define the Classifier and Evaluate Accuracy
# Function to classify a new document
def classify(document, vocabulary, reverse_index, word_given_class_probs):
    doc_words = set(document)
    class_scores = defaultdict(float)
    
    for word in doc_words:
        if word in vocabulary:
            for class_label, probs in word_given_class_probs.items():
                word_idx = reverse_index[word]
                class_scores[class_label] += probs.get(word_idx, 0)
    
    return max(class_scores, key=class_scores.get)

# Function to evaluate the classifier
def evaluate(data, classify_func):
    correct = 0
    for row in data:
        label = int(row[train_headers.index('generated')])
        prediction = classify_func(row[train_headers.index('text')], vocabulary, reverse_index, word_given_class_probs)
        if prediction == label:
            correct += 1
    return correct / len(data)

accuracy = evaluate(dev_data, classify)
print(f"Development Set Accuracy: {accuracy:.2%}")


Development Set Accuracy: 99.64%


In [7]:
# Step 7: Experiment with Smoothing and Identify Top Predictive Words
def train_naive_bayes(data, vocabulary):
    word_given_class_counts = defaultdict(lambda: defaultdict(int))
    class_counts = defaultdict(int)
    
    # Count how many times each word appears in documents of each class
    for row in data:
        label = int(row[train_headers.index('generated')])
        class_counts[label] += 1
        words = row[train_headers.index('text')]
        for word in words:
            if word in vocabulary:
                word_given_class_counts[label][word] += 1
    
    # Apply Laplace smoothing to word counts and convert them to probabilities
    word_given_class_probs = {
        label: {
            word: (word_count + 1) / (sum(class_word_counts.values()) + len(vocabulary))
            for word, word_count in class_word_counts.items()
        } for label, class_word_counts in word_given_class_counts.items()
    }
    
    # Calculate class probabilities
    total_docs = sum(class_counts.values())
    class_probs = {label: count / total_docs for label, count in class_counts.items()}
    
    return word_given_class_probs, class_probs

# Run the training function
word_given_class_probs, class_probs = train_naive_bayes(train_data, vocabulary)


In [8]:
# Step 8: Prepare Submission
def smooth_and_top_words(word_given_class_probs, vocabulary, smooth_factor=1):
    smoothed_probs = {}
    top_words_probs = {}  # Change the name of this variable to match what you return

    # Apply smoothing and calculate probabilities
    for label, word_probs in word_given_class_probs.items():
        total_words = sum(word_probs.values()) + smooth_factor * len(vocabulary)
        smoothed_probs[label] = {
            word: (count + smooth_factor) / total_words
            for word, count in word_probs.items()
        }
        # Get the top 10 words by probability
        top_words_probs[label] = sorted(smoothed_probs[label].items(), key=lambda item: item[1], reverse=True)[:10]

    return top_words_probs

top_words_probs = smooth_and_top_words(word_given_class_probs, vocabulary)

# Print top words
for label, words in top_words_probs.items():  # Use top_words_probs here
    print(f"Class {label}:")
    for word, prob in words:
        print(f"  {word}: {prob:.6f}")

# Function to save top words to a CSV file
def save_top_words_to_csv(top_words_probs, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Class", "Word", "Probability"])
        for label, words in top_words_probs.items():
            for word, prob in words:
                writer.writerow([label, word, prob])

# Call the function to save the data
save_top_words_to_csv(top_words_probs, 'submission.csv')


Class 0:
  the: 0.000261
  to: 0.000253
  of: 0.000253
  a: 0.000251
  and: 0.000251
  in: 0.000251
  is: 0.000249
  that: 0.000249
  for: 0.000248
  it: 0.000248
Class 1:
  the: 0.000247
  for: 0.000246
  to: 0.000246
  of: 0.000246
  and: 0.000246
  a: 0.000246
  states: 0.000246
  is: 0.000246
  that: 0.000246
  more: 0.000246
