In [5]:
# step1:Load and Prepare the Data
import csv
import random
from collections import defaultdict, Counter

# Function to load data from a CSV file
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        headers = next(reader)
        data = [row for row in reader]
    return headers, data

# Load training data
train_headers, train_data = load_data('train_essays.csv')

In [6]:
# Step 2: Preprocess the Text Data
import re
import string

# Function to clean and tokenize text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    return text.split()

# Preprocess essays in the training data
for row in train_data:
    text_index = train_headers.index('text')
    row[text_index] = preprocess_text(row[text_index])

In [7]:
# Step 3: Split the Data into Training and Development Sets bold text
# Split the data manually
def split_data(data, split_ratio=0.8):
    random.shuffle(data)
    split_point = int(len(data) * split_ratio)
    return data[:split_point], data[split_point:]

train_data, dev_data = split_data(train_data)

In [8]:
# Step 4: Build the Vocabulary and the Reverse Index
# Function to build vocabulary and reverse index
def build_vocabulary_and_index(data):
    word_counts = Counter(word for row in data for word in row[train_headers.index('text')])
    vocabulary = {word for word, count in word_counts.items() if count >= 5}
    reverse_index = {word: idx for idx, word in enumerate(vocabulary)}
    return vocabulary, reverse_index

vocabulary, reverse_index = build_vocabulary_and_index(train_data)

In [9]:
# Step 5: Calculate Probabilities
# Function to calculate word occurrence probabilities
def calculate_probabilities(data, vocabulary):
    word_occurrences = defaultdict(int)
    class_word_occurrences = defaultdict(lambda: defaultdict(int))
    class_counts = defaultdict(int)

    for row in data:
        label = int(row[train_headers.index('generated')])
        class_counts[label] += 1
        words = set(row[train_headers.index('text')])
        for word in words:
            if word in vocabulary:
                word_occurrences[word] += 1
                class_word_occurrences[label][word] += 1

    total_docs = len(data)
    word_probs = {word: count / total_docs for word, count in word_occurrences.items()}
    word_given_class_probs = {
        label: {word: (count / class_counts[label]) for word, count in word_counts.items()}
        for label, word_counts in class_word_occurrences.items()
    }

    return word_probs, word_given_class_probs

word_probs, word_given_class_probs = calculate_probabilities(train_data, vocabulary)

def calculate_ai_generated_prob(document, vocabulary, word_given_class_probs, class_probs):
    doc_words = set(document)
    ai_generated_score = class_probs[1]  # The probability of the AI-generated class
    not_ai_generated_score = class_probs[0]  # The probability of the not-AI-generated class

    for word in doc_words:
        if word in vocabulary:
            # Probability of word given AI-generated
            ai_generated_score *= word_given_class_probs[1].get(word, 1e-10)
            # Probability of word given not AI-generated
            not_ai_generated_score *= word_given_class_probs[0].get(word, 1e-10)

    # Calculate the normalized probability
    total = ai_generated_score + not_ai_generated_score
    ai_generated_prob = ai_generated_score / total if total > 0 else 0.5  # Avoid division by zero

    return ai_generated_prob

# Function to generate probabilities for the test data
def generate_probabilities_for_test_data(data, calculate_prob_func, vocabulary, word_given_class_probs, class_probs):
    probabilities = []
    for row in data:
        document = row[train_headers.index('text')]
        prob = calculate_prob_func(document, vocabulary, word_given_class_probs, class_probs)
        probabilities.append(prob)
    return probabilities

In [10]:
# Step 6: Define the Classifier and Evaluate Accuracy
# Function to classify a new document
def classify(document, vocabulary, reverse_index, word_given_class_probs):
    doc_words = set(document)
    class_scores = defaultdict(float)

    for word in doc_words:
        if word in vocabulary:
            for class_label, probs in word_given_class_probs.items():
                word_idx = reverse_index[word]
                class_scores[class_label] += probs.get(word_idx, 0)

    return max(class_scores, key=class_scores.get)

# Function to evaluate the classifier
def evaluate(data, classify_func):
    correct = 0
    for row in data:
        label = int(row[train_headers.index('generated')])
        prediction = classify_func(row[train_headers.index('text')], vocabulary, reverse_index, word_given_class_probs)
        if prediction == label:
            correct += 1
    return correct / len(data)

accuracy = evaluate(dev_data, classify)
print(f"Development Set Accuracy: {accuracy:.2%}")


Development Set Accuracy: 99.64%


In [11]:
# Step 7: Experiment with Smoothing and Identify Top Predictive Words
def train_naive_bayes(data, vocabulary):
    word_given_class_counts = defaultdict(lambda: defaultdict(int))
    class_counts = defaultdict(int)

    # Count how many times each word appears in documents of each class
    for row in data:
        label = int(row[train_headers.index('generated')])
        class_counts[label] += 1
        words = row[train_headers.index('text')]
        for word in words:
            if word in vocabulary:
                word_given_class_counts[label][word] += 1

    # Apply Laplace smoothing to word counts and convert them to probabilities
    word_given_class_probs = {
        label: {
            word: (word_count + 1) / (sum(class_word_counts.values()) + len(vocabulary))
            for word, word_count in class_word_counts.items()
        } for label, class_word_counts in word_given_class_counts.items()
    }

    # Calculate class probabilities
    total_docs = sum(class_counts.values())
    class_probs = {label: count / total_docs for label, count in class_counts.items()}

    return word_given_class_probs, class_probs

# Run the training function
word_given_class_probs, class_probs = train_naive_bayes(train_data, vocabulary)

# Define the function to get top words with their probabilities
def get_top_words(word_given_class_probs, vocabulary, top_n=10):
    top_words = {}
    for label, word_probs in word_given_class_probs.items():
        # Get the top_n words by probability for each class
        top_words[label] = sorted(word_probs.items(), key=lambda item: item[1], reverse=True)[:top_n]
    return top_words

# Get the top words for each class
top_words = get_top_words(word_given_class_probs, vocabulary)

# Now print the top words for each class
for label, words in top_words.items():
    print(f"Class {label}:")
    for word, prob in words:
        print(f"  {word}: {prob:.6f}")


Class 0:
  the: 0.064804
  to: 0.032030
  of: 0.029471
  a: 0.025067
  and: 0.023211
  in: 0.021758
  is: 0.016714
  that: 0.014802
  for: 0.012570
  it: 0.010691
Class 1:
  the: 0.019723
  and: 0.014792
  of: 0.012481
  a: 0.011864
  to: 0.011556
  that: 0.007396
  in: 0.006626
  for: 0.005547
  car: 0.005085
  is: 0.004931


In [16]:
# Load test data
test_headers, test_data = load_data('test_essays.csv')

# Preprocess test data
for row in test_data:
    text_index = test_headers.index('text')
    row[text_index] = preprocess_text(row[text_index])

# Generate probabilities for the test data with probabilities
test_probabilities = generate_probabilities_for_test_data(test_data, calculate_ai_generated_prob, vocabulary, word_given_class_probs, class_probs)

# Save the probabilities to submission.csv
def save_probabilities_to_csv(ids, probabilities, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["id", "generated"])
        for id, prob in zip(ids, probabilities):
            writer.writerow([id, prob])

# Extract the ids from the test dataset
test_ids = [row[0] for row in test_data]  # Assuming the first column is 'id'

# Save probabilities to submission.csv (adjust the path as needed)
save_probabilities_to_csv(test_ids, test_probabilities, 'submission.csv')
