In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv("train_essays.csv")

In [3]:
df

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0


In [12]:
# generated - Whether the essay was written by a student (0) or generated by an LLM (1)

In [4]:
df['prompt_id'].value_counts()

0    708
1    670
Name: prompt_id, dtype: int64

In [5]:
len(df['text'].value_counts().index)

1378

In [6]:
df['generated'].value_counts()

0    1375
1       3
Name: generated, dtype: int64

In [10]:
## Seeing the avg of word count of essays

avg_length=0
for essay in df['text']:
    ll=len(essay)
    avg_length+=ll
avg_length

4366952

In [11]:
avg_length=avg_length/len(df['text'])
avg_length

3169.0507982583454

In [13]:
## Creating the model witht he original dataset 

train_df, dev_df = train_test_split(df, test_size=0.2, random_state=42)

# Build a vocabulary
vectorizer = CountVectorizer(min_df=5)  # omit words occurring less than five times
X_train = vectorizer.fit_transform(train_df['text'])
vocabulary = vectorizer.get_feature_names_out()

# Reverse index
reverse_index = {word: idx for idx, word in enumerate(vocabulary)}

# Calculate probabilities
total_documents = len(train_df)
class_counts = train_df['generated'].value_counts()

# P[word]
word_occurrence_prob = np.array(X_train.sum(axis=0) / total_documents).flatten()

# P[word | LLM]
llm_documents = train_df[train_df['generated'] == 1]
llm_word_occurrence_prob = np.array(X_train[train_df['generated'] == 1].sum(axis=0) / class_counts[1]).flatten()

# Build the LLM model
model = MultinomialNB()
model.fit(X_train, train_df['generated'])

# Predict on development set
X_dev = vectorizer.transform(dev_df['text'])
predictions = model.predict(X_dev)

# Calculate accuracy on the development set
accuracy = accuracy_score(dev_df['generated'], predictions)

# Experiments with smoothing
smoothing_values = [1.0, 0.5, 0.1, 0.01]
for alpha in smoothing_values:
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train, train_df['generated'])
    predictions = model.predict(X_dev)
    accuracy = accuracy_score(dev_df['generated'], predictions)
    print(f'Accuracy with smoothing alpha={alpha}: {accuracy}')

# Derive Top 10 words that predict each class
top_words_llm = model.feature_log_prob_[1].argsort()[-10:][::-1]
top_words_human = model.feature_log_prob_[0].argsort()[-10:][::-1]

top_words_llm = [vocabulary[idx] for idx in top_words_llm]
top_words_human = [vocabulary[idx] for idx in top_words_human]


Accuracy with smoothing alpha=1.0: 0.9963768115942029
Accuracy with smoothing alpha=0.5: 0.9963768115942029
Accuracy with smoothing alpha=0.1: 0.9963768115942029
Accuracy with smoothing alpha=0.01: 0.9963768115942029


In [14]:
df_test=pd.read_csv("test_essays.csv")

In [15]:
df_test

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [16]:

# P[class | word] using the test dataset
test_df = pd.read_csv('test_essays.csv')  # Load your test dataset
X_test = vectorizer.transform(test_df['text'])
predictions_test = model.predict(X_test)
# accuracy_test = accuracy_score(test_df['generated'], predictions_test)

In [17]:
predictions_test

array([0, 0, 0], dtype=int64)