In [3]:
import numpy as np
import pandas as pd
import os

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [4]:
human_text_files = [
    os.path.join('/app/data/humans_writing', 'essays_sorted.csv')
]

ai_text_files = [
    os.path.join('/app/data/ai_generated', 'ai_generated_train_essays.csv'),
    os.path.join('/app/data/ai_generated', 'ai_generated_train_essays_gpt-4.csv'),
    os.path.join('/app/data/ai_generated', 'ai-essays.csv')
]

In [5]:
humans_dataframe =[]

for file_path in human_text_files:
    df = pd.read_csv(file_path)
    humans_dataframe.append(df)

humans_combined_essays = pd.concat(humans_dataframe, ignore_index=True)
humans_combined_essays.head()

Unnamed: 0,title,description,essay,authors,source_url,thumbnail_url
0,Addiction,"Passion stabs, unrequited love hurts and taboo...",Omer Bonne means well; that much is certain. A...,Angela Chen,https://aeon.co//essays/how-far-should-medicin...,https://images.aeonmedia.co/images/4c7d125f-f4...
1,Addiction,The fear of missing out haunts our social netw...,Here’s a test you might enjoy: rate these scen...,Jacob Burak,https://aeon.co//essays/can-we-break-free-from...,https://images.aeonmedia.co/images/1c3abfe6-af...
2,Addiction,The neuroscientific picture of addiction overl...,Human beings crave all sorts of things: coffee...,Zoey Lavallee,https://aeon.co//essays/why-the-pull-of-addict...,https://images.aeonmedia.co/images/98330d80-eb...
3,Addiction,The new science of addiction makes 12-step pro...,I hear a single voice as I walk up the steps t...,Rebecca Ruiz,https://aeon.co//essays/how-the-aa-is-out-of-s...,https://images.aeonmedia.co/images/a7bcbc8e-9c...
4,Addiction,Cutting brings relief because emotion and pain...,Here’s what I remember about the first time I ...,Carrie Arnold,https://aeon.co//essays/how-self-harm-provokes...,https://images.aeonmedia.co/images/42f29e7a-fb...


READ AI FILES

In [6]:
ai_dataframes = []
for file_path in ai_text_files:
    df = pd.read_csv(file_path)
    ai_dataframes.append(df)

ai_generated_essays = pd.concat(ai_dataframes, ignore_index=True)

ai_generated_essays.head()


Unnamed: 0,id,prompt_id,text,generated
0,d429f032,0.0,Advantages of Limiting Car Usage \n\nLimiting ...,1.0
1,1ce279be,0.0,Advantages of Limiting Car Usage\n\nLimiting c...,1.0
2,c9595213,0.0,Limiting car usage has numerous advantages tha...,1.0
3,f2266d87,0.0,The passages provided discuss the advantages o...,1.0
4,eeace4bd,0.0,Title: The Advantages of Limiting Car Usage\n\...,1.0


READ HUMANS ESSAYS

In [7]:
humans_dataframe =[]

for file_path in human_text_files:
    df = pd.read_csv(file_path)
    humans_dataframe.append(df)

humans_combined_essays = pd.concat(humans_dataframe, ignore_index=True)
humans_combined_essays.head()

Unnamed: 0,title,description,essay,authors,source_url,thumbnail_url
0,Addiction,"Passion stabs, unrequited love hurts and taboo...",Omer Bonne means well; that much is certain. A...,Angela Chen,https://aeon.co//essays/how-far-should-medicin...,https://images.aeonmedia.co/images/4c7d125f-f4...
1,Addiction,The fear of missing out haunts our social netw...,Here’s a test you might enjoy: rate these scen...,Jacob Burak,https://aeon.co//essays/can-we-break-free-from...,https://images.aeonmedia.co/images/1c3abfe6-af...
2,Addiction,The neuroscientific picture of addiction overl...,Human beings crave all sorts of things: coffee...,Zoey Lavallee,https://aeon.co//essays/why-the-pull-of-addict...,https://images.aeonmedia.co/images/98330d80-eb...
3,Addiction,The new science of addiction makes 12-step pro...,I hear a single voice as I walk up the steps t...,Rebecca Ruiz,https://aeon.co//essays/how-the-aa-is-out-of-s...,https://images.aeonmedia.co/images/a7bcbc8e-9c...
4,Addiction,Cutting brings relief because emotion and pain...,Here’s what I remember about the first time I ...,Carrie Arnold,https://aeon.co//essays/how-self-harm-provokes...,https://images.aeonmedia.co/images/42f29e7a-fb...


In [8]:
humans_essays = humans_combined_essays['essay']
ai_essays = ai_generated_essays['text']

In [9]:
humans_essays.head()
ai_essays.head()

ai_and_humans_df =pd.DataFrame({'AI': ai_essays, 'Human': humans_essays})
ai_and_humans_df.head()


Unnamed: 0,AI,Human
0,Advantages of Limiting Car Usage \n\nLimiting ...,Omer Bonne means well; that much is certain. A...
1,Advantages of Limiting Car Usage\n\nLimiting c...,Here’s a test you might enjoy: rate these scen...
2,Limiting car usage has numerous advantages tha...,Human beings crave all sorts of things: coffee...
3,The passages provided discuss the advantages o...,I hear a single voice as I walk up the steps t...
4,Title: The Advantages of Limiting Car Usage\n\...,Here’s what I remember about the first time I ...


In [10]:
print(len(ai_essays), len(humans_essays))

3488 2235


In [11]:
# Drop NaNs from each column and assign label
ai_labeled = pd.DataFrame({'text': ai_essays.dropna(), 'label': 'ai'})
human_labeled = pd.DataFrame({'text': humans_essays.dropna(), 'label': 'human'})

# Combine
training_df = pd.concat([ai_labeled, human_labeled], ignore_index=True)
print(training_df['label'].value_counts())
print(training_df.head())

label
ai       3488
human    2235
Name: count, dtype: int64
                                                text label
0  Advantages of Limiting Car Usage \n\nLimiting ...    ai
1  Advantages of Limiting Car Usage\n\nLimiting c...    ai
2  Limiting car usage has numerous advantages tha...    ai
3  The passages provided discuss the advantages o...    ai
4  Title: The Advantages of Limiting Car Usage\n\...    ai


Split into train/test sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    training_df['text'], training_df['label'], test_size=0.2, random_state=42, stratify=training_df['label']
)

Create a Single Labeled Dataset

In [15]:
ai_labeled = pd.DataFrame({'text': ai_essays.dropna(), 'label': 'ai'})
human_labeled = pd.DataFrame({'text': humans_essays.dropna(), 'label': 'human'})
training_df = pd.concat([ai_labeled, human_labeled], ignore_index=True)


Train-Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    training_df['text'],            # all essays
    training_df['label'],           # 'ai' / 'human'
    test_size=0.20,
    random_state=42,
    stratify=training_df['label']   # keep the same ai/human ratio
)
label_map = {'ai': 0, 'human': 1}
y_train_num = y_train.map(label_map)
y_test_num  = y_test.map(label_map)


Vectorize the Text

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english', 
    max_features=5000      # Limit to top 5k features for speed
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


Train a Classifier

In [21]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)


Evaluate

In [22]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          ai       1.00      1.00      1.00       698
       human       1.00      1.00      1.00       447

    accuracy                           1.00      1145
   macro avg       1.00      1.00      1.00      1145
weighted avg       1.00      1.00      1.00      1145



Preparing the data

In [23]:
import pandas as pd

df = pd.concat([
    pd.DataFrame({'essay': ai_essays, 'label': 0}),     # 0 = AI
    pd.DataFrame({'essay': humans_essays, 'label': 1})  # 1 = Human
], ignore_index=True).dropna()


Split Data & Tokenize

In [24]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 300

# 1. Tokenize ALL essays using the whole dataset for consistency
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df['essay'].astype(str))

# 2. Convert all essays to padded sequences
all_sequences = tokenizer.texts_to_sequences(df['essay'].astype(str))
all_padded = pad_sequences(all_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# 3. Convert labels to numpy array if needed
labels = df['label'].astype(int).values

# 4. Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    all_padded, labels, test_size=0.2, random_state=42, stratify=labels
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (4578, 300), Test shape: (1145, 300)


Statistical models

In [25]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# 1. load your df with columns 'essay' (str) and 'label' (0=AI,1=Human)
# assume df is already defined

# 2. train/test split
X_train_txt, X_test_txt, y_train, y_test = train_test_split(
    df['essay'].astype(str), df['label'], 
    test_size=0.2, random_state=42, stratify=df['label']
)

# 3. turn text into TF-IDF features
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tf = tfidf.fit_transform(X_train_txt)
X_test_tf  = tfidf.transform(X_test_txt)

# 4. fit logistic regression
stat_model = LogisticRegression(max_iter=1000, class_weight='balanced')
stat_model.fit(X_train_tf, y_train)

# 5. evaluate
y_pred = stat_model.predict(X_test_tf)
print("=== Statistical baseline ===")
print(classification_report(y_test, y_pred, target_names=['AI','Human']))


=== Statistical baseline ===
              precision    recall  f1-score   support

          AI       1.00      1.00      1.00       698
       Human       1.00      1.00      1.00       447

    accuracy                           1.00      1145
   macro avg       1.00      1.00      1.00      1145
weighted avg       1.00      1.00      1.00      1145



Build and Train the Model | Deep-learning model

In [26]:
# import tensorflow as tf
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# # hyper‐parameters
# MAX_WORDS = 10000
# MAX_LEN   = 300
# EMB_DIM   = 128

# # split as before
# X_train_txt, X_test_txt, y_train, y_test = train_test_split(
#     df['essay'].astype(str), df['label'], 
#     test_size=0.2, random_state=42, stratify=df['label']
# )

# # tokenize
# tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
# tokenizer.fit_on_texts(X_train_txt)
# train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_txt),
#                           maxlen=MAX_LEN, padding='post', truncating='post')
# test_seq  = pad_sequences(tokenizer.texts_to_sequences(X_test_txt),
#                           maxlen=MAX_LEN, padding='post', truncating='post')

# # build model
# dl_model = Sequential([
#     Embedding(input_dim=MAX_WORDS, output_dim=EMB_DIM, input_length=MAX_LEN),
#     Bidirectional(LSTM(64, return_sequences=False)),
#     Dropout(0.5),
#     Dense(32, activation='relu'),
#     Dropout(0.5),
#     Dense(1, activation='sigmoid')
# ])

# dl_model.compile(
#     loss='binary_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
# )

# # train
# history = dl_model.fit(
#     train_seq, y_train,
#     validation_data=(test_seq, y_test),
#     epochs=5, batch_size=32
# )

# # evaluate
# print("\n=== Deep-learning model ===")
# dl_model.evaluate(test_seq, y_test, verbose=2)

# # classification report
# y_prob = dl_model.predict(test_seq).ravel()
# y_pred = (y_prob > 0.5).astype(int)
# print(classification_report(y_test, y_pred, target_names=['AI','Human']))


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import joblib


MAX_WORDS = 10000
MAX_LEN   = 300
EMB_DIM   = 128

# build your TF‐IDF + logistic‐regression “statistical” model if you haven’t already:
tfidf = TfidfVectorizer(max_features=MAX_WORDS, ngram_range=(1,2))
X_stat = tfidf.fit_transform(df['essay'])
stat_model = LogisticRegression(max_iter=1000, class_weight='balanced')
stat_model.fit(X_stat, df['label'])

# split for the deep‐learning model
X_train_txt, X_test_txt, y_train, y_test = train_test_split(
    df['essay'].astype(str), df['label'], 
    test_size=0.2, random_state=42, stratify=df['label']
)

# tokenize for DL
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_txt)
train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_txt),
                          maxlen=MAX_LEN, padding='post', truncating='post')
test_seq  = pad_sequences(tokenizer.texts_to_sequences(X_test_txt),
                          maxlen=MAX_LEN, padding='post', truncating='post')

# build your DL model
dl_model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=EMB_DIM, input_length=MAX_LEN),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
dl_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train
history = dl_model.fit(
    train_seq, y_train,
    validation_data=(test_seq, y_test),
    epochs=5, batch_size=32
)

# evaluate & report
print("\n=== Deep-learning model ===")
dl_model.evaluate(test_seq, y_test, verbose=2)
y_prob = dl_model.predict(test_seq).ravel()
y_pred = (y_prob > 0.5).astype(int)
print(classification_report(y_test, y_pred, target_names=['AI','Human']))

# ————— now save everything —————
# 1) deep-learning model in native Keras format
dl_model.save('dl_model.keras')

# 2) tokenizer (so you can recreate sequences later)
joblib.dump(tokenizer,      'tokenizer.joblib')

# 3) TF-IDF vectorizer + statistical model
joblib.dump(tfidf,          'tfidf_vectorizer.joblib')
joblib.dump(stat_model,     'stat_model.joblib')

print("✅ Saved: dl_model.keras, tokenizer.joblib, tfidf_vectorizer.joblib, stat_model.joblib")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

=== Deep-learning model ===
36/36 - 1s - loss: 0.0030 - accuracy: 0.9983 - 1s/epoch - 34ms/step
              precision    recall  f1-score   support

          AI       1.00      1.00      1.00       698
       Human       1.00      1.00      1.00       447

    accuracy                           1.00      1145
   macro avg       1.00      1.00      1.00      1145
weighted avg       1.00      1.00      1.00      1145

✅ Saved: dl_model.keras, tokenizer.joblib, tfidf_vectorizer.joblib, stat_model.joblib


Simple Inference on New Essays

In [27]:


from tensorflow.keras.preprocessing.sequence import pad_sequences

def classify_essay(text):
    stat_p_human = stat_model.predict_proba(tfidf.transform([text]))[0,1]
    seq         = tokenizer.texts_to_sequences([text])
    padded      = pad_sequences(seq, maxlen=MAX_LEN, padding='post', truncating='post')
    dl_p_human  = dl_model.predict(padded, verbose=0)[0,0]
    p_human     = (stat_p_human + dl_p_human) / 2

    if p_human > 0.5:
        return "Human", p_human
    else:
        return "AI", 1 - p_human

for sample in [
    '''
    9th January 2025
The Evolution of Citizenship
Citizenship represents a person's belonging, although its meaning has changed throughout history. Cooper describes citizenship as the intersection of equality and rights, which relates to integration ethics, differentiation ethics, and fragmentation ethics. Citizenship evolved as people fled war zones or sought better living conditions. This derives from ancient Greek and Roman civilizations, which developed within empires such as Rome rather than modern nation-states. Governments offered citizenship to provide rights and identification, allowing people to live freely and without fear of discrimination. 
Despite its promise, citizenship faces challenges in many countries. For example, dual citizenship raises questions of loyalty as citizens are expected to align with one nation’s cultural and political goals. The understanding of citizenship has varied among cultures as well. In medieval Europe, citizenship was linked to cities rather than nations, but monarchies and empires gradually redefined it. The British Empire, for instance, treated immigrants as part of the kingdom; however, rights were unequally distributed, and colonized people faced exclusion despite fighting for equality using the empire's ideals. The rise of capitalism further complicated citizenship by linking it to economic roles like property ownership or labor. Although poor workers were nominally considered equal citizens, systemic inequities persisted, sparking debates about fairness. Furthermore, movies show the challenges immigrants face due to citizenship, including deportation, job insecurity, language barriers, and uncertain futures. Religion has also influenced citizenship, often leading to violence and separating populations like the massacres aimed to remove entire Jewish communities. The Rwandan genocide, in the movie, shows how even diplomats, despite being from the same country, were stripped of their citizenship and targeted for extermination due to their perceived disloyalty and the actions of their fellow countrymen.
In conclusion, citizenship is a dynamic and diverse concept that has evolved in response to historical, cultural, and political circumstances.
    '''
]:
    label, conf = classify_essay(sample)
    print(len(sample))
    print(f"{label:>6}   ({conf:.2%} confidence)")


2233
 Human   (66.81% confidence)
