In [None]:
%pip install --upgrade datasets
%pip install datasets pandas tensorflow

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
import json
import tensorflow as tf

# 1. Load and preprocess the dataset
# Fetch the dataset and materialise its 'train' split as a pandas DataFrame.
dataset = load_dataset("shahxeebhassan/human_vs_ai_sentences")
df = pd.DataFrame(dataset['train'])
texts, labels = df['text'].values, df['label'].values

# Word-level tokenizer limited by num_words; unseen words map to '<OOV>'.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad or cut every sequence at its end so each row is exactly max_length ids.
max_length = 100
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

# 2. Build and train the CNN model
# The sequence length is declared with an explicit Input layer: Embedding's
# `input_length` argument is deprecated in Keras 3, and an Input layer also
# builds the model up front so layer shapes are known before training.
model = Sequential()
model.add(tf.keras.Input(shape=(max_length,)))
# Vocabulary size is read from the tokenizer so the two cannot drift apart.
model.add(Embedding(tokenizer.num_words, 128))
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
# Single sigmoid unit: the output is a score in [0, 1] for the positive label.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported val_accuracy is not an independent estimate if it is used for tuning.
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_test, y_test))

# 3. Save the model and tokenizer
model.save('ai_detection_model.keras')
print("Model saved successfully!")

# to_json() already returns a JSON string; it is JSON-encoded once more on the
# way to disk, so the file contains a single JSON string literal.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', mode='w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)
print("Tokenizer saved successfully!")

# 4. Load the model and tokenizer for prediction
loaded_model = tf.keras.models.load_model('ai_detection_model.keras')

# Read with the encoding the file was written in (UTF-8, ensure_ascii=False);
# the platform default encoding can fail on or garble non-ASCII vocabulary.
with open('tokenizer.json', encoding='utf-8') as f:
    data = json.load(f)

# The file holds a JSON-encoded JSON string, so `data` is still a str here,
# which is the form tokenizer_from_json expects.
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

# 5. Prediction function
def predict_ai_generated(text, *, threshold=0.5):
    """Classify a single piece of text as AI-generated or human-written.

    The text is tokenized with the module-level ``tokenizer``, padded or
    truncated at the end to ``max_length`` ids, and scored by ``loaded_model``.

    Args:
        text: The text to classify.
        threshold: Score above which the text is reported as AI-generated.
            Must lie in [0, 1]; defaults to 0.5.

    Returns:
        ``"AI-generated"`` if the model score is strictly greater than
        ``threshold``, otherwise ``"Human-written"``.

    Raises:
        ValueError: If ``threshold`` is outside [0, 1].
    """
    # NOTE(review): assumes label 1 in the training data means "AI-generated";
    # confirm against the dataset's label encoding.
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
    # verbose=0 suppresses the progress output predict() prints on every call.
    prediction = float(loaded_model.predict(padded_sequence, verbose=0)[0][0])
    return "AI-generated" if prediction > threshold else "Human-written"