In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
import json

# 1. Load the dataset
# Fetch the dataset and move its 'train' split into a DataFrame so the text and
# label columns can be pulled out as arrays.
dataset = load_dataset("shahxeebhassan/human_vs_ai_sentences")
df = pd.DataFrame(dataset['train'])
texts, labels = df['text'].values, df['label'].values

# 2. Preprocess the data
# Word-level tokenizer capped at num_words=5000; '<OOV>' stands in for any word
# outside that vocabulary.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Every sequence is truncated or padded at its end to exactly max_length tokens.
max_length = 100
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

# 3. Build the LSTM model
# Embedding (input_dim matches the tokenizer's num_words) -> LSTM -> a single
# sigmoid unit for the binary label.
model = Sequential([
    Embedding(5000, 128, input_length=max_length),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

# 4. Compile and train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE: the held-out split is also passed as validation data, so the validation
# metrics reported during training are computed on the test portion.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

# 5. Save the model and tokenizer
model.save('ai_detection_model_lstm.keras')
print("Model saved successfully!")

# to_json() already returns a JSON string; it is JSON-encoded once more here, so
# the file holds a JSON string literal. The loader in step 6 relies on this: it
# calls json.load() first and hands the resulting string to tokenizer_from_json().
tokenizer_json = tokenizer.to_json()
with open('tokenizer_lstm.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)
print("Tokenizer saved successfully!")

# 6. Load the model and tokenizer for prediction
import tensorflow as tf
import json

# Restore the trained network from the file written in step 5.
loaded_model = tf.keras.models.load_model('ai_detection_model_lstm.keras')

# The tokenizer file is written as UTF-8 with ensure_ascii=False, so it has to be
# read back as UTF-8 explicitly: open()'s default encoding depends on the
# platform locale and could mis-decode or reject non-ASCII vocabulary entries.
with open('tokenizer_lstm.json', encoding='utf-8') as f:
    data = json.load(f)

# json.load() yields the JSON string originally produced by tokenizer.to_json(),
# which is the input tokenizer_from_json() expects. This rebinds the
# module-level `tokenizer` to the restored instance.
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

def predict_ai_generated(
    text: str,
    *,
    min_words: int = 50,
    max_length: int = 100,
    threshold: float = 0.5,
) -> str:
    """Classify *text* as AI-generated or human-written using the loaded model.

    Relies on the module-level ``tokenizer``, ``loaded_model`` and
    ``pad_sequences`` being available when a prediction is actually made.

    Args:
        text: The passage to classify.
        min_words: Minimum number of whitespace-separated words required
            before a prediction is attempted.
        max_length: Length the token sequence is padded/truncated to. The
            default matches the value used when preparing the training data;
            change it only if the model was trained with a different length.
        threshold: Model output strictly above this value is reported as
            "AI-generated". Assumes label 1 in the training data denotes
            AI-generated text -- TODO confirm against the dataset card.

    Returns:
        "AI-generated" or "Human-written", or a "Text too short..." message
        when *text* has fewer than ``min_words`` words (no prediction is run
        in that case).
    """
    # Guard clause: skip the model entirely for inputs below the word minimum.
    if len(text.split()) < min_words:
        return f"Text too short. Please enter at least {min_words} words."

    # Same preprocessing as at training time: tokenize, then pad/truncate at
    # the end of the sequence.
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(
        sequence, maxlen=max_length, padding='post', truncating='post'
    )

    # One input and one sigmoid output unit, so [0][0] is the single score.
    prediction = loaded_model.predict(padded_sequence)[0][0]
    return "AI-generated" if prediction > threshold else "Human-written"