In [None]:
#######################
## Interactive model ##
#######################

from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Mount Google Drive so the dataset stored under "My Drive" is reachable.
drive.mount('/content/drive')

# Dataset Path
file_path = "/content/drive/My Drive/Dataset for AI Generated Text Detection/Dataset_for_AI_Generated_Text_Detection.csv"

# Load dataset.
# pandas reads empty CSV cells as NaN (a float), while the text pipeline
# expects every document to be a string, so rows with a missing text or
# label are dropped and the remaining texts are coerced to str.
data = pd.read_csv(file_path).dropna(subset=['text', 'label'])
texts = data['text'].astype(str).values
labels = data['label'].values  # 1 = AI-generated, 0 = non-AI-generated

# Train-test split: 80% train / 20% test, seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Define H-score function
def compute_h_score(texts1, texts2):
    """Return the Hellinger distance between the word distributions of two text sets.

    Each collection is lower-cased, split on whitespace and reduced to a
    relative word-frequency distribution. The score is
    ``sqrt(sum((sqrt(p) - sqrt(q)) ** 2)) / sqrt(2)`` taken over the union of
    both vocabularies: 0.0 for identical distributions, 1.0 for distributions
    that share no word.

    Args:
        texts1: Iterable of strings forming the first collection.
        texts2: Iterable of strings forming the second collection.
            In both collections, entries that are not strings (for example
            the NaN pandas produces for an empty CSV cell, or None) are
            ignored instead of raising.

    Returns:
        The distance as a float. When neither collection contains a word the
        result is 0.0; when exactly one of them does, the result is
        ``1 / sqrt(2)`` because the empty side contributes an all-zero vector.
    """
    def get_word_freq(texts):
        word_count = Counter()
        for text in texts:
            # Skip missing / non-text entries rather than failing on .lower().
            if isinstance(text, str):
                word_count.update(text.lower().split())
        total = sum(word_count.values())
        # With no words at all the comprehension is empty, so no division by zero.
        return {word: count / total for word, count in word_count.items()}

    freq1 = get_word_freq(texts1)
    freq2 = get_word_freq(texts2)

    # Align both distributions on the union of the two vocabularies; a word
    # absent from one side has probability 0 there.
    vocabulary = freq1.keys() | freq2.keys()
    p = np.array([freq1.get(word, 0.0) for word in vocabulary])
    q = np.array([freq2.get(word, 0.0) for word in vocabulary])

    # Hellinger distance as H-score
    h_score = np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)
    return float(h_score)

# Reference corpora for the H-score method: training texts grouped by label.
ai_texts_train = X_train[y_train == 1]
non_ai_texts_train = X_train[y_train == 0]

# ---- Classical models on TF-IDF features ----
# Unigram + bigram TF-IDF capped at 5000 features. The vocabulary is fitted
# on the training split only and reused unchanged for the test split.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Linear-kernel SVM.
# NOTE(review): probability=True enables predict_proba, which the visible code
# never calls; scikit-learn documents that it slows fit() down — confirm it is
# needed elsewhere before keeping it.
svm = SVC(probability=True, kernel='linear').fit(X_train_tfidf, y_train)

# Random Forest with 100 trees, seeded for reproducibility.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_tfidf, y_train)

# ---- CNN on padded token-id sequences ----
# Keep the 5000 most frequent words and pad/truncate every text to 100 tokens;
# both numbers must match the Embedding layer below.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=100)

# Embedding -> 1D convolution -> global max pool -> dense head with a single
# sigmoid unit for the binary decision.
cnn_model = Sequential()
cnn_model.add(Embedding(5000, 128, input_length=100))
cnn_model.add(Conv1D(64, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# A fifth of the training data is held back by Keras as a validation set.
cnn_model.fit(
    X_train_pad,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
    verbose=1,
)

# Function to predict user input
def predict_text(user_input):
    """Classify one text with all four detectors and print each verdict.

    The text is scored by the H-score heuristic, the SVM, the Random Forest
    and the CNN, using the module-level objects fitted earlier in the file
    (``tfidf``, ``tokenizer``, ``svm``, ``rf``, ``cnn_model`` and the two
    training corpora). Results are written to stdout; nothing is returned.

    Args:
        user_input: The raw text to classify.
    """
    def verdict(is_ai):
        # Single place for the label wording shared by every detector.
        if is_ai:
            return "AI-generated"
        return "Non-AI-generated"

    # Encode the text with the same fitted vectorizer / tokenizer and the same
    # padding length that were used for the training data.
    input_tfidf = tfidf.transform([user_input])
    input_pad = pad_sequences(tokenizer.texts_to_sequences([user_input]), maxlen=100)

    # H-score: distance from the input to each training corpus. The input is
    # labelled AI only when it is closer to the AI corpus AND that distance
    # exceeds 0.4.
    # NOTE(review): the rationale for the 0.4 lower bound is not visible in
    # this file — confirm the intended threshold direction.
    h_score_ai = compute_h_score([user_input], ai_texts_train)
    h_score_non_ai = compute_h_score([user_input], non_ai_texts_train)
    h_score_result = verdict(0.4 < h_score_ai < h_score_non_ai)

    # Classical models predict the class label directly from TF-IDF features.
    svm_result = verdict(svm.predict(input_tfidf)[0] == 1)
    rf_result = verdict(rf.predict(input_tfidf)[0] == 1)

    # The CNN outputs a sigmoid probability; above 0.5 counts as AI-generated.
    cnn_prob = cnn_model.predict(input_pad)[0][0]
    cnn_result = verdict(cnn_prob > 0.5)

    report_lines = [
        "\nPrediction Results for Input Text:",
        f"H-score Method: {h_score_result} (H-score vs AI: {h_score_ai:.4f}, vs Non-AI: {h_score_non_ai:.4f})",
        f"SVM: {svm_result}",
        f"Random Forest: {rf_result}",
        f"CNN: {cnn_result} (Probability: {cnn_prob:.4f})",
    ]
    for line in report_lines:
        print(line)

# Interactive loop: classify each line the user types until they ask to stop.
while True:
    try:
        # Strip surrounding whitespace so that e.g. "quit " is still recognised.
        user_input = input("\nEnter text to analyze (or 'quit' to exit): ").strip()
    except EOFError:
        # stdin was closed (no more input available): leave the loop cleanly.
        break
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing to classify; ask again instead of scoring an empty string.
        continue
    predict_text(user_input)