In [None]:
# RST x H-score METHOD

# Define H-score function
def compute_h_score(texts1, texts2):
    """Return the Hellinger distance between the word distributions of two corpora.

    Each corpus is lower-cased, split on whitespace and reduced to relative
    word frequencies. For two non-empty corpora the score is 0.0 when the
    distributions are identical and 1.0 when they share no words. An empty
    corpus contributes an all-zero frequency vector.

    Args:
        texts1: Iterable of strings; a bare string is treated as one document.
        texts2: Iterable of strings; a bare string is treated as one document.

    Returns:
        The Hellinger distance as a NumPy float.
    """
    # Local import: the file's top-level import block is not visible from here.
    from collections import Counter

    def get_word_freq(texts):
        """Map each lower-cased whitespace token to its relative frequency."""
        if isinstance(texts, str):
            # Iterating a bare string would count characters, not words.
            texts = [texts]
        counts = Counter()
        for text in texts:
            counts.update(text.lower().split())
        total = sum(counts.values())
        # An empty corpus yields an empty dict, so no division by zero occurs.
        return {word: count / total for word, count in counts.items()}

    freq1 = get_word_freq(texts1)
    freq2 = get_word_freq(texts2)

    # Union of both vocabularies, sorted: set iteration order of strings
    # changes with hash randomisation, and a fixed order keeps the
    # floating-point summation below reproducible from run to run.
    vocabulary = sorted(freq1.keys() | freq2.keys())
    p = np.array([freq1.get(w, 0.0) for w in vocabulary], dtype=float)
    q = np.array([freq2.get(w, 0.0) for w in vocabulary], dtype=float)

    # Hellinger distance: Euclidean distance of the square-root vectors / sqrt(2).
    h_score = np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)
    return h_score

# Split training data into AI and non-AI texts
# NOTE(review): assumes X_train / y_train support boolean-mask indexing
# (e.g. NumPy arrays or pandas Series) and that label 1 marks AI-written
# text and label 0 everything else — confirm against the data-loading code.
ai_texts_train = X_train[y_train == 1]
non_ai_texts_train = X_train[y_train == 0]

# Compute H-score between AI and non-AI texts
# This is a single corpus-level score printed for inspection only; the two
# reference corpora built above are what the per-text classifier consumes.
h_score_ai_vs_non_ai = compute_h_score(ai_texts_train, non_ai_texts_train)
print(f"\nH-score between AI and non-AI texts (training set): {h_score_ai_vs_non_ai:.4f}")

# Function to classify test texts using H-score threshold
def classify_with_h_score(test_texts, ai_reference_texts, non_ai_reference_texts):
    """Label each test text by the reference corpus it is closer to in H-score.

    For every text the H-score against the AI corpus and against the non-AI
    corpus is computed. The raw confidence is ``h_non_ai - h_ai``; a positive
    value means the text is closer to the AI corpus and is labelled 1.

    Args:
        test_texts: Iterable of strings to classify.
        ai_reference_texts: Reference corpus of AI texts.
        non_ai_reference_texts: Reference corpus of non-AI texts.

    Returns:
        A ``(predictions, confidence_scores)`` tuple of NumPy arrays with one
        entry per test text. ``predictions`` holds 0/1 labels.
        ``confidence_scores`` is the raw confidence min-max scaled to [0, 1]
        over this batch, so it is a ranking score relative to the other test
        texts, not a calibrated probability. If every text has the same
        confidence the scores are all 0.5; for empty input both arrays are
        empty.
    """
    # NOTE: every call hands the whole reference corpus to compute_h_score,
    # so the reference side is re-processed once per test text.
    score_pairs = [
        (
            compute_h_score([text], ai_reference_texts),
            compute_h_score([text], non_ai_reference_texts),
        )
        for text in test_texts
    ]
    # reshape keeps the (n, 2) layout even when there are no test texts.
    scores = np.array(score_pairs, dtype=float).reshape(-1, 2)

    # Positive difference: smaller distance to the AI corpus, i.e. more likely AI.
    confidence = scores[:, 1] - scores[:, 0]
    predictions = (confidence > 0).astype(int)

    if confidence.size == 0:
        # min()/max() are undefined on an empty array.
        return predictions, confidence

    lowest = confidence.min()
    spread = confidence.max() - lowest
    if spread == 0:
        # All confidences equal: min-max scaling would be 0/0 -> NaN.
        # Return an uninformative constant score instead.
        return predictions, np.full_like(confidence, 0.5)

    # Normalize confidence scores to [0, 1] range for ROC curve
    confidence_scores = (confidence - lowest) / spread
    return predictions, confidence_scores

# Classify test set using H-score
# classify_with_h_score returns hard 0/1 labels plus min-max normalised
# confidence scores; despite the name, h_score_probs are ranking scores,
# not calibrated probabilities.
h_score_preds, h_score_probs = classify_with_h_score(X_test, ai_texts_train, non_ai_texts_train)
# The labels come from the sign of the H-score difference (non-AI minus AI),
# so the header states that rule rather than a 0.4 cutoff that is never applied.
print("\nH-score Classification Report (AI if H-score difference > 0):")
print(classification_report(y_test, h_score_preds))
print("H-score Confusion Matrix:")
print(confusion_matrix(y_test, h_score_preds))

# Plot ROC curve for H-score
# NOTE(review): plot_roc_curve is defined outside this cell; presumably it
# takes (true labels, scores, display name) — confirm against its definition.
plot_roc_curve(y_test, h_score_probs, "H-score")