In [1]:
# ============================================================================
# CELL 1: INSTALLATIONS
# ============================================================================
print("Installing required packages...")

!pip install -q xgboost scikit-learn

print("✓ All packages installed successfully!")

Installing required packages...
✓ All packages installed successfully!


In [2]:
# ============================================================================
# CELL 2: IMPORTS
# ============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import (
    StratifiedKFold, cross_val_score, train_test_split
)
from sklearn.metrics import (
    f1_score, accuracy_score, roc_auc_score,
    precision_score, recall_score, classification_report,
    confusion_matrix, roc_curve
)
import time
import pickle
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ All imports successful!")
print(f"XGBoost version: {xgb.__version__}")
print(f"Running on: CPU (optimal for tree-based models)")

✓ All imports successful!
XGBoost version: 3.0.5
Running on: CPU (optimal for tree-based models)


In [3]:
# ============================================================================
# CELL 3: DEFINE FEATURE SET
# ============================================================================

expected_features = [
    'trigram_diversity', 'yules_k', 'token_burstiness', 'char_trigram_entropy',
    'avg_tree_depth', 'max_tree_depth', 'avg_dependency_distance',
    'uppercase_ratio', 'whitespace_ratio', 'unique_char_count',
    'comma_ratio', 'period_ratio', 'question_ratio', 'exclamation_ratio',
    'semicolon_ratio', 'colon_ratio', 'quote_ratio',
    'sentiment_polarity', 'sentiment_subjectivity', 'sentiment_polarity_variance',
    'neutral_sentence_ratio', 'positive_word_ratio', 'negative_word_ratio',
    'pos_ratio_DET', 'pos_ratio_ADP', 'pos_ratio_AUX', 'pos_ratio_CCONJ',
    'pos_ratio_PART', 'pos_ratio_NUM', 'pos_row_entropy_weighted',
    'function_to_content_rate', 'noun_verb_alternation_rate', 'content_function_ratio',
    'noun_verb_ratio', 'adj_adv_ratio', 'verbs_per_100_tok', 'nouns_per_100_tok',
    'adj_per_100_tok', 'adv_per_100_tok', 'pron_per_100_tok', 'punct_per_100_tok',
    'tokens_per_sentence_mean', 'mean_nouns_per_sent', 'mean_verbs_per_sent',
    'mean_adjs_per_sent', 'mean_advs_per_sent', 'prop_sents_with_verb',
    'unique_upos_per_sent_mean', 'max_runlen_NOUN', 'max_runlen_PUNCT',
    'avg_sentence_length', 'sentence_length_std', 'n_sentences_doc'
]

print(f"✓ Feature set defined: {len(expected_features)} features")

✓ Feature set defined: 53 features


In [4]:
# ============================================================================
# CELL 4: LOAD DATA
# ============================================================================

# Adjust this to your actual data loading method
df = pd.read_csv(r'/content/raid_sample_large_PostPOS_CLEAN.csv')

print(f"Dataset shape: {df.shape}")
print(f"Class distribution:\n{df['is_ai'].value_counts()}")

# Prepare data
X = df[expected_features].values
y = df['is_ai'].astype(int).values

print(f"\nData prepared:")
print(f"  Features: {X.shape}")
print(f"  Labels: {y.shape}")
print(f"  Class distribution: AI={y.sum()}, Human={len(y)-y.sum()}")

# Create train/test split
print(f"\nCreating 80/20 train/test split...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"  Training set: {X_train.shape[0]} samples")
print(f"  Test set: {X_test.shape[0]} samples")
print(f"\n✓ Data ready for optimization")

Dataset shape: (60000, 86)
Class distribution:
is_ai
True     30000
False    30000
Name: count, dtype: int64

Data prepared:
  Features: (60000, 53)
  Labels: (60000,)
  Class distribution: AI=30000, Human=30000

Creating 80/20 train/test split...
  Training set: 48000 samples
  Test set: 12000 samples

✓ Data ready for optimization


In [5]:
# ============================================================================
# CELL 5: SETUP CHECKPOINT SYSTEM
# ============================================================================

# Create checkpoint directory
checkpoint_dir = Path('xgb_optimization')
checkpoint_dir.mkdir(exist_ok=True)

print(f"✓ Checkpoint directory created: {checkpoint_dir}/")
print("  All progress will be saved automatically")
print("  You can safely resume if interrupted\n")

# Global timer
notebook_start_time = time.time()

✓ Checkpoint directory created: xgb_optimization/
  All progress will be saved automatically
  You can safely resume if interrupted



In [6]:
# ============================================================================
# CELL 6: COMPREHENSIVE XGBOOST SEARCH (ALL ON FULL TRAINING SET)
# ============================================================================

print("="*70)
print("COMPREHENSIVE XGBOOST HYPERPARAMETER SEARCH")
print("="*70)
print("Strategy: XGBoost is fast enough to test 30+ configs on full training set")
print("Expected total time: 10-15 minutes\n")

checkpoint_dir = Path('xgb_comprehensive_search')
checkpoint_dir.mkdir(exist_ok=True)

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# COMPREHENSIVE GRID: 30 strategic configurations
configs = [
    # ===== BASELINE VARIATIONS =====
    {'name': 'baseline_100', 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'baseline_300', 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'baseline_500', 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    # ===== TREE COUNT × LEARNING RATE COMBINATIONS =====
    {'name': 'many_trees_low_lr', 'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.05,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'very_many_low_lr', 'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.03,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'few_high_lr', 'n_estimators': 50, 'max_depth': 6, 'learning_rate': 0.3,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    # ===== DEPTH VARIATIONS =====
    {'name': 'very_shallow', 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'shallow', 'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'medium_deep', 'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'deep', 'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'very_deep', 'n_estimators': 300, 'max_depth': 12, 'learning_rate': 0.05,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    # ===== SUBSAMPLING VARIATIONS =====
    {'name': 'low_subsample', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.6, 'colsample_bytree': 0.6, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'med_subsample', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'high_subsample', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.95, 'colsample_bytree': 0.95, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'full_sample', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 1.0, 'colsample_bytree': 1.0, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    # ===== REGULARIZATION VARIATIONS =====
    {'name': 'l1_light', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0.3, 'reg_lambda': 1},

    {'name': 'l1_strong', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 1.0, 'reg_lambda': 1},

    {'name': 'l2_strong', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 3.0},

    {'name': 'elastic_net', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0.5, 'reg_lambda': 2.0},

    # ===== MIN CHILD WEIGHT VARIATIONS =====
    {'name': 'min_child_3', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'min_child_5', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 5, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'min_child_10', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 10, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 1},

    # ===== GAMMA VARIATIONS =====
    {'name': 'gamma_01', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0.1,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'gamma_03', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0.3,
     'reg_alpha': 0, 'reg_lambda': 1},

    {'name': 'gamma_05', 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1,
     'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 1, 'gamma': 0.5,
     'reg_alpha': 0, 'reg_lambda': 1},

    # ===== BALANCED COMBINATIONS =====
    {'name': 'aggressive', 'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.15,
     'subsample': 0.9, 'colsample_bytree': 0.9, 'min_child_weight': 1, 'gamma': 0,
     'reg_alpha': 0, 'reg_lambda': 0.5},

    {'name': 'conservative', 'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.05,
     'subsample': 0.7, 'colsample_bytree': 0.7, 'min_child_weight': 5, 'gamma': 0.2,
     'reg_alpha': 0.3, 'reg_lambda': 2.0},

    {'name': 'balanced_fast', 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.15,
     'subsample': 0.85, 'colsample_bytree': 0.85, 'min_child_weight': 2, 'gamma': 0.05,
     'reg_alpha': 0.1, 'reg_lambda': 1.0},

    {'name': 'balanced_accurate', 'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.08,
     'subsample': 0.85, 'colsample_bytree': 0.85, 'min_child_weight': 2, 'gamma': 0.1,
     'reg_alpha': 0.1, 'reg_lambda': 1.2},

    {'name': 'optimal_candidate', 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.08,
     'subsample': 0.85, 'colsample_bytree': 0.85, 'min_child_weight': 1, 'gamma': 0.05,
     'reg_alpha': 0.05, 'reg_lambda': 1.0},
]

print(f"Testing {len(configs)} configurations")
print(f"Estimated time: {len(configs) * 0.34:.1f} minutes\n")

# Checkpoint system
checkpoint_file = checkpoint_dir / 'all_results.pkl'

if checkpoint_file.exists():
    with open(checkpoint_file, 'rb') as f:
        results = pickle.load(f)
    print(f" Loaded {len(results)} completed configurations\n")
else:
    results = []

start_time = time.time()

# Test all configurations
for idx, config in enumerate(configs, 1):
    name = config.pop('name')

    # Skip if already done
    if any(r['name'] == name for r in results):
        print(f"[{idx}/{len(configs)}] ⏭  '{name}' already completed")
        continue

    print(f"[{idx}/{len(configs)}] Testing: {name}")

    model = XGBClassifier(
        random_state=42, n_jobs=-1,
        eval_metric='logloss', verbosity=0,
        **config
    )

    start = time.time()
    scores = cross_val_score(
        model, X_train, y_train,
        cv=cv, scoring='f1',
        n_jobs=-1, verbose=0
    )
    elapsed = time.time() - start

    result = {
        'name': name,
        'params': config,
        'f1_mean': scores.mean(),
        'f1_std': scores.std(),
        'f1_scores': scores.tolist(),
        'time_seconds': elapsed
    }

    results.append(result)

    print(f"  ✓ F1: {scores.mean():.4f} ± {scores.std():.4f}  ({elapsed:.0f}s)")

    # Save checkpoint
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(results, f)

    # Show progress
    completed = len(results)
    avg_time = (time.time() - start_time) / completed
    remaining = (len(configs) - completed) * avg_time
    if remaining > 0:
        print(f"  ⏱  Est. {remaining/60:.1f} min remaining")

total_time = time.time() - start_time

# Display results
df_results = pd.DataFrame(results).sort_values('f1_mean', ascending=False)

print("\n" + "="*70)
print("COMPREHENSIVE SEARCH COMPLETE")
print("="*70)
print(f"Total time: {total_time/60:.1f} minutes")
print(f"Configs tested: {len(results)}\n")

print("TOP 10 CONFIGURATIONS:")
print(df_results[['name', 'f1_mean', 'f1_std']].head(10).to_string(index=False))

best = df_results.iloc[0]
print(f"\n WINNER: {best['name']}")
print(f"   F1 Score: {best['f1_mean']:.4f} ± {best['f1_std']:.4f}")
print(f"   Parameters: {best['params']}")

# Save full results
df_results.to_csv('xgboost_comprehensive_results.csv', index=False)
print(f"\n✓ Full results saved to 'xgboost_comprehensive_results.csv'")

COMPREHENSIVE XGBOOST HYPERPARAMETER SEARCH
Strategy: XGBoost is fast enough to test 30+ configs on full training set
Expected total time: 10-15 minutes

Testing 30 configurations
Estimated time: 10.2 minutes

[1/30] Testing: baseline_100
  ✓ F1: 0.8818 ± 0.0015  (13s)
  ⏱  Est. 6.5 min remaining
[2/30] Testing: baseline_300
  ✓ F1: 0.9056 ± 0.0018  (17s)
  ⏱  Est. 7.1 min remaining
[3/30] Testing: baseline_500
  ✓ F1: 0.9123 ± 0.0005  (28s)
  ⏱  Est. 8.8 min remaining
[4/30] Testing: many_trees_low_lr
  ✓ F1: 0.9119 ± 0.0010  (44s)
  ⏱  Est. 11.1 min remaining
[5/30] Testing: very_many_low_lr
  ✓ F1: 0.9077 ± 0.0006  (53s)
  ⏱  Est. 13.0 min remaining
[6/30] Testing: few_high_lr
  ✓ F1: 0.8838 ± 0.0016  (3s)
  ⏱  Est. 10.6 min remaining
[7/30] Testing: very_shallow
  ✓ F1: 0.8777 ± 0.0016  (10s)
  ⏱  Est. 9.2 min remaining
[8/30] Testing: shallow
  ✓ F1: 0.8947 ± 0.0007  (13s)
  ⏱  Est. 8.4 min remaining
[9/30] Testing: medium_deep
  ✓ F1: 0.9130 ± 0.0026  (31s)
  ⏱  Est. 8.3 min rema

In [12]:
# ============================================================================
# CELL 7: FINAL VALIDATION WITH 5-FOLD CV
# ============================================================================

print("\n" + "="*70)
print("FINAL VALIDATION: TOP 3 CONFIGS WITH 5-FOLD CV")
print("="*70)
print("Testing top 3 configurations with more robust 5-fold CV\n")

# Get top 3 from comprehensive search
top3 = df_results.head(3)

cv_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

final_validation_results = []

for idx, row in top3.iterrows():
    name = row['name']
    params = row['params']

    print(f"\n{'─'*70}")
    print(f"Validating: {name}")
    print(f"{'─'*70}")
    print(f"3-fold F1: {row['f1_mean']:.4f} ± {row['f1_std']:.4f}")

    model = XGBClassifier(
        random_state=42, n_jobs=-1,
        eval_metric='logloss', verbosity=0,
        **params
    )

    start = time.time()

    # Get comprehensive metrics with 5-fold CV
    scores_f1 = cross_val_score(model, X_train, y_train, cv=cv_final,
                                scoring='f1', n_jobs=-1, verbose=0)
    scores_acc = cross_val_score(model, X_train, y_train, cv=cv_final,
                                 scoring='accuracy', n_jobs=-1, verbose=0)
    scores_auc = cross_val_score(model, X_train, y_train, cv=cv_final,
                                 scoring='roc_auc', n_jobs=-1, verbose=0)
    scores_prec = cross_val_score(model, X_train, y_train, cv=cv_final,
                                  scoring='precision', n_jobs=-1, verbose=0)
    scores_rec = cross_val_score(model, X_train, y_train, cv=cv_final,
                                 scoring='recall', n_jobs=-1, verbose=0)

    elapsed = time.time() - start

    result = {
        'name': name,
        'params': params,
        'f1_mean': scores_f1.mean(),
        'f1_std': scores_f1.std(),
        'accuracy_mean': scores_acc.mean(),
        'accuracy_std': scores_acc.std(),
        'roc_auc_mean': scores_auc.mean(),
        'roc_auc_std': scores_auc.std(),
        'precision_mean': scores_prec.mean(),
        'precision_std': scores_prec.std(),
        'recall_mean': scores_rec.mean(),
        'recall_std': scores_rec.std(),
        'time_seconds': elapsed
    }

    final_validation_results.append(result)

    print(f"\n5-fold CV Results:")
    print(f"  F1 Score:  {scores_f1.mean():.4f} ± {scores_f1.std():.4f}")
    print(f"  Accuracy:  {scores_acc.mean():.4f} ± {scores_acc.std():.4f}")
    print(f"  ROC-AUC:   {scores_auc.mean():.4f} ± {scores_auc.std():.4f}")
    print(f"  Precision: {scores_prec.mean():.4f} ± {scores_prec.std():.4f}")
    print(f"  Recall:    {scores_rec.mean():.4f} ± {scores_rec.std():.4f}")
    print(f"  Time: {elapsed:.1f}s")

# Determine final winner
final_df = pd.DataFrame(final_validation_results).sort_values('f1_mean', ascending=False)
winner = final_df.iloc[0]

print("\n" + "="*70)
print("FINAL VALIDATION COMPLETE")
print("="*70)

print("\nRankings (5-fold CV):")
for idx, row in final_df.iterrows():
    print(f"  {row['name']:<20}: F1={row['f1_mean']:.4f} ± {row['f1_std']:.4f}")

print(f"\n FINAL WINNER: {winner['name']}")
print(f"   F1 Score: {winner['f1_mean']:.4f} ± {winner['f1_std']:.4f}")
print(f"   Accuracy: {winner['accuracy_mean']:.4f} ± {winner['accuracy_std']:.4f}")
print(f"   ROC-AUC:  {winner['roc_auc_mean']:.4f} ± {winner['roc_auc_std']:.4f}")

# Save final validation results
final_df.to_csv('xgboost_final_validation.csv', index=False)
print(f"\n✓ Final validation results saved")


FINAL VALIDATION: TOP 3 CONFIGS WITH 5-FOLD CV
Testing top 3 configurations with more robust 5-fold CV


──────────────────────────────────────────────────────────────────────
Validating: aggressive
──────────────────────────────────────────────────────────────────────
3-fold F1: 0.9170 ± 0.0012

5-fold CV Results:
  F1 Score:  0.9206 ± 0.0030
  Accuracy:  0.9221 ± 0.0029
  ROC-AUC:   0.9787 ± 0.0010
  Precision: 0.9390 ± 0.0031
  Recall:    0.9028 ± 0.0035
  Time: 482.9s

──────────────────────────────────────────────────────────────────────
Validating: deep
──────────────────────────────────────────────────────────────────────
3-fold F1: 0.9143 ± 0.0021

5-fold CV Results:
  F1 Score:  0.9190 ± 0.0034
  Accuracy:  0.9206 ± 0.0034
  ROC-AUC:   0.9783 ± 0.0009
  Precision: 0.9382 ± 0.0033
  Recall:    0.9006 ± 0.0038
  Time: 443.7s

──────────────────────────────────────────────────────────────────────
Validating: optimal_candidate
─────────────────────────────────────────────────────

In [None]:
# ============================================================================
# CELL 8: TEST SET EVALUATION
# ============================================================================

print("\n" + "="*70)
print("TEST SET EVALUATION")
print("="*70)
print(f"Training final model: {winner['name']}")
print(f"Parameters: {winner['params']}\n")

# Train final model on full training set
final_model = XGBClassifier(
    random_state=42, n_jobs=-1,
    eval_metric='logloss', verbosity=0,
    **winner['params']
)

print(f"Training on {len(X_train)} samples...")
final_model.fit(X_train, y_train)

print(f"Evaluating on {len(X_test)} test samples...")
y_pred = final_model.predict(X_test)
y_proba = final_model.predict_proba(X_test)[:, 1]

# Calculate all metrics
test_results = {
    'accuracy': accuracy_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_proba),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred)
}

print("\n" + "="*70)
print("TEST SET RESULTS")
print("="*70)
print(f"  Accuracy:  {test_results['accuracy']:.4f}")
print(f"  F1 Score:  {test_results['f1']:.4f}")
print(f"  ROC-AUC:   {test_results['roc_auc']:.4f}")
print(f"  Precision: {test_results['precision']:.4f}")
print(f"  Recall:    {test_results['recall']:.4f}")

# Generalization check
cv_test_gap = abs(winner['f1_mean'] - test_results['f1'])
print(f"\n📊 Generalization Analysis:")
print(f"  CV F1:     {winner['f1_mean']:.4f}")
print(f"  Test F1:   {test_results['f1']:.4f}")
print(f"  Gap:       {cv_test_gap:.4f} ({cv_test_gap/winner['f1_mean']*100:.2f}%)")

if cv_test_gap < 0.01:
    print(f"  ✓ EXCELLENT: Model generalizes very well")
elif cv_test_gap < 0.02:
    print(f"  ✓ GOOD: Acceptable generalization")
else:
    print(f"  ⚠ WARNING: Significant generalization gap")

# Detailed classification report
print("\n" + "-"*70)
print("Classification Report:")
print("-"*70)
print(classification_report(y_test, y_pred,
                          target_names=['Human', 'AI'],
                          digits=4))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(f"                 Predicted")
print(f"                 Human      AI")
print(f"Actual Human   {cm[0,0]:6d}  {cm[0,1]:6d}")
print(f"       AI      {cm[1,0]:6d}  {cm[1,1]:6d}")

# Error analysis
human_as_ai = cm[0, 1]
ai_as_human = cm[1, 0]
total_human = cm[0].sum()
total_ai = cm[1].sum()

print(f"\n📈 Error Analysis:")
print(f"  False Positives (Human→AI): {human_as_ai:5d} ({human_as_ai/total_human*100:.2f}%)")
print(f"  False Negatives (AI→Human): {ai_as_human:5d} ({ai_as_human/total_ai*100:.2f}%)")
print(f"  Total errors: {human_as_ai + ai_as_human:5d} ({(human_as_ai + ai_as_human)/len(y_test)*100:.2f}%)")

# Save final model
final_model_path = checkpoint_dir / 'xgboost_optimized_final.pkl'
with open(final_model_path, 'wb') as f:
    pickle.dump(final_model, f)

print(f"\n✓ Final model saved to: {final_model_path}")

# Save test predictions for analysis
test_results_df = pd.DataFrame({
    'y_true': y_test,
    'y_pred': y_pred,
    'y_proba': y_proba
})
test_results_df.to_csv('test_predictions.csv', index=False)
print(f"✓ Test predictions saved to: test_predictions.csv")

In [None]:
# ============================================================================
# CELL 9: COMPREHENSIVE VISUALIZATION
# ============================================================================

print("\n" + "="*70)
print("GENERATING COMPREHENSIVE VISUALIZATIONS")
print("="*70)

fig = plt.figure(figsize=(20, 12))

# 1. Hyperparameter exploration heatmap (top 15 configs)
ax1 = plt.subplot(3, 4, 1)
top15 = df_results.head(15)
heatmap_data = []
for _, row in top15.iterrows():
    heatmap_data.append([
        row['params']['n_estimators'],
        row['params']['max_depth'],
        row['params']['learning_rate'] * 100,  # Scale for visibility
        row['params']['subsample'] * 10,  # Scale for visibility
    ])

im = ax1.imshow(heatmap_data, aspect='auto', cmap='RdYlGn')
ax1.set_yticks(range(len(top15)))
ax1.set_yticklabels(top15['name'], fontsize=8)
ax1.set_xticks(range(4))
ax1.set_xticklabels(['n_est', 'depth', 'LR×100', 'sub×10'], fontsize=9)
ax1.set_title('Top 15: Hyperparameter Patterns', fontweight='bold', fontsize=11)
plt.colorbar(im, ax=ax1)

# 2. F1 scores distribution
ax2 = plt.subplot(3, 4, 2)
ax2.hist(df_results['f1_mean'], bins=20, color='steelblue', alpha=0.7, edgecolor='black')
ax2.axvline(winner['f1_mean'], color='red', linestyle='--', linewidth=2,
           label=f"Winner: {winner['f1_mean']:.4f}")
ax2.set_xlabel('F1 Score', fontweight='bold')
ax2.set_ylabel('Count', fontweight='bold')
ax2.set_title('F1 Score Distribution (30 configs)', fontweight='bold', fontsize=11)
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

# 3. Top 10 configs comparison
ax3 = plt.subplot(3, 4, 3)
top10 = df_results.head(10)
bars = ax3.barh(range(10), top10['f1_mean'],
               xerr=top10['f1_std'], capsize=5,
               color=plt.cm.RdYlGn(np.linspace(0.5, 0.9, 10)), alpha=0.8)
ax3.set_yticks(range(10))
ax3.set_yticklabels(top10['name'], fontsize=9)
ax3.set_xlabel('F1 Score', fontweight='bold')
ax3.set_title('Top 10 Configurations', fontweight='bold', fontsize=11)
ax3.invert_yaxis()
ax3.grid(True, alpha=0.3, axis='x')
ax3.set_xlim([0.87, 0.92])

# Add value labels
for i, (bar, val) in enumerate(zip(bars, top10['f1_mean'])):
    ax3.text(val, i, f' {val:.4f}', va='center', fontsize=8, fontweight='bold')

# 4. CV vs Test comparison
ax4 = plt.subplot(3, 4, 4)
metrics = ['Accuracy', 'F1', 'ROC-AUC', 'Precision', 'Recall']
cv_vals = [winner['accuracy_mean'], winner['f1_mean'], winner['roc_auc_mean'],
           winner['precision_mean'], winner['recall_mean']]
test_vals = [test_results['accuracy'], test_results['f1'], test_results['roc_auc'],
            test_results['precision'], test_results['recall']]

x = np.arange(len(metrics))
width = 0.35

bars1 = ax4.bar(x - width/2, cv_vals, width, label='5-fold CV',
               alpha=0.8, color='steelblue')
bars2 = ax4.bar(x + width/2, test_vals, width, label='Test Set',
               alpha=0.8, color='darkblue')

ax4.set_ylabel('Score', fontweight='bold')
ax4.set_title('CV vs Test Set Performance', fontweight='bold', fontsize=11)
ax4.set_xticks(x)
ax4.set_xticklabels(metrics, rotation=45, ha='right', fontsize=9)
ax4.legend()
ax4.grid(True, alpha=0.3, axis='y')
ax4.set_ylim([0.85, 1.0])

# 5. ROC Curve
ax5 = plt.subplot(3, 4, 5)
fpr, tpr, _ = roc_curve(y_test, y_proba)
ax5.plot(fpr, tpr, linewidth=2.5, color='darkblue',
        label=f'XGBoost (AUC={test_results["roc_auc"]:.4f})')
ax5.plot([0, 1], [0, 1], 'k--', alpha=0.3, linewidth=1)
ax5.set_xlabel('False Positive Rate', fontweight='bold')
ax5.set_ylabel('True Positive Rate', fontweight='bold')
ax5.set_title('ROC Curve (Test Set)', fontweight='bold', fontsize=11)
ax5.legend(loc='lower right')
ax5.grid(True, alpha=0.3)

# 6. Confusion Matrix
ax6 = plt.subplot(3, 4, 6)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax6,
           xticklabels=['Human', 'AI'], yticklabels=['Human', 'AI'],
           cbar_kws={'label': 'Count'}, annot_kws={'fontsize': 12, 'fontweight': 'bold'})
ax6.set_ylabel('True Label', fontweight='bold')
ax6.set_xlabel('Predicted Label', fontweight='bold')
ax6.set_title('Confusion Matrix', fontweight='bold', fontsize=11)

# 7. Feature Importance (Top 20)
ax7 = plt.subplot(3, 4, 7)
importances = final_model.feature_importances_
indices = np.argsort(importances)[-20:]
top20_features = [expected_features[i] for i in indices]
top20_importance = importances[indices]

colors = plt.cm.viridis(np.linspace(0.3, 0.9, 20))
ax7.barh(range(20), top20_importance, color=colors, alpha=0.8)
ax7.set_yticks(range(20))
ax7.set_yticklabels(top20_features, fontsize=8)
ax7.set_xlabel('Importance', fontweight='bold')
ax7.set_title('Top 20 Feature Importances', fontweight='bold', fontsize=11)
ax7.grid(True, alpha=0.3, axis='x')

# 8. Prediction confidence distribution
ax8 = plt.subplot(3, 4, 8)
human_probs = y_proba[y_test == 0]
ai_probs = y_proba[y_test == 1]

ax8.hist(human_probs, bins=30, alpha=0.6, label='Human texts', color='blue', edgecolor='black')
ax8.hist(ai_probs, bins=30, alpha=0.6, label='AI texts', color='red', edgecolor='black')
ax8.axvline(0.5, color='black', linestyle='--', linewidth=2, alpha=0.5, label='Decision threshold')
ax8.set_xlabel('Predicted Probability (AI)', fontweight='bold')
ax8.set_ylabel('Count', fontweight='bold')
ax8.set_title('Prediction Confidence Distribution', fontweight='bold', fontsize=11)
ax8.legend()
ax8.grid(True, alpha=0.3, axis='y')

# 9. Parameter sensitivity: n_estimators
ax9 = plt.subplot(3, 4, 9)
n_est_analysis = df_results[df_results['name'].str.contains('baseline|many_trees|very_many|few')]
ax9.scatter([p['n_estimators'] for p in n_est_analysis['params']],
           n_est_analysis['f1_mean'], s=100, alpha=0.7, color='steelblue')
ax9.set_xlabel('n_estimators', fontweight='bold')
ax9.set_ylabel('F1 Score', fontweight='bold')
ax9.set_title('Impact of Tree Count', fontweight='bold', fontsize=11)
ax9.grid(True, alpha=0.3)

# 10. Parameter sensitivity: max_depth
ax10 = plt.subplot(3, 4, 10)
depth_configs = df_results[df_results['name'].str.contains('shallow|deep|baseline')]
depths = [p['max_depth'] for p in depth_configs['params']]
ax10.scatter(depths, depth_configs['f1_mean'], s=100, alpha=0.7, color='darkgreen')
ax10.set_xlabel('max_depth', fontweight='bold')
ax10.set_ylabel('F1 Score', fontweight='bold')
ax10.set_title('Impact of Tree Depth', fontweight='bold', fontsize=11)
ax10.grid(True, alpha=0.3)

# 11. Error rates by class
ax11 = plt.subplot(3, 4, 11)
error_data = {
    'Human→AI\n(False Pos)': human_as_ai / total_human * 100,
    'AI→Human\n(False Neg)': ai_as_human / total_ai * 100
}
bars = ax11.bar(error_data.keys(), error_data.values(),
               color=['lightcoral', 'coral'], alpha=0.7, edgecolor='black', linewidth=2)
ax11.set_ylabel('Error Rate (%)', fontweight='bold')
ax11.set_title('Error Analysis by Class', fontweight='bold', fontsize=11)
ax11.grid(True, alpha=0.3, axis='y')

# Add value labels
for bar in bars:
    height = bar.get_height()
    ax11.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.2f}%', ha='center', va='bottom',
             fontweight='bold', fontsize=11)

# 12. Training time vs performance
ax12 = plt.subplot(3, 4, 12)
times = [r['time_seconds'] for r in results]
f1s = [r['f1_mean'] for r in results]
scatter = ax12.scatter(times, f1s, s=100, alpha=0.6,
                      c=f1s, cmap='RdYlGn', edgecolors='black', linewidth=0.5)
ax12.scatter(winner['time_seconds'], winner['f1_mean'],
            s=300, marker='*', color='gold', edgecolors='black',
            linewidth=2, label='Winner', zorder=5)
ax12.set_xlabel('Training Time (seconds)', fontweight='bold')
ax12.set_ylabel('F1 Score', fontweight='bold')
ax12.set_title('Efficiency: Time vs Performance', fontweight='bold', fontsize=11)
ax12.legend()
ax12.grid(True, alpha=0.3)
plt.colorbar(scatter, ax=ax12, label='F1 Score')

plt.tight_layout()
plt.savefig('xgboost_comprehensive_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Comprehensive visualization saved as 'xgboost_comprehensive_analysis.png'")