In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, learning_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

# Import the modules themselves for version checking
import xgboost

# Import the classifier classes
from xgboost import XGBClassifier

# Suppress every warning category for the remainder of the kernel session.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn grid theme first, then matplotlib defaults
# applied in a single rcParams update.
sns.set_style(style='whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Banner confirming the import cell ran, plus the installed XGBoost version.
print("All imports successful!")
print(f"XGBoost version: {xgboost.__version__}")


In [None]:
# Reduced ("backbone") feature subset; every name here must also appear in
# expected_features below (checked at the end of this cell).
backbone_features = [
    'trigram_diversity',
    'yules_k',
    'comma_ratio',
    'colon_ratio',
    'pos_ratio_NUM',
    'verbs_per_100_tok',
    'sentence_length_std',
    'n_sentences_doc',
    'exclamation_ratio',
    'token_burstiness'
]

# Original expected_features with 9 features removed that don't exist
expected_features = [
    # 'smog_index', 'automated_readability_index',  # REMOVED - not in df
    # 'unigram_diversity',  # REMOVED - not in df
    'trigram_diversity',
    # 'hapax_type_ratio',  # REMOVED - not in df
    'yules_k',
    # 'mtld',  # REMOVED - not in df
    # 'trigram_entropy',  # REMOVED - not in df
    'token_burstiness', 'char_trigram_entropy',
    'avg_tree_depth', 'max_tree_depth', 'avg_dependency_distance',
    # 'right_dependency_ratio',  # REMOVED - not in df
    'uppercase_ratio', 'whitespace_ratio', 'unique_char_count',
    # 'bits_per_char',  # REMOVED - not in df
    'comma_ratio', 'period_ratio', 'question_ratio', 'exclamation_ratio',
    'semicolon_ratio', 'colon_ratio', 'quote_ratio',
    'sentiment_polarity', 'sentiment_subjectivity', 'sentiment_polarity_variance',
    'neutral_sentence_ratio', 'positive_word_ratio', 'negative_word_ratio',
    'pos_ratio_DET', 'pos_ratio_ADP', 'pos_ratio_AUX', 'pos_ratio_CCONJ',
    'pos_ratio_PART', 'pos_ratio_NUM', 'pos_row_entropy_weighted',
    'function_to_content_rate', 'noun_verb_alternation_rate', 'content_function_ratio',
    'noun_verb_ratio', 'adj_adv_ratio', 'verbs_per_100_tok', 'nouns_per_100_tok',
    'adj_per_100_tok', 'adv_per_100_tok', 'pron_per_100_tok', 'punct_per_100_tok',
    'tokens_per_sentence_mean', 'mean_nouns_per_sent', 'mean_verbs_per_sent',
    'mean_adjs_per_sent', 'mean_advs_per_sent', 'prop_sents_with_verb',
    'unique_upos_per_sent_mean', 'max_runlen_NOUN', 'max_runlen_PUNCT',
    'avg_sentence_length', 'sentence_length_std',
    # 'n_tokens_doc',  # REMOVED - not in df
    'n_sentences_doc'
]

# Sizes and reduction are derived from the lists themselves so the printed
# summary always matches the definitions in this cell. The reduction is
# measured against the full set defined here (expected_features).
n_backbone = len(backbone_features)
n_full = len(expected_features)
reduction_pct = 100 * (1 - n_backbone / n_full)

# Verify all backbone features exist in expected_features
missing = set(backbone_features) - set(expected_features)
if missing:
    print(f"WARNING: {missing} not in full feature set")
else:
    print(f"✓ All {n_backbone} backbone features found in full feature set")
    print(f"✓ Full feature set contains {n_full} features")
    print(f"\nFeature sets defined:")
    print(f"  - Backbone: {n_backbone} features ({reduction_pct:.1f}% reduction)")
    print(f"  - Full: {n_full} features")

In [None]:

# Location of the feature CSV. Set the RAID_FEATURES_CSV environment variable
# to load a copy stored elsewhere; when it is unset, the original local path
# below is used.
DATA_PATH = os.environ.get(
    'RAID_FEATURES_CSV',
    r'C:\Users\marco\OneDrive\Desktop\Tesi_Codice\Cognitive_TaskGen\backbone creation\raid_sample_large_PostPOS_CLEAN.csv',
)
# Name of the label column in the CSV.
TARGET_COLUMN = 'is_ai'

df = pd.read_csv(DATA_PATH)

# Fail early with the complete list of absent columns instead of stopping at
# whichever lookup happens to run first further down.
required_columns = list(dict.fromkeys([*expected_features, *backbone_features, TARGET_COLUMN]))
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from {DATA_PATH}: {missing_columns}")

# Verify data
print(f"Dataset shape: {df.shape}")
print(f"Class distribution:\n{df[TARGET_COLUMN].value_counts()}")
print(f"\nFeature columns available: {len(df.columns)}")

# Check for missing values (report only; rows are not dropped or imputed here)
missing_counts = df[expected_features].isnull().sum()
if missing_counts.sum() > 0:
    print(f"\nWARNING: Missing values detected:")
    print(missing_counts[missing_counts > 0])
else:
    print("\n✓ No missing values in feature set")

# Prepare data matrices as plain NumPy arrays
X_backbone = df[backbone_features].values
X_full = df[expected_features].values
y = df[TARGET_COLUMN].values

print(f"\nData prepared:")
print(f"  Backbone features: {X_backbone.shape}")
print(f"  Full features: {X_full.shape}")
print(f"  Labels: {y.shape}")
# NOTE(review): the counts below assume is_ai is encoded as 1 = AI, 0 = human
# (numeric or boolean) — confirm against the dataset.
print(f"  Class distribution: AI={y.sum()}, Human={len(y)-y.sum()}")