In [4]:
#Master Features - Train/Test Split with Commercial Overlap


import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')


print("MASTER FEATURES - TRAIN/TEST SPLIT INITIALIZATION")


from google.colab import drive
drive.mount('/content/drive')

# Base paths
BASE_DRIVE = '/content/drive/MyDrive/Tesi Magistrale'
MASTER_DIR = f'{BASE_DRIVE}/master_features'

# File paths
MASTER_FEATURES_PATH = f'{MASTER_DIR}/master_features_complete_2.csv'
COMMERCIAL_TEST_PATH = '/content/Testing-Commercial.csv'  # Adjust if needed

# Output directory for splits
SPLITS_DIR = f'{BASE_DRIVE}/train_test_splits'
os.makedirs(SPLITS_DIR, exist_ok=True)

print(f"Master features: {MASTER_FEATURES_PATH}")
print(f"Commercial test: {COMMERCIAL_TEST_PATH}")
print(f"Output directory: {SPLITS_DIR}")

MASTER FEATURES - TRAIN/TEST SPLIT INITIALIZATION
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Master features: /content/drive/MyDrive/Tesi Magistrale/master_features/master_features_complete_2.csv
Commercial test: /content/Testing-Commercial.csv
Output directory: /content/drive/MyDrive/Tesi Magistrale/train_test_splits


In [5]:
# Feature categories dictionary
FEATURE_CATEGORIES = {
    'BACKBONE': [
        'alpha_ratio', 'punct_ratio', 'avg_word_length', 'std_word_length',
        'entropy_bits', 'entropy_norm', 'type_token_ratio', 'stopword_ratio',
        'avg_sentence_length', 'sentence_length_std', 'n_sentences_doc',
        'flesch_reading_ease', 'trigram_diversity', 'token_burstiness',
        'char_trigram_entropy', 'uppercase_ratio', 'unique_char_count',
        'compression_ratio', 'avg_tree_depth', 'max_tree_depth',
        'avg_dependency_distance', 'left_dependency_ratio', 'hapax_legomena_ratio',
        'yules_k', 'comma_ratio', 'period_ratio', 'question_ratio',
        'exclamation_ratio', 'semicolon_ratio', 'colon_ratio', 'quote_ratio',
        'sentiment_polarity', 'sentiment_subjectivity', 'sentiment_polarity_variance',
        'neutral_sentence_ratio', 'positive_word_ratio', 'negative_word_ratio',
        'pos_ratio_DET', 'pos_ratio_ADP', 'pos_ratio_AUX', 'pos_ratio_CCONJ',
        'pos_ratio_PART', 'pos_ratio_NUM', 'pos_row_entropy_weighted',
        'function_to_content_rate', 'noun_verb_alternation_rate',
        'content_function_ratio', 'noun_verb_ratio', 'adj_adv_ratio',
        'verbs_per_100_tok', 'nouns_per_100_tok', 'adj_per_100_tok',
        'adv_per_100_tok', 'pron_per_100_tok', 'punct_per_100_tok',
        'tokens_per_sentence_mean', 'mean_nouns_per_sent', 'mean_verbs_per_sent',
        'mean_adjs_per_sent', 'mean_advs_per_sent', 'prop_sents_with_verb',
        'unique_upos_per_sent_mean', 'max_runlen_NOUN', 'max_runlen_PUNCT',
    ],
    'COREFERENCE': [
        'pronoun_ratio', 'minimal_chain_ratio', 'avg_chain_length',
        'chain_length_variance', 'long_range_coref_ratio', 'chain_connectivity',
        'repeat_mention_expansion_rate', 'avg_tokens_added_on_repeat',
        'repeat_overspecification_ratio', 'adjective_modification_rate',
        'prepositional_modification_rate', 'relative_clause_rate',
        'modification_type_entropy', 'avg_modifiers_per_mention',
        'second_mention_pronoun_rate', 'full_np_in_repeats_rate',
        'macro_avg_context_entities', 'meso_pragmatic_necessity_rate',
        'macro_overspecification_loose_rate', 'meso_consensus_score_mean',
        'micro_statistical_typicality_mean', 'macro_modification_vs_competitors_ratio',
        'meso_context_density', 'scale_consistency_score',
        'micro_meso_overspec_gradient', 'meso_macro_context_ratio',
        'necessity_scale_variance', 'minimal_chain_count',
        'minimal_first_avg_modifiers', 'minimal_second_pronoun_rate',
        'minimal_avg_total_modifiers', 'singleton_count', 'singleton_ratio',
        'singleton_avg_modifiers', 'singleton_descriptive_rate',
        'singleton_vs_chain_first_ratio', 'singleton_modification_entropy',
        'singleton_avg_tokens',
    ],
    'PERPLEXITY': [
        'doc_perplexity', 'mean_sentence_perplexity', 'sentence_perplexity_variance',
        'token_probability_entropy', 'perplexity_curvature', 'perplexity_burstiness',
        'perplexity_trajectory_slope', 'perturbation_discrepancy',
    ],
    'COHESION': [
        'entity_mention_density', 'entity_reuse_rate', 'entity_graph_density',
        'entity_isolated_sentences', 'entity_largest_component_size',
        'mean_entity_continuation_rate', 'mean_adjacent_cosine_similarity',
        'min_adjacent_cosine_similarity', 'adjacent_similarity_variance',
        'semantic_graph_density', 'similarity_decay_rate',
        'mean_nonadjacent_similarity', 'long_range_similarity',
        'semantic_graph_isolated_sentences', 'semantic_largest_component_size',
        'semantic_average_degree', 'topic_entropy', 'topic_drift_rate',
        'dominant_topic_proportion', 'topic_switching_frequency',
        'topic_persistence', 'num_distinct_topics', 'topic_diversity',
        'topic_concentration', 'topic_transition_similarity', 'topic_return_rate',
    ],
    'TEMPORAL': [
        'temp_num_events', 'temp_events_per_sentence', 'temp_event_lexical_diversity',
        'temp_tense_distribution_entropy', 'temp_num_timex', 'temp_timex_event_ratio',
        'temp_rel_mean_confidence', 'temp_rel_confidence_variance',
        'temp_rel_before_after_ratio', 'temp_rel_cycle_edge_ratio',
        'temp_rel_raw_cycle_count', 'temp_rel_cycle_approx_flag',
        'temp_rel_parallel_edge_rate', 'temp_rel_type_entropy',
        'temp_rel_transitivity_violation_rate', 'tg_edge_retention',
        'tg_degree_entropy', 'tg_avg_in_degree', 'tg_avg_out_degree',
        'tg_ordering_entropy', 'tg_longest_path', 'tg_mean_depth',
        'tg_branching_factor', 'tg_global_coherence',
        'temp_constraint_violation_rate', 'temp_constraint_csp_score',
        'temp_scope_variance', 'temp_tense_time_alignment',
        'temp_deixis_consistency', 'temp_ref_time_shifts',
        'tg_centralization', 'tg_clustering_coefficient', 'tg_density',
    ],
    'METACOGNITION': [
        'transitions_density', 'frame_markers_density', 'endophoric_markers_density',
        'attitude_markers_density', 'engagement_markers_density', 'hedges_density',
        'boosters_density', 'epistemic_density', 'personal_epistemic_density',
        'reformulation_density', 'self_mention_first_use_ratio',
        'self_mention_first_20pct_density', 'certainty_first_third',
        'certainty_last_third', 'certainty_gradient', 'certainty_overall',
        'weasel_density', 'evidential_density',
    ],
    'CALIBRATION': [
        'hedge_perplexity_correlation', 'booster_perplexity_anticorrelation',
        'metacog_spike_perplexity_ratio', 'certainty_perplexity_alignment',
        'reformulation_complexity_match',
    ],
}


print(f"Feature categories defined")


Feature categories defined


In [9]:

print(f"Loading master features from: {MASTER_FEATURES_PATH}")

try:
    df_master = pd.read_csv(MASTER_FEATURES_PATH)
    print(f"Master features loaded successfully")

    # Verify essential columns
    if 'id' not in df_master.columns:
        raise ValueError("Missing 'id' column in master features!")
    if 'is_ai' not in df_master.columns:
        raise ValueError("Missing 'is_ai' column in master features!")


    # Verify features are present
    feature_cols = [col for col in df_master.columns if col not in ['id', 'is_ai']]

except FileNotFoundError:
    print(f"Master features file not found!")
    print(f"Path: {MASTER_FEATURES_PATH}")
    raise
except Exception as e:
    print(f"ERROR loading master features: {e}")
    raise


Loading master features from: /content/drive/MyDrive/Tesi Magistrale/master_features/master_features_complete_2.csv
Master features loaded successfully


In [10]:
#Load commercial

print(f"Loading commercial test set from: {COMMERCIAL_TEST_PATH}")

try:
    df_commercial = pd.read_csv(COMMERCIAL_TEST_PATH)
    print(f"Commercial test set loaded successfully")
    print(f"Columns: {list(df_commercial.columns)}")

    # Check for ID column
    id_col_commercial = None
    for possible_id in ['id', 'uuid', 'ID', 'UUID']:
        if possible_id in df_commercial.columns:
            id_col_commercial = possible_id

    print(f"Using ID column: '{id_col_commercial}'")

    # Check for label column
    label_col_commercial = None
    for possible_label in ['is_ai', 'label', 'Label', 'is_AI']:
        if possible_label in df_commercial.columns:
            label_col_commercial = possible_label
            break

    if label_col_commercial:
        print(f"Label column found: '{label_col_commercial}")
        n_ai_comm = (df_commercial[label_col_commercial] == 1).sum()
        n_human_comm = (df_commercial[label_col_commercial] == 0).sum()
    else:
        print(f"No label column found")

except FileNotFoundError:
    print(f"Commercial test file not found!")
    print(f"Path: {COMMERCIAL_TEST_PATH}")
    raise
except Exception as e:
    print(f"ERROR loading commercial test set: {e}")
    raise

Loading commercial test set from: /content/Testing-Commercial.csv
Commercial test set loaded successfully
Columns: ['id', 'generation', 'label', 'GPTZero', 'Quillboat', 'Sapling', 'WalterAI', 'ZeroGPT']
  • Using ID column: 'id'
Label column found: 'label


In [12]:
#Overlapping ids

# Get ID sets
master_ids = set(df_master['id'].values)
commercial_ids = set(df_commercial[id_col_commercial].values)

# Find overlap
overlapping_ids = master_ids & commercial_ids

# Show overlap percentage
overlap_pct_commercial = (len(overlapping_ids) / len(commercial_ids)) * 100

print(f"Overlap statistics:")
print(f"% of commercial dataset: {overlap_pct_commercial:.2f}%")

# Show sample overlapping IDs
print(f"\n  Sample overlapping IDs:")
for i, id_val in enumerate(list(overlapping_ids)[:5], 1):
    print(f"    {i}. {id_val}")

Overlap statistics:
% of commercial dataset: 100.00%

  Sample overlapping IDs:
    1. 4a84a694-1ad6-498f-8ac4-1938b8b067cd
    2. 9ac40e82-3385-4090-866c-fa04bdffd475
    3. 3216fd24-4854-443d-8b65-582d157e5821
    4. 8dcb2914-d011-48fd-be29-feb8482e5b55
    5. 9a0e73ff-4f4c-4439-bb5a-d00e104633ad


In [14]:
#Create Train/Test Split

# Split based on overlapping IDs
df_test = df_master[df_master['id'].isin(overlapping_ids)].copy()
df_train = df_master[~df_master['id'].isin(overlapping_ids)].copy()

# Verify no overlap
assert len(set(df_train['id']) & set(df_test['id'])) == 0, "Train/test overlap detected!"
print(f"No ID overlap between train and test sets")

No ID overlap between train and test sets


In [21]:
#Feature Matrices

#Get feature columns (exclude id and label)
feature_cols = [col for col in df_master.columns if col not in ['id', 'is_ai']]

print(f"Extracting features:")


# Separate features and labels
X_train = df_train[feature_cols].copy()
y_train = df_train['is_ai'].copy()
ids_train = df_train['id'].copy()

X_test = df_test[feature_cols].copy()
y_test = df_test['is_ai'].copy()
ids_test = df_test['id'].copy()

print(f"Feature matrices created")

# Check for any data quality issues
print(f"Data quality check:")

# NaN values
train_nans = X_train.isna().sum().sum()
test_nans = X_test.isna().sum().sum()
print(f"NaN values in train: {train_nans}")
print(f"NaN values in test: {test_nans}")

if train_nans > 0 or test_nans > 0:
    print(f"    ⚠ WARNING: NaN values detected!")

# Infinite values
train_infs = np.isinf(X_train).sum().sum()
test_infs = np.isinf(X_test).sum().sum()
print(f"Infinite values in train: {train_infs}")
print(f"Infinite values in test: {test_infs}")

if train_infs > 0 or test_infs > 0:
    print(f"Infinite values detected!")

if train_nans == 0 and test_nans == 0 and train_infs == 0 and test_infs == 0:
    print(f"No data quality issues detected")

Extracting features:
Feature matrices created
Data quality check:
NaN values in train: 0
NaN values in test: 0
Infinite values in train: 0
Infinite values in test: 0
No data quality issues detected


In [23]:
#Train model

# Import necessary libraries
import xgboost as xgb
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    average_precision_score
)


# XGBoost hyperparameters
xgb_params = {
    'n_estimators': 500,
    'max_depth': 9,
    'learning_rate': 0.15,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',  # Faster on CPU
}

print(f"XGBoost configuration")
for param, value in xgb_params.items():
    print(f"{param}: {value}")

# Train the model
print(f"Training model")
model = xgb.XGBClassifier(**xgb_params)

# Fit with evaluation set to track performance
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=50  # Print every 50 rounds
)

print(f"Model training complete!")


XGBoost configuration
  • n_estimators: 500
  • max_depth: 9
  • learning_rate: 0.15
  • subsample: 0.8
  • colsample_bytree: 0.8
  • min_child_weight: 3
  • gamma: 0.1
  • reg_alpha: 0.1
  • reg_lambda: 1.0
  • objective: binary:logistic
  • eval_metric: logloss
  • random_state: 42
  • n_jobs: -1
  • tree_method: hist
Training model
[0]	validation_0-logloss:0.60612	validation_1-logloss:0.61401
[50]	validation_0-logloss:0.06382	validation_1-logloss:0.14135
[100]	validation_0-logloss:0.02623	validation_1-logloss:0.10930
[150]	validation_0-logloss:0.01513	validation_1-logloss:0.10244
[200]	validation_0-logloss:0.01052	validation_1-logloss:0.10097
[250]	validation_0-logloss:0.00815	validation_1-logloss:0.10011
[300]	validation_0-logloss:0.00675	validation_1-logloss:0.09308
[350]	validation_0-logloss:0.00619	validation_1-logloss:0.09316
[400]	validation_0-logloss:0.00584	validation_1-logloss:0.09479
[450]	validation_0-logloss:0.00567	validation_1-logloss:0.09037
[499]	validation_0-logloss

In [24]:
#Evaluate Perfomance

# Make predictions
y_train_pred = model.predict(X_train)
y_train_proba = model.predict_proba(X_train)[:, 1]

y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)[:, 1]

print(f"Predictions generated")


# TRAINING SET PERFORMANCE

train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)
train_auc = roc_auc_score(y_train, y_train_proba)

print(f"Metrics:")
print(f"Accuracy:  {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall:    {train_recall:.4f}")
print(f"F1 Score:  {train_f1:.4f}")
print(f"ROC AUC:   {train_auc:.4f}")


# TEST SET PERFORMANCE (COMMERCIAL HOLDOUT)


test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"\nMetrics:")
print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall:    {test_recall:.4f}")
print(f"F1 Score:  {test_f1:.4f}")
print(f"ROC AUC:   {test_auc:.4f}")

# PERFORMANCE COMPARISON


print(f"\n" + "="*80)
print("Train vs Test")
print("="*80)

comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'],
    'Train': [train_accuracy, train_precision, train_recall, train_f1, train_auc],
    'Test': [test_accuracy, test_precision, test_recall, test_f1, test_auc],
})
comparison_df['Difference'] = comparison_df['Train'] - comparison_df['Test']

print(f"\n{comparison_df.to_string(index=False)}")


Predictions generated
Metrics:
Accuracy:  1.0000
Precision: 1.0000
Recall:    1.0000
F1 Score:  1.0000
ROC AUC:   1.0000

Metrics:
Accuracy:  0.9600
Precision: 1.0000
Recall:    0.9167
F1 Score:  0.9565
ROC AUC:   0.9968

Train vs Test

   Metric  Train     Test  Difference
 Accuracy    1.0 0.960000    0.040000
Precision    1.0 1.000000    0.000000
   Recall    1.0 0.916667    0.083333
 F1 Score    1.0 0.956522    0.043478
  ROC AUC    1.0 0.996795    0.003205


In [27]:
#Feature Importance Analysis

print("Feature Importance Analysis")

# Get feature importances
feature_importance = model.feature_importances_

# Get feature names from the columns (excluding id and label)
feature_names = [col for col in df_master.columns if col not in ['id', 'is_ai']]


# Create dataframe
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

# Add category information
def get_feature_category(feature_name):
    for category, features in FEATURE_CATEGORIES.items():
        if feature_name in features:
            return category
    return 'UNKNOWN'

importance_df['category'] = importance_df['feature'].apply(get_feature_category)

print(f"\n→ Top 20 Most Important Features:")
print(f"\n{'Rank':<6}{'Feature':<40}{'Category':<20}{'Importance':<12}")
print("="*80)

for idx, row in importance_df.head(20).iterrows():
    rank = importance_df.index.get_loc(idx) + 1
    print(f"{rank:<6}{row['feature']:<40}{row['category']:<20}{row['importance']:.6f}")

# Importance by category
print(f"Feature Importance by Category:")
category_importance = importance_df.groupby('category')['importance'].agg(['sum', 'mean'])
category_importance = category_importance.sort_values('sum', ascending=False)
category_importance.columns = ['Total', 'Average']

print(f"\n{category_importance.to_string()}")

Feature Importance Analysis

→ Top 20 Most Important Features:

Rank  Feature                                 Category            Importance  
1     type_token_ratio                        BACKBONE            0.042063
2     doc_perplexity                          PERPLEXITY          0.036596
3     tg_ordering_entropy                     TEMPORAL            0.034156
4     mean_sentence_perplexity                PERPLEXITY          0.027468
5     temp_scope_variance                     TEMPORAL            0.021233
6     yules_k                                 BACKBONE            0.019114
7     pos_row_entropy_weighted                BACKBONE            0.018425
8     comma_ratio                             BACKBONE            0.016069
9     pos_ratio_NUM                           BACKBONE            0.014350
10    sentence_length_std                     BACKBONE            0.012889
11    tg_longest_path                         TEMPORAL            0.012879
12    semantic_average_degree   