In [None]:
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn scipy -q


# Mount Google Drive. If mounting or the path check fails, the error is
# printed and re-raised so the notebook stops here.
from google.colab import drive
import os
import shutil

try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully")

    # A mount call that returns is not enough: confirm the expected root exists.
    if os.path.exists('/content/drive/MyDrive'):
        print("Drive path verified")
    else:
        raise Exception("Drive mounted but MyDrive not found")
except Exception as e:
    # Report the failure, then re-raise so the cell fails loudly.
    print(f" ERROR: Could not mount Google Drive: {e}")
    raise

# Check available space (best effort: a failure here is reported, not fatal).
try:
    available_space = shutil.disk_usage('/content/drive/MyDrive').free / 1e9
    print(f"Available space on Drive: {available_space:.1f} GB")
    if available_space < 1.0:
        print("WARNING: Low disk space!")
except OSError as e:
    # Only filesystem errors are expected here; a bare `except:` would also
    # hide KeyboardInterrupt and programming errors.
    print(f"Could not check disk space: {e}")

# Optional: Check GPU (not needed for this analysis, but useful info).
# torch is imported defensively so that a missing optional package cannot
# break the setup cell.
try:
    import torch
except ImportError:
    torch = None

print("GPU Check (optional, not required for this analysis):")
if torch is None:
    print("PyTorch not installed; skipping GPU check")
else:
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("Running on CPU (sufficient for this analysis)")


!pip install -U kaleido

Mounted at /content/drive
Google Drive mounted successfully
Drive path verified
Available space on Drive: 200.9 GB
GPU Check (optional, not required for this analysis):
CUDA available: True
GPU: Tesla T4


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy import stats
from scipy.stats import ttest_ind
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import warnings
warnings.filterwarnings('ignore')
import os
import time

def load_feature_list(csv_path):
    """Return the unique, whitespace-stripped feature names listed in a CSV.

    Args:
        csv_path: Path (or any other source accepted by ``pd.read_csv``) of a
            CSV file that has a ``feature`` column.

    Returns:
        list[str]: Feature names in order of first appearance. Missing cells
        and cells that are empty after stripping are skipped.

    Raises:
        KeyError: If the CSV has no ``feature`` column.
    """
    names = pd.read_csv(csv_path)["feature"].dropna().astype(str).str.strip()

    # A whitespace-only cell is not NaN, so it survives dropna() and strips to
    # "". Drop it here rather than letting it fail later as a column name.
    names = names[names != ""]

    return names.unique().tolist()

In [None]:
# LOAD ALL DATASETS

# Input files: the original sample and the baseline feature list are read from
# /content, the three cohesion feature tables from Google Drive.
ORIGINAL_DF_PATH = '/content/raid_sample_medium_PostPOS_CLEAN (1).csv'
BASE_FEATURES_PATH = '/content/baseline_features.csv'
ENTITY_FEATURES_PATH = '/content/drive/MyDrive/Tesi Magistrale/entity_cohesion/entity_cohesion_features_final.csv'
SEMANTIC_FEATURES_PATH = '/content/drive/MyDrive/Tesi Magistrale/semantic_cohesion/semantic_cohesion_features_final.csv'
TOPIC_FEATURES_PATH = '/content/drive/MyDrive/Tesi Magistrale/topic_coherence/topic_coherence_features_final.csv'

# Results go to OUTPUT_DIR, with plots/ and tables/ sub-folders.
OUTPUT_DIR = '/content/drive/MyDrive/Tesi Magistrale/cohesion_analysis'
os.makedirs(OUTPUT_DIR, exist_ok=True)
for output_subdir in ('plots', 'tables'):
    os.makedirs(os.path.join(OUTPUT_DIR, output_subdir), exist_ok=True)

# Read everything from disk.
backbone_features = load_feature_list(BASE_FEATURES_PATH)
df_original = pd.read_csv(ORIGINAL_DF_PATH)
df_entity = pd.read_csv(ENTITY_FEATURES_PATH)
df_semantic = pd.read_csv(SEMANTIC_FEATURES_PATH)
df_topic = pd.read_csv(TOPIC_FEATURES_PATH)

# MERGE
# Start from id + label + backbone columns, then inner-join each cohesion table
# on 'id' (only ids present in every table survive).
backbone_columns = ['id', 'is_ai'] + backbone_features
df_backbone = df_original[backbone_columns].copy()

df_merged = df_backbone
for cohesion_table in (df_entity, df_semantic, df_topic):
    df_merged = df_merged.merge(cohesion_table, on='id', how='inner')


def _non_id_columns(frame):
    """Return every column name of ``frame`` except the 'id' join key."""
    return [col for col in frame.columns if col != 'id']


# Feature groups: each cohesion table contributes all of its non-id columns.
entity_features = _non_id_columns(df_entity)
semantic_features = _non_id_columns(df_semantic)
topic_features = _non_id_columns(df_topic)

all_cohesion_features = entity_features + semantic_features + topic_features
all_features = backbone_features + all_cohesion_features

print("Feature breakdown:")
for group_label, group_members in (
    ("Entity cohesion", entity_features),
    ("Semantic cohesion", semantic_features),
    ("Topic coherence", topic_features),
):
    print(f"{group_label}: {len(group_members)}")

Feature breakdown:
Entity cohesion: 6
Semantic cohesion: 10
Topic coherence: 10


In [None]:

# Data-quality report on the merged frame: missing values, infinities, and
# range violations for the features declared in bounded_features.
feature_frame = df_merged[all_features]

# --- Missing values: per-feature NaN counts, largest first ---
nan_counts = feature_frame.isna().sum()
nan_features = nan_counts[nan_counts > 0].sort_values(ascending=False)

if len(nan_features) == 0:
    print("No missing values")
else:
    print(f"  Found {len(nan_features)} features with missing values:")
    total_rows = len(df_merged)
    for feat, count in nan_features.items():
        print(f"  {feat}: {count} ({100 * count / total_rows:.2f}%)")

# --- Infinite values ---
print("Infinite Value Check:")
inf_counts = np.isinf(feature_frame).sum()
inf_features = inf_counts[inf_counts > 0]

if len(inf_features) == 0:
    print("No infinite values found")
else:
    print(f"  Found {len(inf_features)} features with infinite values:")
    for feat, count in inf_features.items():
        print(f"  {feat}: {count}")

# --- Range validation: (lower, upper) bounds expected for each listed feature ---
print("Value Range Validation:")
unit_interval = (0, 1)
bounded_features = {
    'entity_reuse_rate': unit_interval,
    'entity_graph_density': unit_interval,
    'entity_isolated_sentences': unit_interval,
    'mean_entity_continuation_rate': unit_interval,
    'entity_largest_component_size': unit_interval,
    'topic_drift_rate': unit_interval,
    'dominant_topic_proportion': unit_interval,
    'topic_diversity': unit_interval,
    'topic_concentration': unit_interval,
    'topic_return_rate': unit_interval
}

range_issues = []
for feat, (min_val, max_val) in bounded_features.items():
    # Features that are not in the merged frame are silently skipped.
    if feat not in df_merged.columns:
        continue

    column = df_merged[feat]
    actual_min, actual_max = column.min(), column.max()

    out_of_range = actual_min < min_val or actual_max > max_val
    if not out_of_range:
        continue

    range_issues.append(feat)
    print(f"{feat}: [{actual_min:.4f}, {actual_max:.4f}] (expected [{min_val}, {max_val}])")

if len(range_issues) == 0:
    print("  All bounded features within expected ranges!")


  Found 23 features with missing values:
  topic_transition_similarity: 1317 (10.97%)
  long_range_similarity: 1121 (9.34%)
  topic_entropy: 952 (7.93%)
  topic_diversity: 952 (7.93%)
  dominant_topic_proportion: 952 (7.93%)
  topic_concentration: 952 (7.93%)
  mean_nonadjacent_similarity: 359 (2.99%)
  entity_graph_density: 221 (1.84%)
  mean_entity_continuation_rate: 221 (1.84%)
  entity_largest_component_size: 221 (1.84%)
  entity_isolated_sentences: 221 (1.84%)
  entity_mention_density: 221 (1.84%)
  entity_reuse_rate: 221 (1.84%)
  mean_adjacent_cosine_similarity: 168 (1.40%)
  min_adjacent_cosine_similarity: 168 (1.40%)
  semantic_largest_component_size: 168 (1.40%)
  semantic_graph_isolated_sentences: 168 (1.40%)
  semantic_graph_density: 168 (1.40%)
  similarity_decay_rate: 168 (1.40%)
  adjacent_similarity_variance: 168 (1.40%)
  semantic_average_degree: 168 (1.40%)
  topic_drift_rate: 168 (1.40%)
  topic_persistence: 168 (1.40%)
Infinite Value Check:
No infinite values found


In [None]:
print("NAN IMPUTATION")

# Work on a copy so df_merged keeps the raw (un-imputed) values.
df_imputed = df_merged.copy()


def _impute_constant(frame, features, fill_value):
    """Fill the NaNs of each listed feature with ``fill_value``.

    ``frame`` is modified by column assignment (no chained ``inplace`` call,
    which is deprecated and has no effect under pandas Copy-on-Write).
    Features that are absent from ``frame`` or have no NaNs are skipped.

    Returns:
        list[tuple[str, int]]: ``(feature, number_of_values_filled)`` for every
        feature that actually had NaNs.
    """
    filled = []
    for feat in features:
        if feat not in frame.columns:
            continue
        n_missing = int(frame[feat].isna().sum())
        if n_missing == 0:
            continue
        frame[feat] = frame[feat].fillna(fill_value)
        filled.append((feat, n_missing))
    return filled


# Group 1: Density/rate features - impute with 0 (no entities/topics detected)
density_rate_features = [
    'entity_mention_density',
    'entity_reuse_rate',
    'entity_graph_density',
    'mean_entity_continuation_rate',
    'topic_drift_rate',
    'topic_transition_similarity'
]

zero_imputed = _impute_constant(df_imputed, density_rate_features, 0)

if zero_imputed:
    print("  1. Zero imputation (no cohesion detected):")
    for feat, n in zero_imputed:
        print(f"   {feat}: {n} values")

# Group 2: Proportion features - impute with 1.0
proportion_features = [
    'entity_isolated_sentences',
    'dominant_topic_proportion'
]

# The per-feature count now comes from the helper. Previously a stale `n`
# leaked from the reporting loop above was recorded for every feature.
one_imputed = _impute_constant(df_imputed, proportion_features, 1.0)

if one_imputed:
    print("One imputation (complete isolation/dominance)")
    for feat, n in one_imputed:
        print(f"   {feat}: {n} values")

# Group 3: Count features - impute with 0
count_features = [
    'topic_switching_frequency',
    'num_distinct_topics'
]

count_imputed = _impute_constant(df_imputed, count_features, 0)

if count_imputed:
    print("Zero imputation:")
    for feat, n in count_imputed:
        print(f"   {feat}: {n} values")

# Group 4: Remaining features - impute with the column median.
# NOTE(review): the median is taken over the whole dataset, before any
# train/test split, so later cross-validation sees statistics from held-out
# rows. Confirm this is acceptable for the reported scores.
median_imputed = []
for feat in all_features:
    if feat in df_imputed.columns and df_imputed[feat].isna().any():
        n_imputed = int(df_imputed[feat].isna().sum())
        median_val = df_imputed[feat].median()
        df_imputed[feat] = df_imputed[feat].fillna(median_val)
        median_imputed.append((feat, n_imputed, median_val))

if median_imputed:
    print("  4. Median imputation (remaining features):")
    for feat, n, med in median_imputed:
        print(f"   {feat}: {n} values (median={med:.4f})")

# Verify no NaNs remain
remaining_nans = df_imputed[all_features].isna().sum().sum()
print(f"\nImputation complete: {remaining_nans} NaN values remaining")

NAN IMPUTATION
  1. Zero imputation (no cohesion detected):
   entity_mention_density: 221 values
   entity_reuse_rate: 221 values
   entity_graph_density: 221 values
   mean_entity_continuation_rate: 221 values
   topic_drift_rate: 168 values
   topic_transition_similarity: 1317 values
One imputation (complete isolation/dominance)
   entity_isolated_sentences: 1317 values
   dominant_topic_proportion: 1317 values
  4. Median imputation (remaining features):
   entity_largest_component_size: 1317 values (median=0.7500)
   mean_adjacent_cosine_similarity: 1317 values (median=0.3644)
   min_adjacent_cosine_similarity: 1317 values (median=0.1404)
   adjacent_similarity_variance: 1317 values (median=0.0179)
   semantic_graph_density: 1317 values (median=0.0476)
   similarity_decay_rate: 1317 values (median=-0.0022)
   mean_nonadjacent_similarity: 1317 values (median=0.3369)
   long_range_similarity: 1317 values (median=0.3098)
   semantic_graph_isolated_sentences: 1317 values (median=0.609

In [None]:
#REDUNDANCY ELIMINATION
# Drop one feature from every pair whose absolute Pearson correlation exceeds
# `threshold`, keeping the member that is on average less correlated with the
# rest of the feature set.

correlation_matrix = df_imputed[all_features].corr().abs()

# Keep only the strict upper triangle so each unordered pair appears once.
upper_triangle = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)

# Find highly correlated pairs (> 0.95)
threshold = 0.95
high_corr_pairs = []

for column in upper_triangle.columns:
    high_corr = upper_triangle[column][upper_triangle[column] > threshold]
    for idx, corr_val in high_corr.items():
        high_corr_pairs.append((column, idx, corr_val))

print(f"Found {len(high_corr_pairs)} highly correlated pairs (r > {threshold}):")

# Defined up front so it exists on both branches below.
redundant_features = set()

if high_corr_pairs:
    # Mean absolute correlation of each feature with all features (the matrix
    # already holds absolute values); computed once instead of per pair.
    mean_abs_corr = correlation_matrix.mean()

    # Strongest pairs first.
    for feat1, feat2, corr_val in sorted(high_corr_pairs, key=lambda x: x[2], reverse=True):
        print(f"{feat1} <-> {feat2}: r={corr_val:.4f}")

        # If one member was already dropped for an earlier pair, the redundancy
        # is resolved. Dropping again could remove the surviving partner too.
        already_removed = redundant_features & {feat1, feat2}
        if already_removed:
            print(f"    → Skipping ({', '.join(sorted(already_removed))} already removed)")
            continue

        # Keep the feature with lower mean absolute correlation to all others
        avg_corr_1 = mean_abs_corr[feat1]
        avg_corr_2 = mean_abs_corr[feat2]

        if avg_corr_1 > avg_corr_2:
            redundant_features.add(feat1)
            print(f"    → Removing {feat1} (avg_corr={avg_corr_1:.3f} > {avg_corr_2:.3f})")
        else:
            redundant_features.add(feat2)
            print(f"    → Removing {feat2} (avg_corr={avg_corr_2:.3f} > {avg_corr_1:.3f})")

    #Remove redundant features
    features_to_keep = [f for f in all_features if f not in redundant_features]

    print(f"\nRemoved {len(redundant_features)} redundant features")
    print(f"  Remaining: {len(features_to_keep)} features")

else:
    print("  No highly correlated feature pairs found!")
    features_to_keep = all_features.copy()

# Update feature lists after redundancy removal
backbone_features_kept = [f for f in backbone_features if f in features_to_keep]
entity_features_kept = [f for f in entity_features if f in features_to_keep]
semantic_features_kept = [f for f in semantic_features if f in features_to_keep]
topic_features_kept = [f for f in topic_features if f in features_to_keep]

Found 3 highly correlated pairs (r > 0.95):
topic_diversity <-> topic_entropy: r=0.9784
    → Removing topic_entropy (avg_corr=0.251 > 0.250)
dominant_topic_proportion <-> topic_entropy: r=0.9634
    → Removing topic_entropy (avg_corr=0.251 > 0.231)
topic_diversity <-> dominant_topic_proportion: r=0.9570
    → Removing topic_diversity (avg_corr=0.250 > 0.231)

Removed 2 redundant features
  Remaining: 33 features


In [None]:
# EXPLORATORY DATA ANALYSIS

# Split the imputed data by label (is_ai == 1 -> AI, is_ai == 0 -> human).
labels = df_imputed['is_ai']
df_ai = df_imputed[labels == 1]
df_human = df_imputed[labels == 0]

print("Computing descriptive statistics...")


def _describe_feature(feature):
    """Per-class mean/std/median of one feature plus the AI-minus-human gap."""
    ai_values = df_ai[feature]
    human_values = df_human[feature]

    ai_mean = ai_values.mean()
    human_mean = human_values.mean()
    mean_gap = ai_mean - human_mean

    # The relative gap is undefined when the human mean is exactly zero.
    if human_mean != 0:
        relative_gap = 100 * mean_gap / human_mean
    else:
        relative_gap = np.nan

    return {
        'feature': feature,
        'ai_mean': ai_mean,
        'ai_std': ai_values.std(),
        'ai_median': ai_values.median(),
        'human_mean': human_mean,
        'human_std': human_values.std(),
        'human_median': human_values.median(),
        'diff_mean': mean_gap,
        'diff_pct': relative_gap
    }


descriptive_stats = [_describe_feature(name) for name in features_to_keep]
df_stats = pd.DataFrame(descriptive_stats)

# Persist the full table.
stats_path = os.path.join(OUTPUT_DIR, 'tables', 'descriptive_statistics.csv')
df_stats.to_csv(stats_path, index=False)
print(f"Descriptive statistics saved: {stats_path}")

# Show the ten features with the largest absolute difference in means.
print("Top 10 features by absolute mean difference:")
df_stats_sorted = df_stats.sort_values('diff_mean', key=abs, ascending=False)
preview_columns = ['feature', 'ai_mean', 'human_mean', 'diff_mean', 'diff_pct']
print(df_stats_sorted[preview_columns].head(10).to_string(index=False))

Computing descriptive statistics...
Descriptive statistics saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/tables/descriptive_statistics.csv
Top 10 features by absolute mean difference:
                          feature    ai_mean  human_mean  diff_mean   diff_pct
                          yules_k 155.689689  126.320699  29.368990  23.249547
           entity_mention_density 474.369990  463.723917  10.646073   2.295778
        topic_switching_frequency   5.032500    6.019333  -0.986833 -16.394396
          semantic_average_degree   2.155866    1.522370   0.633496  41.612486
              sentence_length_std   9.652472   10.275792  -0.623320  -6.065907
                   avg_tree_depth   5.875135    5.568922   0.306213   5.498610
                topic_persistence   2.842883    2.563553   0.279330  10.896213
                verbs_per_100_tok  14.334342   14.608463  -0.274121  -1.876456
             char_trigram_entropy   8.791704    8.967907  -0.176203  -1.964818
semantic

In [None]:
# Distribution comparisons: one figure per feature group, one panel per
# feature, overlaying the human and AI histograms.

print("Generating distribution comparison plots...")

feature_groups = {
    'Entity Cohesion': entity_features_kept,
    'Semantic Cohesion': semantic_features_kept,
    'Topic Coherence': topic_features_kept
}

n_cols = 3  # panels per row

for group_name, group_features in feature_groups.items():
    n_features = len(group_features)
    if n_features == 0:
        continue

    n_rows = int(np.ceil(n_features / n_cols))

    # squeeze=False always yields a 2-D grid, so a single ravel() gives a flat
    # sequence of axes whatever the number of rows.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4*n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, group_features):
        ai_vals = df_ai[feature].dropna()
        human_vals = df_human[feature].dropna()

        # Human first, then AI, both normalised to densities.
        for values, label, color in ((human_vals, 'Human', 'blue'), (ai_vals, 'AI', 'red')):
            ax.hist(values, bins=30, alpha=0.5, label=label, density=True, color=color)

        panel_title = f'{feature}\n(AI: {ai_vals.mean():.3f}, Human: {human_vals.mean():.3f})'
        ax.set_xlabel(feature, fontsize=10)
        ax.set_ylabel('Density', fontsize=10)
        ax.set_title(panel_title, fontsize=10)
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    # Blank out the panels left over in the last row.
    for spare_ax in axes[n_features:]:
        spare_ax.axis('off')

    plt.tight_layout()
    file_stub = group_name.lower().replace(" ", "_")
    plot_path = os.path.join(OUTPUT_DIR, 'plots', f'distributions_{file_stub}.png')
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"  Saved: {plot_path}")

Generating distribution comparison plots...
  Saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/plots/distributions_entity_cohesion.png
  Saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/plots/distributions_semantic_cohesion.png
  Saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/plots/distributions_topic_coherence.png


In [None]:

#STATISTICAL TESTING

def cohens_d(group1, group2):
    """Cohen's d effect size for two independent samples (pooled std).

    Sample variances (``ddof=1``) are always used, so the result is the same
    whether the inputs are pandas Series, NumPy arrays or plain sequences.
    NaN entries are ignored.

    Args:
        group1: Numeric values of the first sample.
        group2: Numeric values of the second sample.

    Returns:
        ``(mean(group1) - mean(group2)) / pooled_std``. Returns 0 when either
        sample has fewer than two non-NaN values or when the pooled standard
        deviation is zero.
    """
    sample1 = np.asarray(group1, dtype=float).ravel()
    sample2 = np.asarray(group2, dtype=float).ravel()
    sample1 = sample1[~np.isnan(sample1)]
    sample2 = sample2[~np.isnan(sample2)]

    n1, n2 = sample1.size, sample2.size
    # A sample variance needs at least two observations per group.
    if n1 < 2 or n2 < 2:
        return 0.0

    # ddof=1 matches the (n - 1) weights in the pooled formula. ndarray.var()
    # defaults to ddof=0, which would understate the pooled variance.
    var1, var2 = sample1.var(ddof=1), sample2.var(ddof=1)
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    return (sample1.mean() - sample2.mean()) / pooled_std if pooled_std > 0 else 0.0

# Per-feature comparison of AI vs human values: Welch's t-test plus Cohen's d,
# with a Bonferroni correction across all tested features.


def _effect_size_label(d):
    """Map |d| to a label using the cut-offs 0.2 / 0.5 / 0.8."""
    magnitude = abs(d)
    if magnitude < 0.2:
        return 'negligible'
    if magnitude < 0.5:
        return 'small'
    if magnitude < 0.8:
        return 'medium'
    return 'large'


statistical_results = []

for feature in features_to_keep:
    ai_vals = df_ai[feature].dropna()
    human_vals = df_human[feature].dropna()

    # Welch's t-test (equal_var=False: no equal-variance assumption)
    t_stat, p_value = ttest_ind(ai_vals, human_vals, equal_var=False)

    # Cohen's d, signed as AI minus human
    d = cohens_d(ai_vals, human_vals)
    effect_size = _effect_size_label(d)

    statistical_results.append({
        'feature': feature,
        't_statistic': t_stat,
        'p_value': p_value,
        'cohens_d': d,
        'effect_size': effect_size,
        'significant': p_value < 0.05,
        'ai_mean': ai_vals.mean(),
        'human_mean': human_vals.mean()
    })

df_stats_tests = pd.DataFrame(statistical_results)

# Apply Bonferroni correction: one test per feature, so divide alpha by the
# number of features.
bonferroni_threshold = 0.05 / len(features_to_keep)
df_stats_tests['bonferroni_significant'] = df_stats_tests['p_value'] < bonferroni_threshold

# Save statistical results
stats_test_path = os.path.join(OUTPUT_DIR, 'tables', 'statistical_tests.csv')
df_stats_tests.to_csv(stats_test_path, index=False)
print(f"Statistical test results saved: {stats_test_path}")

# Summary statistics
n_significant = (df_stats_tests['p_value'] < 0.05).sum()
n_bonferroni = df_stats_tests['bonferroni_significant'].sum()

print(f"Significant (p < 0.05): {n_significant}/{len(features_to_keep)} ({100*n_significant/len(features_to_keep):.1f}%)")
# The corrected count was computed but never shown; report it next to the
# uncorrected one so the multiple-testing adjustment is visible.
print(f"Bonferroni-significant (p < {bonferroni_threshold:.2e}): {n_bonferroni}/{len(features_to_keep)} ({100*n_bonferroni/len(features_to_keep):.1f}%)")

# Effect size distribution
print("Effect size distribution:")
for effect in ['negligible', 'small', 'medium', 'large']:
    count = (df_stats_tests['effect_size'] == effect).sum()
    print(f"{effect.capitalize()}: {count} ({100*count/len(df_stats_tests):.1f}%)")

# Top 15 features by effect size
print("Top 15 features by |Cohen's d|:")
df_top_effects = df_stats_tests.sort_values('cohens_d', key=abs, ascending=False).head(15)
print(df_top_effects[['feature', 'cohens_d', 'p_value', 'effect_size']].to_string(index=False))

Statistical test results saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/tables/statistical_tests.csv
Significant (p < 0.05): 28/33 (84.8%)
Effect size distribution:
Negligible: 25 (75.8%)
Small: 8 (24.2%)
Medium: 0 (0.0%)
Large: 0 (0.0%)
Top 15 features by |Cohen's d|:
                          feature  cohens_d      p_value effect_size
                trigram_diversity -0.382753 1.175277e-94       small
  semantic_largest_component_size  0.326511 1.307571e-70       small
semantic_graph_isolated_sentences -0.326179 1.809167e-70       small
             char_trigram_entropy -0.297851 3.986830e-59       small
      mean_nonadjacent_similarity  0.281777 3.273303e-53       small
                          yules_k  0.268598 2.326385e-48       small
            long_range_similarity  0.258202 4.896273e-45       small
           semantic_graph_density  0.246637 2.828498e-41       small
                   avg_tree_depth  0.175431 8.856409e-22  negligible
   min_adjacent_cosine_

In [None]:

# VISUALIZATION 2: Effect Size Plot
# Horizontal bars of Cohen's d for every tested feature, sorted ascending;
# red bars are Bonferroni-significant, gray bars are not.

print("Generating effect size visualization...")

df_plot = df_stats_tests.sort_values('cohens_d', ascending=True)
colors = df_plot['bonferroni_significant'].map({True: 'red', False: 'gray'}).tolist()

fig_height = max(8, len(features_to_keep) * 0.3)
fig, ax = plt.subplots(figsize=(10, fig_height))

y_pos = np.arange(len(df_plot))
ax.barh(y_pos, df_plot['cohens_d'], color=colors, alpha=0.7)

ax.set_yticks(y_pos)
ax.set_yticklabels(df_plot['feature'], fontsize=8)
ax.set_xlabel("Cohen's d (AI - Human)", fontsize=12, fontweight='bold')
ax.set_title("Effect Sizes: All Cohesion Features",
             fontsize=14, fontweight='bold')

# Vertical guides: zero line, then dashed markers at |d| = 0.2 and |d| = 0.5.
dashed_blue = dict(color='blue', linestyle='--', linewidth=0.5, alpha=0.5)
dashed_orange = dict(color='orange', linestyle='--', linewidth=0.5, alpha=0.5)
guide_lines = [
    (0, dict(color='black', linestyle='-', linewidth=0.8)),
    (-0.2, dashed_blue),
    (0.2, dashed_blue),
    (-0.5, dashed_orange),
    (0.5, dashed_orange),
]
for x_value, line_style in guide_lines:
    ax.axvline(x=x_value, **line_style)

ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
effect_plot_path = os.path.join(OUTPUT_DIR, 'plots', 'cohens_d_all_features.png')
plt.savefig(effect_plot_path, dpi=300, bbox_inches='tight')
plt.close()

print(f"  Saved: {effect_plot_path}")

Generating effect size visualization...
  Saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/plots/cohens_d_all_features.png


In [None]:

# XGBOOST FEATURE IMPORTANCE
# Cross-validate an XGBoost classifier on all kept features, refit it on the
# full data, and export its per-feature importances.

# Design matrix and labels
X = df_imputed[features_to_keep].values
y = df_imputed['is_ai'].values

ai_share = (y == 1).mean()
human_share = (y == 0).mean()
print(f"Class balance: {ai_share} AI / {human_share} Human")

# Fixed hyperparameters for the classifier
xgb_params = dict(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.15,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)
xgb_model = XGBClassifier(**xgb_params)

# 5-fold cross-validated F1
cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='f1')
print(f"F1 scores: {cv_scores}")
print(f"Mean F1: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# Refit on every row; the importances below come from this final model.
print("Training final model on full dataset...")
xgb_model.fit(X, y)
print("  Training complete")

# One row per feature, most important first
importance_table = pd.DataFrame({
    'feature': features_to_keep,
    'importance': xgb_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Express each importance as a percentage of the total
importance_total = feature_importance['importance'].sum()
feature_importance['importance_pct'] = 100 * feature_importance['importance'] / importance_total

# Persist the table
importance_path = os.path.join(OUTPUT_DIR, 'tables', 'xgboost_feature_importance.csv')
feature_importance.to_csv(importance_path, index=False)
print(f"Feature importance saved: {importance_path}")

print("Top 20 features by XGBoost importance:")
print(feature_importance.head(20).to_string(index=False))


# VISUALIZATION 3: XGBoost Feature Importance
# Horizontal bar chart of the (at most) 30 most important features.

top_n = min(30, len(feature_importance))
# feature_importance is already sorted descending; re-sort the slice ascending
# so the largest bar ends up at the top of the horizontal chart.
df_plot_importance = feature_importance.iloc[:top_n].sort_values('importance', ascending=True)

fig_height = max(8, top_n * 0.3)
fig, ax = plt.subplots(figsize=(10, fig_height))

y_pos = np.arange(len(df_plot_importance))
ax.barh(y_pos, df_plot_importance['importance'], color='steelblue', alpha=0.8)

ax.set_yticks(y_pos)
ax.set_yticklabels(df_plot_importance['feature'], fontsize=9)
ax.set_xlabel('XGBoost Feature Importance', fontsize=12, fontweight='bold')

# The title carries the mean cross-validated F1 from the model cell.
importance_title = f'Top {top_n} Features by XGBoost Importance\n(F1={cv_scores.mean():.4f})'
ax.set_title(importance_title, fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
importance_plot_path = os.path.join(OUTPUT_DIR, 'plots', 'xgboost_importance_top30.png')
plt.savefig(importance_plot_path, dpi=300, bbox_inches='tight')
plt.close()

print(f"  Saved: {importance_plot_path}")


Class balance: 0.5 AI / 0.5 Human
F1 scores: [0.84487888 0.85498743 0.85279188 0.85579399 0.86243164]
Mean F1: 0.8542 (±0.0057)
Training final model on full dataset...
  Training complete
Feature importance saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/tables/xgboost_feature_importance.csv
Top 20 features by XGBoost importance:
                          feature  importance  importance_pct
                          yules_k    0.075597        7.559721
          semantic_average_degree    0.063307        6.330723
        dominant_topic_proportion    0.059080        5.908017
              num_distinct_topics    0.053578        5.357793
              sentence_length_std    0.047956        4.795584
                trigram_diversity    0.047447        4.744661
                 type_token_ratio    0.042047        4.204652
             char_trigram_entropy    0.040805        4.080469
semantic_graph_isolated_sentences    0.038773        3.877320
           semantic_graph_densit

In [None]:

# VISUALIZATION 4: Feature Group Contributions
# Sum the XGBoost importances inside each cohesion group, save the table, and
# plot total and per-feature contributions side by side.

overall_importance = feature_importance['importance'].sum()

group_contributions = []
for group_name, group_features in feature_groups.items():
    group_features_kept = [f for f in group_features if f in features_to_keep]
    if not group_features_kept:
        continue

    in_group = feature_importance['feature'].isin(group_features_kept)
    group_importance = feature_importance[in_group]['importance'].sum()

    group_contributions.append({
        'group': group_name,
        'total_importance': group_importance,
        'importance_pct': 100 * group_importance / overall_importance,
        'importance_per_feature': group_importance / len(group_features_kept)
    })

df_group_contrib = pd.DataFrame(group_contributions).sort_values('total_importance', ascending=False)

print("Feature group contributions:")
print(df_group_contrib.to_string(index=False))

# Persist the table
group_contrib_path = os.path.join(OUTPUT_DIR, 'tables', 'feature_group_contributions.csv')
df_group_contrib.to_csv(group_contrib_path, index=False)

# Two panels: share of total importance (left), mean importance per feature (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

panel_specs = [
    (ax1, 'importance_pct', 'teal',
     'Total Importance (%)', 'Feature Group Contributions\n(Total Importance)'),
    (ax2, 'importance_per_feature', 'coral',
     'Average Importance per Feature', 'Feature Group Contributions\n(Per-Feature Average)'),
]
for panel, value_column, bar_color, x_label, panel_title in panel_specs:
    panel.barh(df_group_contrib['group'], df_group_contrib[value_column], color=bar_color, alpha=0.7)
    panel.set_xlabel(x_label, fontsize=11, fontweight='bold')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
group_plot_path = os.path.join(OUTPUT_DIR, 'plots', 'feature_group_contributions.png')
plt.savefig(group_plot_path, dpi=300, bbox_inches='tight')
plt.close()

print(f"  Saved: {group_plot_path}")

Feature group contributions:
            group  total_importance  importance_pct  importance_per_feature
Semantic Cohesion          0.282049       28.204950                0.028205
  Topic Coherence          0.233105       23.310482                0.029138
  Entity Cohesion          0.116043       11.604295                0.019340
  Saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/plots/feature_group_contributions.png


In [None]:
# VISUALIZATION 5: Correlation Heatmap
# Pairwise correlations among the 20 features ranked highest by XGBoost.

top_features = feature_importance['feature'].head(20).tolist()
corr_matrix = df_imputed[top_features].corr()

fig, ax = plt.subplots(figsize=(14, 12))

# Diverging colour map centred on 0 and pinned to the full [-1, 1] range,
# with the coefficient written in every cell.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    vmin=-1,
    vmax=1,
)
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)

ax.set_title('Correlation Matrix: Top 20 Features by XGBoost Importance',
             fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
heatmap_path = os.path.join(OUTPUT_DIR, 'plots', 'correlation_heatmap_top20.png')
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
plt.close()

print(f"  Saved: {heatmap_path}")

  Saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/plots/correlation_heatmap_top20.png


In [None]:

# Backbone ablation: score the backbone features alone, then the backbone plus
# one cohesion feature at a time, and rank features by the change in mean F1.

ablation_params = dict(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.15,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)

# Baseline: backbone features only
X_backbone = df_imputed[backbone_features_kept].values
y_target = df_imputed['is_ai'].values
xgb_backbone = XGBClassifier(**ablation_params)

backbone_scores = cross_val_score(xgb_backbone, X_backbone, y_target, cv=5, scoring='f1')
backbone_f1 = backbone_scores.mean()

print(f"Backbone-only F1: {backbone_f1:.4f} (±{backbone_scores.std():.4f})")

# Feature -> group lookup; setdefault keeps the first group a name appears in
# (entity, then semantic, then topic).
group_of_feature = {}
for group_label, members in (
    ('Entity Cohesion', entity_features_kept),
    ('Semantic Cohesion', semantic_features_kept),
    ('Topic Coherence', topic_features_kept),
):
    for member in members:
        group_of_feature.setdefault(member, group_label)

# Test each cohesion feature individually
all_cohesion = entity_features_kept + semantic_features_kept + topic_features_kept
individual_results = []

print(f"Testing {len(all_cohesion)} cohesion features individually...")

for cohesion_feat in tqdm(all_cohesion, desc="Testing features"):
    # Backbone plus exactly one extra column, scored with a fresh model.
    candidate_columns = backbone_features_kept + [cohesion_feat]
    X_candidate = df_imputed[candidate_columns].values

    candidate_model = XGBClassifier(**ablation_params)
    candidate_scores = cross_val_score(candidate_model, X_candidate, y_target, cv=5, scoring='f1')
    candidate_f1 = candidate_scores.mean()

    # Absolute and relative change against the backbone-only score
    delta_f1 = candidate_f1 - backbone_f1

    individual_results.append({
        'feature': cohesion_feat,
        'group': group_of_feature.get(cohesion_feat, 'Unknown'),
        'backbone_f1': backbone_f1,
        'with_feature_f1': candidate_f1,
        'improvement': delta_f1,
        'improvement_pct': 100 * delta_f1 / backbone_f1,
        'std': candidate_scores.std()
    })

df_individual = pd.DataFrame(individual_results).sort_values('improvement', ascending=False)

# Save results
individual_path = os.path.join(OUTPUT_DIR, 'tables', 'individual_feature_contributions.csv')
df_individual.to_csv(individual_path, index=False)
print(f"\nIndividual feature results saved: {individual_path}")

# Split by sign of the change (zero counts as negative/neutral)
positive_improvements = df_individual[df_individual['improvement'] > 0]
negative_improvements = df_individual[df_individual['improvement'] <= 0]

n_tested = len(df_individual)
n_positive = len(positive_improvements)
n_negative = len(negative_improvements)
best_row = df_individual.iloc[0]
worst_row = df_individual.iloc[-1]

print("Individual Feature Analysis:")
print(f"Features with positive impact: {n_positive} ({100*n_positive/n_tested:.1f}%)")
print(f"Features with negative/neutral impact: {n_negative} ({100*n_negative/n_tested:.1f}%)")
print(f"Best individual feature: {best_row['feature']} (+{best_row['improvement']:.4f})")
print(f"Worst individual feature: {worst_row['feature']} ({worst_row['improvement']:.4f})")

ranking_columns = ['feature', 'group', 'improvement', 'improvement_pct']

print("Top 15 Features to Add to Backbone:")
print(df_individual.head(15)[ranking_columns].to_string(index=False))

print("Bottom 10 Features (Avoid Adding):")
print(df_individual.tail(10)[ranking_columns].to_string(index=False))

# Group-level summary of the per-feature changes
print("Group-Level Performance (Individual Additions):")
group_summary = df_individual.groupby('group').agg({
    'improvement': ['mean', 'std', 'min', 'max', 'count'],
}).round(4)
group_summary.columns = ['Mean Δ', 'Std Δ', 'Min Δ', 'Max Δ', 'N Features']
print(group_summary.to_string())


# VISUALIZATION 1: Individual Feature Contributions
#
# Builds a 2x2 figure from df_individual (sorted by 'improvement', descending):
#   A) every feature's F1 delta, B) the 20 highest, C) the 20 lowest,
#   D) box plot of the deltas per feature group.
# The figure is written to <OUTPUT_DIR>/plots/individual_feature_contributions.png.


print("Creating individual feature contribution visualization...")

fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# Define colors
group_colors = {
    'Entity Cohesion': '#ff7f0e',
    'Semantic Cohesion': '#2ca02c',
    'Topic Coherence': '#d62728'
}
# Features that match none of the known groups are labelled 'Unknown' upstream;
# give them a neutral grey instead of failing with a KeyError on the lookup.
fallback_group_color = '#7f7f7f'

# Plot 1: All features sorted by improvement
ax1 = axes[0, 0]
colors_all = [group_colors.get(g, fallback_group_color) for g in df_individual['group']]
y_pos = np.arange(len(df_individual))
ax1.barh(y_pos, df_individual['improvement'], color=colors_all, alpha=0.7)
ax1.axvline(x=0, color='black', linestyle='-', linewidth=1)
ax1.set_xlabel('F1 Improvement over Backbone', fontsize=11, fontweight='bold')
ax1.set_ylabel('Feature Index (sorted by improvement)', fontsize=11, fontweight='bold')
ax1.set_title('A) All Features: Individual Contribution to Backbone\n(Sorted by F1 Improvement)',
              fontsize=12, fontweight='bold')
ax1.grid(True, alpha=0.3, axis='x')

# Plot 2: Top 20 features
# NOTE(review): head(20) is not filtered by sign, so this panel can contain
# features with a negative delta even though its title says "Positive Impact".
ax2 = axes[0, 1]
# Re-sort ascending so the largest bar ends up at the top of the horizontal chart.
top_20 = df_individual.head(20).sort_values('improvement', ascending=True)
colors_top20 = [group_colors.get(g, fallback_group_color) for g in top_20['group']]
y_pos_top20 = np.arange(len(top_20))
ax2.barh(y_pos_top20, top_20['improvement'], color=colors_top20, alpha=0.85)
ax2.set_yticks(y_pos_top20)
ax2.set_yticklabels(top_20['feature'], fontsize=9)
ax2.set_xlabel('F1 Improvement', fontsize=11, fontweight='bold')
ax2.set_title('B) Top 20 Features Worth Adding\n(Positive Impact)',
              fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='x')
# Add value labels just past the end of each bar; negative bars get the label
# on their left side so the text does not overlap the bar.
for yval, val in zip(y_pos_top20, top_20['improvement']):
    if val >= 0:
        ax2.text(val + 0.0001, yval, f'{val:.4f}', va='center', ha='left', fontsize=8)
    else:
        ax2.text(val - 0.0001, yval, f'{val:.4f}', va='center', ha='right', fontsize=8)

# Plot 3: Bottom 20 features
ax3 = axes[1, 0]
bottom_20 = df_individual.tail(20).sort_values('improvement', ascending=True)
colors_bottom20 = [group_colors.get(g, fallback_group_color) for g in bottom_20['group']]
y_pos_bottom20 = np.arange(len(bottom_20))
ax3.barh(y_pos_bottom20, bottom_20['improvement'], color=colors_bottom20, alpha=0.85)
ax3.set_yticks(y_pos_bottom20)
ax3.set_yticklabels(bottom_20['feature'], fontsize=9)
ax3.set_xlabel('F1 Improvement', fontsize=11, fontweight='bold')
ax3.set_title('C) Bottom 20 Features to Avoid\n(Negative/Neutral Impact)',
              fontsize=12, fontweight='bold')
ax3.axvline(x=0, color='black', linestyle='-', linewidth=1)
ax3.grid(True, alpha=0.3, axis='x')

# Plot 4: Group comparison (box plot)
# Only the three named groups are compared; 'Unknown' features are left out here.
ax4 = axes[1, 1]
box_groups = ['Entity Cohesion', 'Semantic Cohesion', 'Topic Coherence']
group_data = [df_individual[df_individual['group'] == g]['improvement'].values
              for g in box_groups]
bp = ax4.boxplot(group_data, patch_artist=True, showmeans=True)
# Set tick labels explicitly instead of via boxplot(labels=...), which newer
# matplotlib releases deprecate; boxes are drawn at positions 1..N by default.
ax4.set_xticks(range(1, len(box_groups) + 1))
ax4.set_xticklabels(['Entity\nCohesion', 'Semantic\nCohesion', 'Topic\nCoherence'])
for patch, color in zip(bp['boxes'], [group_colors[g] for g in box_groups]):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax4.axhline(y=0, color='black', linestyle='--', linewidth=1, alpha=0.5)
ax4.set_ylabel('F1 Improvement', fontsize=11, fontweight='bold')
ax4.set_title('D) Feature Group Distribution\n(Individual Contributions)',
              fontsize=12, fontweight='bold')
ax4.grid(True, alpha=0.3, axis='y')

plt.suptitle(f'Individual Feature Contributions to Backbone (F1={backbone_f1:.4f})\nWhich Features Should You Add?',
             fontsize=15, fontweight='bold', y=0.995)
plt.tight_layout()
individual_plot_path = os.path.join(OUTPUT_DIR, 'plots', 'individual_feature_contributions.png')
# Make sure the output folder exists before writing the image.
os.makedirs(os.path.dirname(individual_plot_path), exist_ok=True)
plt.savefig(individual_plot_path, dpi=300, bbox_inches='tight')
# Close this specific figure so it is released even if another figure is current.
plt.close(fig)

print(f"  Saved: {individual_plot_path}")


# PROGRESSIVE FEATURE ADDITION
#
# Greedy forward selection on top of the backbone features: at every step each
# remaining cohesion feature is tried on top of the current set, scored with
# 5-fold cross-validated F1, and the single best-scoring feature is kept.
# Results are collected in progressive_results (one dict per step) and written
# to <OUTPUT_DIR>/tables/progressive_feature_selection.csv.


print("Building optimal feature set progressively...")

# Start with backbone
current_features = backbone_features_kept.copy()
remaining_features = all_cohesion.copy()
progressive_results = []

# Initial baseline
progressive_results.append({
    'step': 0,
    'feature_added': 'Backbone Only',
    'f1_mean': backbone_f1,
    'f1_std': backbone_scores.std(),
    'improvement': 0.0
})

print(f"\n  Step 0: Backbone Only")
print(f"    F1: {backbone_f1:.4f}")


MAX_ADDITIONS = 20
# Minimum F1 gain a step must deliver for the search to continue.
MIN_IMPROVEMENT = 0.0001

# Hyperparameters are identical for every candidate model, so define them once.
progressive_xgb_params = dict(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.15,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)

for step in range(1, MAX_ADDITIONS + 1):
    # Nothing left to try: stop instead of appending a None "feature".
    if not remaining_features:
        print(f"\n  Step {step}: no remaining features to test")
        print(f"    → Stopping: candidate pool exhausted")
        break

    print(f"\n  Step {step}: Testing {len(remaining_features)} remaining features...")

    best_f1 = -1
    best_feature = None
    best_std = 0

    # Test each remaining feature
    for feat in remaining_features:
        test_features = current_features + [feat]
        X_test = df_imputed[test_features].values

        xgb_test = XGBClassifier(**progressive_xgb_params)

        test_scores = cross_val_score(xgb_test, X_test, y_target, cv=5, scoring='f1')
        test_f1 = test_scores.mean()

        # A NaN mean never compares greater, so such candidates are never selected.
        if test_f1 > best_f1:
            best_f1 = test_f1
            best_feature = feat
            best_std = test_scores.std()

    # No candidate produced a comparable score (e.g. every mean was NaN).
    if best_feature is None:
        print(f"    → Stopping: no candidate produced a valid F1 score")
        break

    # Add best feature to current set
    current_features.append(best_feature)
    remaining_features.remove(best_feature)

    # Calculate improvement relative to the previous step's F1
    prev_f1 = progressive_results[-1]['f1_mean']
    improvement = best_f1 - prev_f1

    progressive_results.append({
        'step': step,
        'feature_added': best_feature,
        'f1_mean': best_f1,
        'f1_std': best_std,
        'improvement': improvement
    })

    print(f"    Best feature: {best_feature}")
    print(f"    F1: {best_f1:.4f} (±{best_std:.4f})")
    # '+' format flag instead of a literal '+', which printed "+-0.0010" for losses.
    print(f"    Improvement: {improvement:+.4f}")

    # Stop if improvement is negligible
    # NOTE(review): the threshold is checked after the feature has already been
    # appended to current_features and progressive_results, so the final set and
    # table include the step that triggered the stop — confirm this is intended.
    if improvement < MIN_IMPROVEMENT:
        print(f"    → Stopping: improvement below threshold")
        break

df_progressive = pd.DataFrame(progressive_results)

# Save results
progressive_path = os.path.join(OUTPUT_DIR, 'tables', 'progressive_feature_selection.csv')
# Create the target folder if it is missing so the CSV write cannot fail.
os.makedirs(os.path.dirname(progressive_path), exist_ok=True)
df_progressive.to_csv(progressive_path, index=False)
print(f"\nProgressive selection results saved: {progressive_path}")

print("Progressive Addition Results:")
print(df_progressive[['step', 'feature_added','f1_mean', 'improvement']].to_string(index=False))

Backbone-only F1: 0.8355 (±0.0117)
Testing 24 cohesion features individually...


Testing features: 100%|██████████| 24/24 [03:54<00:00,  9.77s/it]



Individual feature results saved: /content/drive/MyDrive/Tesi Magistrale/cohesion_analysis/tables/individual_feature_contributions.csv
Individual Feature Analysis:
Features with positive impact: 10 (41.7%)
Features with negative/neutral impact: 14 (58.3%)
Best individual feature: mean_nonadjacent_similarity (+0.0048)
Worst individual feature: topic_return_rate (-0.0032)
Top 15 Features to Add to Backbone:
                          feature             group  improvement  improvement_pct
      mean_nonadjacent_similarity Semantic Cohesion     0.004845         0.579926
semantic_graph_isolated_sentences Semantic Cohesion     0.004608         0.551536
          semantic_average_degree Semantic Cohesion     0.004469         0.534844
           semantic_graph_density Semantic Cohesion     0.003888         0.465291
        dominant_topic_proportion   Topic Coherence     0.003345         0.400417
  mean_adjacent_cosine_similarity Semantic Cohesion     0.002367         0.283334
  semantic_large

KeyboardInterrupt: 