In [None]:
#imports and path configuration

import sys
import os
from pathlib import Path

# project root is the directory the notebook is launched from; every
# extractor package lives in its own folder under Features-Extractions
ROOT_PATH = Path(".").resolve()
FEATURE_PATHS = [
    ROOT_PATH / "Features-Extractions" / extractor_dir
    for extractor_dir in (
        "Metacognition",
        "Coreference",
        "Coherence",
        "Perplexity",
        "Stylometric",
        "Temporal Reasoning",
    )
]

# put each extractor folder at the front of sys.path exactly once so the
# extractor modules can be imported by bare name in later cells
for path in FEATURE_PATHS:
    if str(path) in sys.path:
        continue
    sys.path.insert(0, str(path))

# standard imports
import pandas as pd
import numpy as np
import torch
import warnings
# silence every python warning for the rest of the session
warnings.filterwarnings('ignore')

# device setup: use cuda when torch can see a gpu, otherwise the cpu
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
    print(f"device: {DEVICE}")
    print(f"gpu: {torch.cuda.get_device_name(0)}")
else:
    DEVICE = torch.device('cpu')
    print(f"device: {DEVICE}")

# paths for lexicons and temporal model
LEXICON_PATH = ROOT_PATH.joinpath("Features-Extractions", "Metacognition", "jsons")
TEMPORAL_MODEL_PATH = ROOT_PATH.joinpath(
    "models", "Temporal Reasoning", "temporal-pairwise-model", "roberta_matres_binary"
)

# output folder is created next to the notebook if it does not exist yet
OUTPUT_PATH = ROOT_PATH.joinpath("output")
OUTPUT_PATH.mkdir(exist_ok=True)

In [None]:
from pathlib import Path
import pandas as pd


def load_input_data(input_path: str) -> pd.DataFrame:
    """Load documents from a csv, json, jsonl, or txt file.

    The format is chosen from the file extension (case-insensitive):

    * ``.csv``   - read with ``pd.read_csv``
    * ``.json``  - read with ``pd.read_json``
    * ``.jsonl`` - JSON Lines, one object per line (``pd.read_json(lines=True)``)
    * ``.txt``   - one document per non-blank line, whitespace-stripped

    Args:
        input_path: path to the input file.

    Returns:
        DataFrame guaranteed to contain an ``id`` and a ``text`` column.
        A missing ``id`` column is generated as ``doc_00000``, ``doc_00001``, ...;
        a missing ``text`` column is copied from ``generation`` when present.

    Raises:
        ValueError: if the extension is unsupported, or if neither a
            ``text`` nor a ``generation`` column exists.
    """
    input_path = Path(input_path)
    suffix = input_path.suffix.lower()

    if suffix == '.csv':
        df = pd.read_csv(input_path)
    elif suffix == '.json':
        df = pd.read_json(input_path)
    elif suffix == '.jsonl':
        # JSON Lines: every line is an independent json object
        df = pd.read_json(input_path, lines=True)
    elif suffix == '.txt':
        with open(input_path, 'r', encoding='utf-8') as f:
            texts = [line.strip() for line in f if line.strip()]
        df = pd.DataFrame({'text': texts})
    else:
        raise ValueError(f"unsupported file format: {suffix}")

    # ensure id column
    if 'id' not in df.columns:
        df['id'] = [f'doc_{i:05d}' for i in range(len(df))]

    # ensure text column
    if 'text' not in df.columns:
        if 'generation' in df.columns:
            df['text'] = df['generation']
        else:
            raise ValueError("no 'text' column found")

    print(f"loaded {len(df)} documents")
    return df


# input your file path here (sanitize quotes)
# note: input() blocks until the user answers, so this cell cannot run unattended
raw_input_path = input("enter path to input file (csv/json/txt): ").strip()

# remove one pair of wrapping quotes, but only when the opening and closing
# quote are the same character and the string is long enough to hold both;
# a lone quote or a mismatched pair is left untouched instead of being mangled
if (
    len(raw_input_path) >= 2
    and raw_input_path[0] == raw_input_path[-1]
    and raw_input_path[0] in ("'", '"')
):
    INPUT_FILE = raw_input_path[1:-1]
else:
    INPUT_FILE = raw_input_path

df_input = load_input_data(INPUT_FILE)
print(df_input.head(2))

In [None]:
#load spacy models

import spacy

def load_spacy():
    """Load the ``en_core_web_trf`` spaCy pipeline, downloading it if missing.

    When ``DEVICE`` is cuda, gpu execution is requested *before* the pipeline
    is loaded: spaCy only places a pipeline on the gpu if ``require_gpu`` was
    called ahead of ``spacy.load``. A failed gpu request is reported and the
    pipeline is then loaded on the cpu (best effort, never fatal).

    Returns:
        the loaded spaCy ``Language`` pipeline.
    """
    print("loading spacy model...")

    if DEVICE.type == 'cuda':
        try:
            spacy.require_gpu()
            print("spacy gpu enabled")
        except Exception as exc:
            # keep going on cpu, but say why instead of hiding the failure
            print(f"spacy gpu unavailable, falling back to cpu: {exc}")

    try:
        nlp = spacy.load("en_core_web_trf")
    except OSError:
        # spacy.load raises OSError when the model package is not installed
        print("downloading en_core_web_trf...")
        from spacy.cli import download
        download("en_core_web_trf")
        nlp = spacy.load("en_core_web_trf")

    print("spacy loaded")
    return nlp

nlp = load_spacy()

In [None]:
#extract stylometric features (base_ prefix)


from stylometric_extractor import (
    load_nlp_resources, extract_stylometric_features, 
    impute_missing_features
)
from tqdm import tqdm

print("extracting stylometric features...")

# load phoneme resources (the first returned value is not needed here)
_, cmu_dict, g2p_model = load_nlp_resources()

def extract_base_features(df: pd.DataFrame) -> pd.DataFrame:
    """Compute stylometric features for every document, prefixed with ``base_``.

    Args:
        df: frame with a ``text`` column (cast to str) and an ``id`` column.

    Returns:
        one row per document: ``id`` plus every key returned by
        ``extract_stylometric_features`` renamed to ``base_<key>``.
    """
    texts = df['text'].astype(str).tolist()
    doc_ids = df['id'].tolist()

    rows = []
    progress = tqdm(nlp.pipe(texts, batch_size=8), total=len(texts), desc="base features")
    # the progress iterator goes first in zip so it is the one that ends the loop
    for seen, (doc, doc_id, text) in enumerate(zip(progress, doc_ids, texts), start=1):
        feats = extract_stylometric_features(doc, text, cmu_dict, g2p_model)
        feats['id'] = doc_id

        # id stays unprefixed; every other key gets the base_ prefix
        record = {'id': doc_id}
        record.update({f'base_{name}': value for name, value in feats.items() if name != 'id'})
        rows.append(record)

        # release cached gpu memory every 50 documents
        if DEVICE.type == 'cuda' and seen % 50 == 0:
            torch.cuda.empty_cache()

    return pd.DataFrame(rows)

base_features_df = extract_base_features(df_input)
print("extracted base features")

In [None]:
#extract perplexity features (perp_ prefix)


from perplexity_extractor import (
    load_language_model, compute_log_perplexity, 
    compute_token_log_probabilities, create_perturbation
)

print("loading pythia model for perplexity...")
perp_model, perp_tokenizer = load_language_model(DEVICE)

def extract_perp_features(df: pd.DataFrame) -> pd.DataFrame:
    """extract perplexity features with perp_ prefix

    For every row of ``df`` (columns ``id`` and ``text``) the following
    values are computed with ``perp_model`` / ``perp_tokenizer``:

    * ``perp_doc_perplexity``: ``compute_log_perplexity`` of the whole text
    * ``perp_mean_sentence_perplexity`` / ``perp_sentence_perplexity_variance``:
      mean and variance of the finite per-sentence values (spaCy sentences)
    * ``perp_token_probability_entropy``: ``-sum(p * log p)`` over the token
      log-probabilities returned by ``compute_token_log_probabilities``
    * ``perp_curvature``: document value minus mean sentence value
    * ``perp_burstiness``: max / min of the positive sentence values
    * ``perp_trajectory_slope``: slope of a degree-1 fit over sentence position
    * ``perp_perturbation_discrepancy``: document value minus the mean of five
      perturbed-text values, divided by their std when that std is positive

    Values that cannot be computed are stored as ``np.nan``.

    Returns:
        one row per document with ``id`` and the ``perp_*`` columns.
    """
    all_features = []

    # enumerate supplies a positional counter for the cache-clearing cadence;
    # the dataframe index label is ignored because it may be non-integer or
    # non-contiguous (e.g. after filtering)
    for idx, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df), desc="perp features")):
        doc_id = row['id']
        text = str(row['text'])

        features = {'id': doc_id}

        # document perplexity
        doc_ppl = compute_log_perplexity(text, perp_model, perp_tokenizer, DEVICE)
        features['perp_doc_perplexity'] = doc_ppl

        # sentence-level stats (only finite values are kept)
        doc = nlp(text)
        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

        sent_ppls = []
        for sent in sentences:
            ppl = compute_log_perplexity(sent, perp_model, perp_tokenizer, DEVICE)
            if not np.isnan(ppl) and np.isfinite(ppl):
                sent_ppls.append(ppl)

        features['perp_mean_sentence_perplexity'] = np.mean(sent_ppls) if sent_ppls else np.nan
        features['perp_sentence_perplexity_variance'] = np.var(sent_ppls) if len(sent_ppls) > 1 else np.nan

        # token probability entropy
        token_log_probs = compute_token_log_probabilities(text, perp_model, perp_tokenizer, DEVICE)
        if len(token_log_probs) > 0:
            token_probs = np.exp(token_log_probs)
            features['perp_token_probability_entropy'] = -np.sum(token_probs * token_log_probs)
        else:
            features['perp_token_probability_entropy'] = np.nan

        # curvature
        if not np.isnan(doc_ppl) and not np.isnan(features['perp_mean_sentence_perplexity']):
            features['perp_curvature'] = doc_ppl - features['perp_mean_sentence_perplexity']
        else:
            features['perp_curvature'] = np.nan

        # burstiness (needs at least two positive values so min is non-zero)
        if len(sent_ppls) > 1:
            valid = [p for p in sent_ppls if np.isfinite(p) and p > 0]
            features['perp_burstiness'] = np.max(valid) / np.min(valid) if len(valid) > 1 else np.nan
        else:
            features['perp_burstiness'] = np.nan

        # trajectory slope
        if len(sent_ppls) > 2:
            try:
                slope, _ = np.polyfit(range(len(sent_ppls)), sent_ppls, 1)
                features['perp_trajectory_slope'] = slope
            except Exception:
                # a failed fit is recorded as missing; Exception (not a bare
                # except) so Ctrl-C still interrupts a long run
                features['perp_trajectory_slope'] = np.nan
        else:
            features['perp_trajectory_slope'] = np.nan

        # perturbation discrepancy (only for texts longer than 20 whitespace tokens)
        if not np.isnan(doc_ppl) and len(text.split()) > 20:
            pert_ppls = []
            for _ in range(5):
                pert_text = create_perturbation(text)
                pert_ppl = compute_log_perplexity(pert_text, perp_model, perp_tokenizer, DEVICE)
                if not np.isnan(pert_ppl):
                    pert_ppls.append(pert_ppl)
            if len(pert_ppls) >= 2:
                mean_pert, std_pert = np.mean(pert_ppls), np.std(pert_ppls)
                features['perp_perturbation_discrepancy'] = (doc_ppl - mean_pert) / std_pert if std_pert > 0 else doc_ppl - mean_pert
            else:
                features['perp_perturbation_discrepancy'] = np.nan
        else:
            features['perp_perturbation_discrepancy'] = np.nan

        all_features.append(features)

        # release cached gpu memory every 20 documents
        if DEVICE.type == 'cuda' and (idx + 1) % 20 == 0:
            torch.cuda.empty_cache()

    return pd.DataFrame(all_features)

perp_features_df = extract_perp_features(df_input)
print(f"extracted perplexity features")

In [None]:

#extract metacognition features


from metacognition_extractor import (
    load_lexicons, build_all_lookups, extract_metacognition_features
)

print("loading metacognition lexicons...")
# the lexicons are loaded with the metacognition folder as working directory
# (presumably load_lexicons resolves its files relative to the cwd - confirm)
os.chdir(str(ROOT_PATH / "Features-Extractions" / "Metacognition"))
try:
    meta_lexicons = load_lexicons()
    meta_lookups = build_all_lookups(meta_lexicons)
finally:
    # always return to the project root, even when loading fails, so later
    # cells that use relative paths are not left in the wrong directory
    os.chdir(str(ROOT_PATH))

def extract_meta_features(df: pd.DataFrame) -> pd.DataFrame:
    """Compute metacognition features for every document, prefixed with ``meta_``.

    Args:
        df: frame with a ``text`` column (cast to str) and an ``id`` column.

    Returns:
        one row per document: ``id`` plus every key returned by
        ``extract_metacognition_features`` (except ``id``) as ``meta_<key>``.
    """
    texts = df['text'].astype(str).tolist()
    doc_ids = df['id'].tolist()

    rows = []
    progress = tqdm(nlp.pipe(texts, batch_size=8), total=len(texts), desc="meta features")
    # the progress iterator goes first in zip so it is the one that ends the loop
    for seen, (doc, doc_id) in enumerate(zip(progress, doc_ids), start=1):
        feats = extract_metacognition_features(doc, doc_id, meta_lookups)

        # id stays unprefixed; every other key gets the meta_ prefix
        record = {'id': doc_id}
        record.update({f'meta_{name}': value for name, value in feats.items() if name != 'id'})
        rows.append(record)

        # release cached gpu memory every 50 documents
        if DEVICE.type == 'cuda' and seen % 50 == 0:
            torch.cuda.empty_cache()

    return pd.DataFrame(rows)

meta_features_df = extract_meta_features(df_input)
print("extracted metacognitive features")

In [None]:
#extract calibration features (calib_ prefix)


from calibration_extractor import (
    extract_sentence_densities, compute_correlation_feature
)
from scipy.stats import pearsonr

def _safe_pearson(densities, perplexities) -> float:
    """pearson r of two per-sentence series, 0.0 whenever it is undefined"""
    # needs at least five paired observations and a non-constant density series
    if len(perplexities) < 5 or not np.var(densities) > 0:
        return 0.0
    try:
        r, _ = pearsonr(densities, perplexities)
        return 0.0 if np.isnan(r) else r
    except Exception:
        # best effort: a failed correlation counts as "no correlation";
        # Exception (not a bare except) so Ctrl-C still interrupts the run
        return 0.0


def extract_calib_features(df: pd.DataFrame, meta_df: pd.DataFrame, perp_df: pd.DataFrame) -> pd.DataFrame:
    """extract calibration features with calib_ prefix

    Relates per-sentence perplexity to hedge / booster densities and to
    document-level metacognition and perplexity features.

    Args:
        df: documents with ``id`` and ``text`` columns.
        meta_df: metacognition features keyed by ``id``; the columns
            ``meta_certainty_overall`` and ``meta_reformulation_density``
            are read when present.
        perp_df: perplexity features keyed by ``id``; the columns
            ``perp_doc_perplexity`` and ``perp_sentence_perplexity_variance``
            are read when present.

    Returns:
        one row per document with ``id`` and five ``calib_*`` columns.
        Documents with fewer than five sentences get neutral defaults
        (0.0 everywhere, 1.0 for the spike ratio).

    Note:
        reads the notebook globals ``meta_lookups``, ``nlp``, ``perp_model``,
        ``perp_tokenizer`` and ``DEVICE``.
    """
    # build hedge/booster sets from lookups
    hedges = meta_lookups.get('hedges', {})
    hedge_set = set(hedges.keys()) if isinstance(hedges, dict) else set()
    boosters = meta_lookups.get('boosters', {})
    booster_set = set(boosters.keys()) if isinstance(boosters, dict) else set()

    # neutral values used for documents that are too short to analyse
    neutral_features = {
        'calib_hedge_perplexity_correlation': 0.0,
        'calib_booster_perplexity_anticorrelation': 0.0,
        'calib_metacog_spike_perplexity_ratio': 1.0,
        'calib_certainty_perplexity_alignment': 0.0,
        'calib_reformulation_complexity_match': 0.0,
    }

    all_features = []
    texts = df['text'].astype(str).tolist()
    doc_ids = df['id'].tolist()

    for i, doc in enumerate(tqdm(nlp.pipe(texts, batch_size=8), total=len(texts), desc="calib features")):
        doc_id = doc_ids[i]
        features = {'id': doc_id}

        sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

        if len(sentences) < 5:
            features.update(neutral_features)
            all_features.append(features)
            continue

        # per-sentence perplexity and densities; sentences whose perplexity
        # is nan are dropped from all three series so they stay aligned
        sent_ppls, hedge_dens, booster_dens = [], [], []
        for sent in sentences:
            ppl = compute_log_perplexity(sent, perp_model, perp_tokenizer, DEVICE)
            if not np.isnan(ppl):
                sent_ppls.append(ppl)
                hedge_dens.append(extract_sentence_densities(sent, hedge_set))
                booster_dens.append(extract_sentence_densities(sent, booster_set))

        # correlations
        features['calib_hedge_perplexity_correlation'] = _safe_pearson(hedge_dens, sent_ppls)
        features['calib_booster_perplexity_anticorrelation'] = _safe_pearson(booster_dens, sent_ppls)

        # spike ratio: highest sentence value over the mean, capped at 2.5
        if sent_ppls:
            baseline = np.mean(sent_ppls)
            features['calib_metacog_spike_perplexity_ratio'] = min(np.max(sent_ppls) / baseline, 2.5) if baseline > 0 else 1.0
        else:
            features['calib_metacog_spike_perplexity_ratio'] = 1.0

        # document-level rows; both must exist for the last two features
        meta_row = meta_df[meta_df['id'] == doc_id]
        perp_row = perp_df[perp_df['id'] == doc_id]
        has_doc_rows = not meta_row.empty and not perp_row.empty

        # certainty-perplexity alignment
        features['calib_certainty_perplexity_alignment'] = 0.0
        if has_doc_rows:
            cert = meta_row['meta_certainty_overall'].values[0] if 'meta_certainty_overall' in meta_row.columns else 0.5
            doc_ppl = perp_row['perp_doc_perplexity'].values[0] if 'perp_doc_perplexity' in perp_row.columns else 1.0
            if not np.isnan(doc_ppl):
                perp_norm = 1 / (1 + doc_ppl)
                features['calib_certainty_perplexity_alignment'] = 1 - abs(cert - perp_norm)

        # reformulation-complexity match (ignored when variance is below 0.01)
        features['calib_reformulation_complexity_match'] = 0.0
        if has_doc_rows:
            reform = meta_row['meta_reformulation_density'].values[0] if 'meta_reformulation_density' in meta_row.columns else 0.0
            sent_var = perp_row['perp_sentence_perplexity_variance'].values[0] if 'perp_sentence_perplexity_variance' in perp_row.columns else 0.0
            if not np.isnan(sent_var) and sent_var >= 0.01:
                features['calib_reformulation_complexity_match'] = reform * sent_var

        all_features.append(features)

    return pd.DataFrame(all_features)

calib_features_df = extract_calib_features(df_input, meta_features_df, perp_features_df)
print(f"extracted calibration features")

In [None]:
#extract coreference features (coref_ prefix)

from fastcoref import FCoref

from coreference_extraction_local import (
    load_models as load_coref_models, extract_all_features as extract_coref_all
)

print("loading coreference model...")
coref_model, coref_nlp = load_coref_models(str(DEVICE))

def extract_coref_features(df: pd.DataFrame) -> pd.DataFrame:
    """extract coreference features with coref_ prefix

    Args:
        df: frame with ``id`` and ``text`` columns (text is cast to str).

    Returns:
        one row per document: ``id`` plus every key returned by
        ``extract_coref_all`` (except ``id``) renamed to ``coref_<key>``.
    """
    all_features = []

    # enumerate supplies a positional counter for the cache-clearing cadence;
    # the dataframe index label may be non-integer or non-contiguous
    for idx, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df), desc="coref features")):
        doc_id = row['id']
        text = str(row['text'])

        feats = extract_coref_all(text, coref_model, coref_nlp)

        # rename with coref_ prefix; an 'id' key is skipped, matching the
        # other extractors, so no stray 'coref_id' column can appear
        renamed = {'id': doc_id}
        for k, v in feats.items():
            if k != 'id':
                renamed[f'coref_{k}'] = v
        all_features.append(renamed)

        # release cached gpu memory every 20 documents
        if DEVICE.type == 'cuda' and (idx + 1) % 20 == 0:
            torch.cuda.empty_cache()

    return pd.DataFrame(all_features)

coref_features_df = extract_coref_features(df_input)
print(f"extracted {len(coref_features_df.columns) - 1} coref features")

In [None]:
#extract coherence features (coher_ prefix)

from coherence_extractor import (
    load_sentence_transformer, extract_entities_from_doc,
    compute_entity_features, compute_semantic_features,
    create_empty_entity_features, create_empty_semantic_features
)
from sentence_transformers import SentenceTransformer

print("loading sentence-bert...")
sbert = load_sentence_transformer()

def extract_coher_features(df: pd.DataFrame) -> pd.DataFrame:
    """Compute entity and semantic coherence features, prefixed with ``coher_``.

    Args:
        df: frame with a ``text`` column (cast to str) and an ``id`` column.

    Returns:
        one row per document: ``id`` plus the entity features followed by the
        semantic features, each key (except ``id``) renamed to ``coher_<key>``.
        Empty feature sets are used when a document has fewer than two
        sentences (or no entities, for the entity features).
    """
    texts = df['text'].astype(str).tolist()
    doc_ids = df['id'].tolist()

    rows = []
    progress = tqdm(nlp.pipe(texts, batch_size=8), total=len(texts), desc="coher features")
    # the progress iterator goes first in zip so it is the one that ends the loop
    for seen, (doc, doc_id) in enumerate(zip(progress, doc_ids), start=1):
        # entity features need at least two sentences and one entity
        entity_info = extract_entities_from_doc(doc, doc_id)
        enough_entity_data = (
            len(entity_info['sentences']) >= 2
            and len(entity_info['all_entities']) > 0
        )
        entity_feats = (
            compute_entity_features(doc_id, entity_info)
            if enough_entity_data
            else create_empty_entity_features(doc_id)
        )

        # semantic features need at least two non-blank sentences
        sentences = []
        for sent in doc.sents:
            stripped = sent.text.strip()
            if stripped:
                sentences.append(stripped)

        if len(sentences) < 2:
            semantic_feats = create_empty_semantic_features(doc_id)
        else:
            embeddings = sbert.encode(sentences, batch_size=32, convert_to_numpy=True, show_progress_bar=False)
            semantic_feats = compute_semantic_features(doc_id, embeddings)

        # entity keys first, then semantic keys; id stays unprefixed
        record = {'id': doc_id}
        for name, value in (*entity_feats.items(), *semantic_feats.items()):
            if name == 'id':
                continue
            record[f'coher_{name}'] = value
        rows.append(record)

        # release cached gpu memory every 50 documents
        if DEVICE.type == 'cuda' and seen % 50 == 0:
            torch.cuda.empty_cache()

    return pd.DataFrame(rows)

coher_features_df = extract_coher_features(df_input)
print(f"extracted {len(coher_features_df.columns) - 1} coher features")

In [None]:
# temporal features extraction (temp_ prefix)

import sys
import subprocess
import gc
from pathlib import Path

# ensure temporal reasoning path is in sys.path
TEMPORAL_PATH = ROOT_PATH / "Features-Extractions" / "Temporal Reasoning"
if str(TEMPORAL_PATH) not in sys.path:
    sys.path.insert(0, str(TEMPORAL_PATH))

# import extraction functions from temporal_extraction_local.py
from temporal_extraction_local import (
    Config as ExtractionConfig,
    check_java,
    download_corenlp,
    start_corenlp_server,
    setup_stanza,
    load_temporal_classifier,
    extract_events_stanza,
    extract_temporal_expressions,
    extract_binary_relations,
    build_raw_temporal_graph,
    build_greedy_temporal_graph,
)

# import feature functions from temporal_features_local.py
from temporal_features_local import (
    Config as FeatureConfig,
    extract_event_structure_features,
    extract_relation_features,
    extract_graph_features,
    extract_constraint_features,
    extract_form_meaning_features,
    extract_graph_organization_features,
    create_empty_features,
)


# step 0: make sure java is available, installing default-jdk through
# apt-get when check_java() reports it missing
print("\n checking java installation")
if check_java():
    print("  java found")
else:
    print("  java not found, attempting installation...")
    try:
        # both commands raise CalledProcessError on a non-zero exit status
        subprocess.run(['apt-get', 'update'], check=True, capture_output=True)
        subprocess.run(
            ['apt-get', 'install', '-y', 'default-jdk'],
            check=True,
            capture_output=True,
        )
        print("  java installed successfully")
    except Exception as e:
        raise RuntimeError(f"failed to install java: {e}. please install java manually.")

# step 1: setup stanza
# the pipeline returned here is later passed to extract_events_stanza
print("\n initializing stanza...")
nlp_stanza = setup_stanza(DEVICE)

# step 2: setup corenlp for sutime
# corenlp is downloaded into <project root>/corenlp (the folder is created if
# missing) and a server is started from the directory download_corenlp returns;
# the resulting client is later passed to extract_temporal_expressions
print("\n  setting up stanford corenlp...")
CORENLP_PATH = ROOT_PATH / "corenlp"
CORENLP_PATH.mkdir(exist_ok=True)
corenlp_dir = download_corenlp(str(CORENLP_PATH))
nlp_corenlp = start_corenlp_server(corenlp_dir)
print(f"  corenlp server started from: {corenlp_dir}")

# step 3: load trained temporal classifier
# fail early with a clear message when the model directory is missing,
# before any document is processed
print("\n loading temporal classifier...")
if not TEMPORAL_MODEL_PATH.exists():
    raise FileNotFoundError(f"temporal model not found at: {TEMPORAL_MODEL_PATH}")
temporal_model, temporal_tokenizer = load_temporal_classifier(str(TEMPORAL_MODEL_PATH), DEVICE)
print(f"  model loaded from: {TEMPORAL_MODEL_PATH}")

# step 4: extract events, timex, and relations
print("\n[  extracting events, temporal expressions, and relations...")
# per-document results, all keyed by document id
texts_dict = {row['id']: str(row['text']) for _, row in df_input.iterrows()}
events_dict = {}
timex_dict = {}
relations_dict = {}

# enumerate supplies a positional counter for the cache-clearing cadence;
# the dataframe index label may be non-integer or non-contiguous
for idx, (_, row) in enumerate(tqdm(df_input.iterrows(), total=len(df_input), desc="extraction")):
    doc_id = row['id']
    text = str(row['text'])

    try:
        # extract events using stanza
        events = extract_events_stanza(nlp_stanza, text, doc_id)
        events_dict[doc_id] = events

        # extract temporal expressions using sutime
        timex = extract_temporal_expressions(nlp_corenlp, text, doc_id)
        timex_dict[doc_id] = timex

        # extract binary temporal relations (needs at least one event pair)
        if len(events) >= 2:
            relations = extract_binary_relations(
                temporal_model, temporal_tokenizer,
                text, events, doc_id, DEVICE
            )
            relations_dict[doc_id] = relations
        else:
            relations_dict[doc_id] = []

    except Exception as e:
        # a failing document is reported and recorded as empty in all three
        # dictionaries, discarding any partial result, so later steps still
        # find an entry for every document id
        print(f"  error {doc_id}: {e}")
        events_dict[doc_id] = []
        timex_dict[doc_id] = []
        relations_dict[doc_id] = []

    # release cached gpu memory every 50 documents
    if DEVICE.type == 'cuda' and (idx + 1) % 50 == 0:
        torch.cuda.empty_cache()

print(f"  events: {sum(len(e) for e in events_dict.values()):,}")
print(f"  timex: {sum(len(t) for t in timex_dict.values()):,}")
print(f"  relations: {sum(len(r) for r in relations_dict.values()):,}")

# step 5: build temporal graphs
# two graphs per document, keyed by document id: a raw graph built from the
# relations alone and a greedy graph built from events plus relations
print("\n  building temporal graphs...")
raw_graphs, dag_graphs = {}, {}

for doc_id, events in tqdm(events_dict.items(), desc="graph construction"):
    relations = relations_dict[doc_id]

    raw_graphs[doc_id] = build_raw_temporal_graph(relations)
    dag_graphs[doc_id] = build_greedy_temporal_graph(events, relations, doc_id)

# a builder signals "no graph" by returning None
non_empty_raw = sum(graph is not None for graph in raw_graphs.values())
non_empty_dag = sum(graph is not None for graph in dag_graphs.values())
print(f"  raw graphs: {non_empty_raw}, dag graphs: {non_empty_dag}")

# step 6: extract all 32 temporal features
# NOTE(review): the feature keys are stored exactly as the extractors return
# them; no temp_ prefix is added here - confirm the extractors already emit it
print("\n  extracting temporal features...")
all_features = []

for doc_id in tqdm(events_dict.keys(), desc="feature extraction"):
    try:
        events = events_dict.get(doc_id, [])
        timex_list = timex_dict.get(doc_id, [])
        relations = relations_dict.get(doc_id, [])
        raw_graph = raw_graphs.get(doc_id)
        dag_graph = dag_graphs.get(doc_id)
        text = texts_dict.get(doc_id, "")

        # extractor groups are evaluated top to bottom; a key returned by a
        # later group overrides the same key from an earlier one
        features = {
            'id': doc_id,
            **extract_event_structure_features(doc_id, events, timex_list, text),
            **extract_relation_features(relations, raw_graph),
            **extract_graph_features(dag_graph, raw_graph),
            **extract_constraint_features(relations, events),
            **extract_form_meaning_features(text, events, timex_list, relations, doc_id),
            **extract_graph_organization_features(dag_graph),
        }
    except Exception as e:
        # a failing document still gets a row, filled by create_empty_features
        print(f"  error {doc_id}: {e}")
        all_features.append(create_empty_features(doc_id))
    else:
        all_features.append(features)

temp_features_df = pd.DataFrame(all_features)

# cleanup
print("\ncleaning up...")
try:
    nlp_corenlp.close()
    print("  corenlp server closed")
except Exception as e:
    # best effort: a failed shutdown must not abort the notebook, but it is
    # reported so a leftover server process can be stopped by hand
    print(f"  could not close corenlp server: {e}")

# drop the temporal models and intermediate dictionaries so their memory
# can be reclaimed before the remaining cells run
del nlp_stanza, temporal_model, temporal_tokenizer
del events_dict, timex_dict, relations_dict, raw_graphs, dag_graphs
gc.collect()
if DEVICE.type == 'cuda':
    torch.cuda.empty_cache()


print(f"temporal features extracted")
print(f"\nfeature columns:")
for col in sorted([c for c in temp_features_df.columns if c != 'id']):
    print(f"  - {col}")

In [None]:

# merge all features


def merge_all_features(*dfs) -> pd.DataFrame:
    """Outer-join any number of feature frames on their ``id`` column.

    The frames are folded left to right, so the first frame's columns come
    first. With a single frame that frame is returned unchanged.
    """
    merged, position = dfs[0], 1
    while position < len(dfs):
        merged = merged.merge(dfs[position], on='id', how='outer')
        position += 1
    return merged

# merge all feature sets into one frame keyed by id
all_features_df = merge_all_features(
    base_features_df, perp_features_df, meta_features_df, calib_features_df,
    coref_features_df, coher_features_df, temp_features_df,
)

print(f"\nmerged dataset shape: {all_features_df.shape}")
# every column except id counts as a feature
print(f"total features: {all_features_df.shape[1] - 1}")

# feature category breakdown: column-name prefix -> readable category name
categories = dict([
    ('base_', 'stylometric (baseline)'),
    ('perp_', 'perplexity'),
    ('meta_', 'metacognition'),
    ('calib_', 'calibration'),
    ('coref_', 'coreference'),
    ('coher_', 'coherence'),
    ('temp_', 'temporal'),
])



In [None]:

#  handle missing values and save output


def clean_features(df: pd.DataFrame) -> pd.DataFrame:
    """clean and impute missing values

    Steps:
        1. replace +/-inf with nan (on a new frame; the input is not modified)
        2. print a report of the columns that contain missing values
           (first ten columns, then a count of the rest)
        3. fill missing values in numeric columns with the column median,
           or with 0.0 when the whole column is missing

    Non-numeric columns are left untouched.

    Args:
        df: merged feature frame.

    Returns:
        a cleaned copy of ``df``.
    """
    # replace inf with nan; replace() returns a new frame, so no copy is needed
    df = df.replace([np.inf, -np.inf], np.nan)

    # report missing
    nan_counts = df.isna().sum()
    nan_cols = nan_counts[nan_counts > 0]
    if len(nan_cols) > 0:
        print(f"columns with missing values: {len(nan_cols)}")
        for col in nan_cols.index[:10]:
            print(f"  {col}: {nan_cols[col]} ({100*nan_cols[col]/len(df):.1f}%)")
        if len(nan_cols) > 10:
            print(f"  ... and {len(nan_cols) - 10} more")

    # impute with median
    num_cols = df.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        if df[col].isna().any():
            median_val = df[col].median()
            # pd.isna also recognises pd.NA, which np.isnan cannot handle
            if pd.isna(median_val):
                median_val = 0.0
            # assign the filled column back: `df[col].fillna(..., inplace=True)`
            # works on a temporary object and silently does nothing under
            # pandas copy-on-write
            df[col] = df[col].fillna(median_val)

    return df

# clean features (inf -> nan, then median imputation)
all_features_clean = clean_features(all_features_df)

# save the feature table on its own
output_file = OUTPUT_PATH.joinpath("all_cognitive_features.csv")
all_features_clean.to_csv(output_file, index=False)
print(f"\nfeatures saved to: {output_file}")

# also save the original input rows with their features attached;
# the left join keeps every input row
output_merged = OUTPUT_PATH.joinpath("input_with_features.csv")
df_with_features = pd.merge(df_input, all_features_clean, on='id', how='left')
df_with_features.to_csv(output_merged, index=False)
print(f"merged data saved to: {output_merged}")

In [None]:
#  summary statistics


print(f"\ndocuments processed: {len(all_features_clean.index)}")

# summary stats for each category: columns are grouped by name prefix and
# categories without any matching column are skipped
print("\nfeature statistics by category:")
for prefix, name in categories.items():
    cols = [c for c in all_features_clean.columns if c.startswith(prefix)]
    if not cols:
        continue
    subset = all_features_clean.loc[:, cols]
    print(f"\n{name} ({len(cols)} features):")
    # smallest and largest of the per-column means
    print(f"  mean range: [{subset.mean().min():.4f}, {subset.mean().max():.4f}]")
    print(f"  nan count: {subset.isna().sum().sum()}")

# sample output
print("\nsample output (first 3 rows, first 10 columns):")
print(all_features_clean.head(3).iloc[:, :10].to_string())