# TALENT ArXiv Study v3 - All Analyses Dataset

**Dataset:**
- 5,173 analyses from 479 children
- All artifact types: text (51%), image (30%), musical (18%), audio, video, pdf
- 306 talent categories mapped to 7 bins
- Temporal split: 349 children with 2+ analyses for S1→S2 prediction

## Instructions:
1. Upload files to /content/:
   - `train.jsonl`, `val.jsonl`, `test.jsonl` (required)
   - `train_temporal.jsonl`, `test_temporal.jsonl` (optional, for temporal evaluation)
2. Runtime → Run all

## Models:
- Per-analysis: LogReg, LightGBM, LightGBM Calibrated
- Child-level: LogReg, LightGBM, LightGBM Calibrated, RandomForest
- Temporal: LogReg, LightGBM, RandomForest (predict S2 talents from S1 features)

## 1. Setup

In [None]:
!pip install -q lightgbm shap scikit-learn matplotlib seaborn pandas numpy

In [None]:
import json
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from typing import List, Dict, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_recall_fscore_support,
    roc_curve, classification_report
)
import lightgbm as lgb
import shap

print("Libraries loaded!")

## 2. Load Data

In [None]:
# Directory the JSONL splits are read from (train/val/test, plus the optional
# temporal files).
DATA_DIR = "/content/"

def load_jsonl(filepath):
    """Read a JSON Lines file and return its records as a list.

    Blank (whitespace-only) lines are skipped. The file is decoded as UTF-8
    explicitly so the result does not depend on the platform's default
    encoding.

    Args:
        filepath: path to the .jsonl file.

    Returns:
        list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Read the three required splits from DATA_DIR, in train / val / test order.
train_data, val_data, test_data = [
    load_jsonl(os.path.join(DATA_DIR, filename))
    for filename in ('train.jsonl', 'val.jsonl', 'test.jsonl')
]

# Pooled records, used below for dataset statistics and vocabulary discovery.
all_data = train_data + val_data + test_data

print(f"Loaded: train={len(train_data)}, val={len(val_data)}, test={len(test_data)}")
print(f"Total analyses: {len(all_data)}")

In [None]:
# Data statistics: tally the pooled records by analysis type, age group and gender.
analysis_types = defaultdict(int)
age_groups = defaultdict(int)
genders = defaultdict(int)

for record in all_data:
    for tally, label in (
        (analysis_types, record.get('analysis_type', 'unknown')),
        (age_groups, record.get('age_group', 'unknown')),
        # `or` also folds a present-but-falsy gender (e.g. None) into 'unknown'
        (genders, record.get('gender') or 'unknown'),
    ):
        tally[label] += 1

# Most frequent first for types and gender; age groups in key order.
types_by_count = sorted(analysis_types.items(), key=lambda kv: -kv[1])
ages_by_name = sorted(age_groups.items())
genders_by_count = sorted(genders.items(), key=lambda kv: -kv[1])

print("\nAnalysis types:")
for k, v in types_by_count:
    print(f"  {k}: {v} ({v/len(all_data)*100:.1f}%)")

print("\nAge groups:")
for k, v in ages_by_name:
    print(f"  {k}: {v}")

print("\nGender:")
for k, v in genders_by_count:
    print(f"  {k}: {v}")

## 3. Discover Features

In [None]:
# Sorted vocabulary of every category_scores key seen in any split
all_category_keys = sorted({
    key
    for record in all_data
    for key in record.get('category_scores', {}).keys()
})

# Sorted vocabulary of every key_talents entry seen in any split
all_key_talents = sorted({
    talent
    for record in all_data
    for talent in record.get('key_talents', [])
})

# Target bins (fixed list; also fixes the label column order)
BINS = ["academic", "sport", "art", "leadership", "service", "technology", "others"]

# Analysis types observed in the data; records without one count as 'unknown'
ANALYSIS_TYPES = sorted({record.get('analysis_type', 'unknown') for record in all_data})

print(f"Category scores: {len(all_category_keys)} unique keys")
print(f"Key talents: {len(all_key_talents)} unique")
print(f"Analysis types: {ANALYSIS_TYPES}")
print(f"Bins: {BINS}")

## 4. Feature Extraction (Per-Analysis)

In [None]:
# A bin counts as a positive label when its score is >= this threshold.
SCORE_THRESHOLD = 6.0
# Seed passed as random_state to the LogisticRegression, LightGBM and
# RandomForest estimators below.
RANDOM_STATE = 42

def extract_features_per_analysis(records):
    """
    Build the per-analysis feature matrix and multi-label targets.

    Relies on the module-level vocabularies ``all_category_keys``,
    ``all_key_talents`` and ``ANALYSIS_TYPES``, plus ``BINS`` and
    ``SCORE_THRESHOLD``.

    Feature layout (columns, in order):
    - one column per category_scores key, holding the raw score
    - key_talents one-hot
    - analysis_type one-hot

    Args:
        records: list of analysis dicts (as loaded from the JSONL splits).

    Returns:
        (X, y, feature_names): X is a float32 array of shape
        (len(records), n_features); y is the binary indicator matrix over
        BINS, where a bin is positive when its bin_scores value is
        >= SCORE_THRESHOLD; feature_names labels the columns of X.
    """
    n_samples = len(records)

    n_cat = len(all_category_keys)
    n_talents = len(all_key_talents)
    n_analysis = len(ANALYSIS_TYPES)

    total_features = n_cat + n_talents + n_analysis

    X = np.zeros((n_samples, total_features), dtype=np.float32)
    bin_labels = []

    # Column lookup tables; the three groups are laid out back to back.
    cat_to_idx = {c: i for i, c in enumerate(all_category_keys)}
    talent_to_idx = {t: n_cat + i for i, t in enumerate(all_key_talents)}
    analysis_to_idx = {a: n_cat + n_talents + i for i, a in enumerate(ANALYSIS_TYPES)}

    for i, rec in enumerate(records):
        # Category scores (keys outside the vocabulary are ignored)
        for cat, score in rec.get('category_scores', {}).items():
            if cat in cat_to_idx:
                X[i, cat_to_idx[cat]] = score

        # Key talents (one-hot)
        for talent in rec.get('key_talents', []):
            if talent in talent_to_idx:
                X[i, talent_to_idx[talent]] = 1.0

        # Analysis type. The fallback must be 'unknown', the same default used
        # when ANALYSIS_TYPES is built; otherwise records without a type would
        # be encoded as 'text'.
        atype = rec.get('analysis_type', 'unknown')
        if atype in analysis_to_idx:
            X[i, analysis_to_idx[atype]] = 1.0

        # Labels: every bin whose score reaches the threshold
        bin_scores = rec.get('bin_scores', {})
        positive_bins = [b for b, s in bin_scores.items() if s >= SCORE_THRESHOLD]
        bin_labels.append(positive_bins)

    # classes=BINS fixes the label column order regardless of the data
    mlb = MultiLabelBinarizer(classes=BINS)
    y = mlb.fit_transform(bin_labels)

    # Feature names, in the same order as the columns of X
    feature_names = (all_category_keys +
                     [f'talent_{t}' for t in all_key_talents] +
                     [f'atype_{a}' for a in ANALYSIS_TYPES])

    return X, y, feature_names

print("Extracting features (per-analysis)...")
# The feature names depend only on the global vocabularies, so they are kept
# from the train call and discarded for val/test.
per_analysis_splits = [
    extract_features_per_analysis(split)
    for split in (train_data, val_data, test_data)
]
(
    (X_train_pa, y_train_pa, feature_names),
    (X_val_pa, y_val_pa, _),
    (X_test_pa, y_test_pa, _),
) = per_analysis_splits

print(f"\nPer-Analysis Features:")
print(f"  Train: X={X_train_pa.shape}, y={y_train_pa.shape}")
print(f"  Val: X={X_val_pa.shape}, y={y_val_pa.shape}")
print(f"  Test: X={X_test_pa.shape}, y={y_test_pa.shape}")
print(f"  Total features: {len(feature_names)}")

In [None]:
# Positive-label counts per bin, for the train and test splits
print("\nLabel distribution (train):")
for bin_name, train_col, test_col in zip(BINS, y_train_pa.T, y_test_pa.T):
    pos_train, pos_test = train_col.sum(), test_col.sum()
    print(f"  {bin_name}: train={pos_train}, test={pos_test}")

## 5. Child-Level Aggregation

In [None]:
def extract_child_level_features(records):
    """
    Aggregate analyses into one feature row and one label set per child.

    Records are grouped by ``child_hash`` (falling back to ``child_id``).
    Relies on the module-level ``all_category_keys``, ``all_key_talents``,
    ``ANALYSIS_TYPES``, ``BINS`` and ``SCORE_THRESHOLD``.

    Feature layout per child (columns, in order):
    - mean, std, max of each category score over the analyses that report it
      (0 when the child has no score for that category)
    - union of key_talents (one-hot)
    - number of analyses
    - count of analyses per analysis type

    Labels: a bin is positive when the child's maximum bin_scores value for
    that bin is >= SCORE_THRESHOLD.

    Args:
        records: list of analysis dicts (as loaded from the JSONL splits).

    Returns:
        (child_ids, X, y, feature_names_child); rows follow the order in which
        children are first seen in ``records``.
    """
    by_child = defaultdict(list)
    for r in records:
        cid = r.get('child_hash', r.get('child_id'))
        by_child[cid].append(r)

    child_ids = []
    X_list = []
    y_list = []

    n_cat = len(all_category_keys)
    cat_to_idx = {c: i for i, c in enumerate(all_category_keys)}
    talent_to_idx = {t: i for i, t in enumerate(all_key_talents)}

    for cid, recs in by_child.items():
        cat_scores_all = defaultdict(list)
        bin_scores_all = defaultdict(list)
        key_talents_union = set()
        type_counts = defaultdict(int)

        for r in recs:
            for cat, score in r.get('category_scores', {}).items():
                cat_scores_all[cat].append(score)
            for b, score in r.get('bin_scores', {}).items():
                bin_scores_all[b].append(score)
            key_talents_union.update(r.get('key_talents', []))
            # Fallback must be 'unknown', the same default used when
            # ANALYSIS_TYPES is built; otherwise untyped analyses would be
            # counted as 'text'.
            type_counts[r.get('analysis_type', 'unknown')] += 1

        # Feature vector: mean, std, max of categories
        mean_cat = np.zeros(n_cat)
        std_cat = np.zeros(n_cat)
        max_cat = np.zeros(n_cat)

        for cat, scores in cat_scores_all.items():
            if cat in cat_to_idx:
                idx = cat_to_idx[cat]
                mean_cat[idx] = np.mean(scores)
                std_cat[idx] = np.std(scores) if len(scores) > 1 else 0
                max_cat[idx] = np.max(scores)

        # Key talents one-hot
        talents_vec = np.zeros(len(all_key_talents))
        for t in key_talents_union:
            if t in talent_to_idx:
                talents_vec[talent_to_idx[t]] = 1.0

        # Meta features
        n_analyses = len(recs)
        type_features = [type_counts.get(t, 0) for t in ANALYSIS_TYPES]

        x = np.concatenate([
            mean_cat,
            std_cat,
            max_cat,
            talents_vec,
            [n_analyses],
            type_features
        ])

        # Labels: max score per bin >= threshold
        labels = []
        for b in BINS:
            if b in bin_scores_all and max(bin_scores_all[b]) >= SCORE_THRESHOLD:
                labels.append(b)

        child_ids.append(cid)
        X_list.append(x)
        y_list.append(labels)

    X = np.array(X_list, dtype=np.float32)
    # classes=BINS fixes the label column order regardless of the data
    mlb = MultiLabelBinarizer(classes=BINS)
    y = mlb.fit_transform(y_list)

    # Names in the same order as the pieces concatenated into each row
    feature_names_child = (
        [f'mean_{c}' for c in all_category_keys] +
        [f'std_{c}' for c in all_category_keys] +
        [f'max_{c}' for c in all_category_keys] +
        [f'talent_{t}' for t in all_key_talents] +
        ['n_analyses'] +
        [f'n_{t}' for t in ANALYSIS_TYPES]
    )

    return child_ids, X, y, feature_names_child

print("Extracting child-level features...")
# Feature names depend only on the global vocabularies, so they are kept from
# the train call and discarded for val/test.
child_level_splits = [
    extract_child_level_features(split)
    for split in (train_data, val_data, test_data)
]
(
    (train_cids, X_train_ch, y_train_ch, feature_names_ch),
    (val_cids, X_val_ch, y_val_ch, _),
    (test_cids, X_test_ch, y_test_ch, _),
) = child_level_splits

print(f"\nChild-Level Features:")
print(f"  Train: {len(train_cids)} children, X={X_train_ch.shape}")
print(f"  Val: {len(val_cids)} children, X={X_val_ch.shape}")
print(f"  Test: {len(test_cids)} children, X={X_test_ch.shape}")
print(f"  Total features: {len(feature_names_ch)}")

In [None]:
# Positive-label counts per bin at child level, for the train and test splits
print("\nChild-level label distribution (train):")
for bin_name, train_col, test_col in zip(BINS, y_train_ch.T, y_test_ch.T):
    pos_train, pos_test = train_col.sum(), test_col.sum()
    print(f"  {bin_name}: train={pos_train}, test={pos_test}")

## 6. Evaluation Functions

In [None]:
def compute_ece(y_true, y_prob, n_bins=10):
    """
    Expected Calibration Error over all (sample, label) pairs.

    Both inputs are flattened, probabilities are grouped into ``n_bins``
    equal-width bins on [0, 1], and the result is the count-weighted mean of
    |observed positive rate - mean predicted probability| over the bins.

    Args:
        y_true: array-like of 0/1 labels (any shape).
        y_prob: array-like of predicted probabilities, same shape as y_true.
        n_bins: number of equal-width probability bins.

    Returns:
        The ECE as a float; 0.0 when the inputs are empty.
    """
    y_true_flat = np.asarray(y_true).ravel()
    y_prob_flat = np.asarray(y_prob).ravel()
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    total = len(y_true_flat)
    for i in range(n_bins):
        lower, upper = bin_boundaries[i], bin_boundaries[i + 1]
        if i == 0:
            # The first bin is closed on the left so that a probability of
            # exactly 0.0 is counted instead of falling outside every bin.
            mask = (y_prob_flat >= lower) & (y_prob_flat <= upper)
        else:
            mask = (y_prob_flat > lower) & (y_prob_flat <= upper)
        count = mask.sum()
        if count == 0:
            continue
        bin_accuracy = y_true_flat[mask].mean()
        bin_confidence = y_prob_flat[mask].mean()
        ece += (count / total) * abs(bin_accuracy - bin_confidence)
    return ece

def evaluate_model(model, X, y):
    """
    Score a fitted multi-label classifier on (X, y).

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``;
            both outputs are indexed with one column per entry of BINS.
        X: feature matrix passed to the model.
        y: binary indicator matrix with one column per entry of BINS.

    Returns:
        dict with keys "roc_auc_macro" (None when roc_auc_score rejects the
        input), "f1_macro", "f1_micro", "ece" and "per_bin". "per_bin" maps
        each bin with at least one positive in ``y`` to its auc (or None),
        precision, recall, f1 and support.
    """
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)

    # Catch only ValueError (what roc_auc_score raises for input it cannot
    # score) instead of a bare except, which would also swallow interrupts
    # and unrelated bugs.
    try:
        roc_auc_macro = roc_auc_score(y, y_prob, average="macro")
    except ValueError:
        roc_auc_macro = None

    f1_macro = f1_score(y, y_pred, average="macro", zero_division=0)
    f1_micro = f1_score(y, y_pred, average="micro", zero_division=0)
    ece = compute_ece(y, y_prob)

    per_bin = {}
    for i, bin_name in enumerate(BINS):
        # Bins without any positive in y are left out of the per-bin report
        if y[:, i].sum() > 0:
            try:
                auc = roc_auc_score(y[:, i], y_prob[:, i])
            except ValueError:
                auc = None
            prec, rec, f1, _ = precision_recall_fscore_support(
                y[:, i], y_pred[:, i], average="binary", zero_division=0
            )
            per_bin[bin_name] = {"auc": auc, "precision": prec, "recall": rec, "f1": f1, "support": int(y[:, i].sum())}

    return {"roc_auc_macro": roc_auc_macro, "f1_macro": f1_macro, "f1_micro": f1_micro, "ece": ece, "per_bin": per_bin}

def print_results(name, results):
    """
    Print a one-line metric summary for a model.

    Args:
        name: label printed as the heading.
        results: dict with "roc_auc_macro" (float or None), "f1_macro" and
            "ece", as returned by evaluate_model.
    """
    print(f"\n{name}:")
    # Compare against None so a legitimate AUC of 0.0 is not shown as N/A
    auc = f"{results['roc_auc_macro']:.4f}" if results['roc_auc_macro'] is not None else "N/A"
    print(f"  ROC-AUC: {auc}, F1-macro: {results['f1_macro']:.4f}, ECE: {results['ece']:.4f}")

## 7. Train Models - Per-Analysis

In [None]:
# Fit three one-vs-rest classifiers on the per-analysis features and score
# each of them on the test split.
# NOTE: the validation split (X_val_pa / y_val_pa) is not used in this cell.
print("="*70)
print("PER-ANALYSIS MODELS")
print("="*70)

# Scale features: statistics are fitted on train only and reused for test.
# Only LogReg receives the scaled matrices; the LightGBM models get raw features.
scaler_pa = StandardScaler()
X_train_pa_scaled = scaler_pa.fit_transform(X_train_pa)
X_test_pa_scaled = scaler_pa.transform(X_test_pa)

# LogReg (one binary classifier per bin)
print("\nTraining LogReg...")
logreg_pa = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE),
    n_jobs=-1
)
logreg_pa.fit(X_train_pa_scaled, y_train_pa)
logreg_pa_results = evaluate_model(logreg_pa, X_test_pa_scaled, y_test_pa)
print_results("LogReg (per-analysis)", logreg_pa_results)

# LightGBM (uncalibrated); lgb_pa is reused later by the SHAP analysis
print("\nTraining LightGBM...")
lgb_pa = OneVsRestClassifier(
    lgb.LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, 
                       class_weight='balanced', random_state=RANDOM_STATE, verbose=-1),
    n_jobs=-1
)
lgb_pa.fit(X_train_pa, y_train_pa)
lgb_pa_results = evaluate_model(lgb_pa, X_test_pa, y_test_pa)
print_results("LightGBM (per-analysis)", lgb_pa_results)

# Calibrated LightGBM - use cv=2 to handle small classes
# (same LightGBM hyper-parameters as above, wrapped in sigmoid calibration)
print("\nTraining LightGBM Calibrated...")
lgb_cal_pa = OneVsRestClassifier(
    CalibratedClassifierCV(
        lgb.LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1,
                           class_weight='balanced', random_state=RANDOM_STATE, verbose=-1),
        method='sigmoid', cv=2  # Use cv=2 and sigmoid to handle small classes
    ),
    n_jobs=-1
)
lgb_cal_pa.fit(X_train_pa, y_train_pa)
lgb_cal_pa_results = evaluate_model(lgb_cal_pa, X_test_pa, y_test_pa)
print_results("LightGBM Calibrated (per-analysis)", lgb_cal_pa_results)

## 8. Train Models - Child-Level

In [None]:
# Fit the child-level one-vs-rest classifiers and score each on the test split.
# NOTE: the validation split (X_val_ch / y_val_ch) is not used in this cell.
print("="*70)
print("CHILD-LEVEL MODELS")
print("="*70)

# Check minimum class size for calibration
# (smallest number of positive children over all bins in the train split)
min_class_train = min(y_train_ch[:, i].sum() for i in range(len(BINS)))
print(f"\nMin class size in train: {min_class_train}")

# Scale features: statistics are fitted on train only and reused for test.
# Only LogReg receives the scaled matrices; tree models get raw features.
scaler_ch = StandardScaler()
X_train_ch_scaled = scaler_ch.fit_transform(X_train_ch)
X_test_ch_scaled = scaler_ch.transform(X_test_ch)

# LogReg
print("\nTraining LogReg (child-level)...")
logreg_ch = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE),
    n_jobs=-1
)
logreg_ch.fit(X_train_ch_scaled, y_train_ch)
logreg_ch_results = evaluate_model(logreg_ch, X_test_ch_scaled, y_test_ch)
print_results("LogReg (child-level)", logreg_ch_results)

# LightGBM
print("\nTraining LightGBM (child-level)...")
lgb_ch = OneVsRestClassifier(
    lgb.LGBMClassifier(n_estimators=200, max_depth=8, learning_rate=0.05,
                       class_weight='balanced', random_state=RANDOM_STATE, verbose=-1),
    n_jobs=-1
)
lgb_ch.fit(X_train_ch, y_train_ch)
lgb_ch_results = evaluate_model(lgb_ch, X_test_ch, y_test_ch)
print_results("LightGBM (child-level)", lgb_ch_results)

# Calibrated LightGBM - only if we have enough samples
# NOTE: inside this branch min_class_train is >= 2, so min(2, min_class_train)
# always evaluates to 2; the cv value is effectively fixed.
if min_class_train >= 2:
    print("\nTraining LightGBM Calibrated (child-level)...")
    lgb_cal_ch = OneVsRestClassifier(
        CalibratedClassifierCV(
            lgb.LGBMClassifier(n_estimators=200, max_depth=8, learning_rate=0.05,
                               class_weight='balanced', random_state=RANDOM_STATE, verbose=-1),
            method='sigmoid', cv=min(2, min_class_train)  # Adaptive CV
        ),
        n_jobs=-1
    )
    lgb_cal_ch.fit(X_train_ch, y_train_ch)
    lgb_cal_ch_results = evaluate_model(lgb_cal_ch, X_test_ch, y_test_ch)
    print_results("LightGBM Calibrated (child-level)", lgb_cal_ch_results)
else:
    # None marks the model as skipped for the comparison cell
    lgb_cal_ch_results = None
    print("\nSkipping calibrated model - insufficient samples per class")

# Random Forest
print("\nTraining Random Forest (child-level)...")
rf_ch = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=200, max_depth=10, class_weight='balanced',
                           random_state=RANDOM_STATE, n_jobs=-1),
    n_jobs=-1
)
rf_ch.fit(X_train_ch, y_train_ch)
rf_ch_results = evaluate_model(rf_ch, X_test_ch, y_test_ch)
print_results("Random Forest (child-level)", rf_ch_results)

## 9. Model Comparison

In [None]:
# Collect the test-split results of every trained model and print them as a table.
print("="*70)
print("MODEL COMPARISON")
print("="*70)

all_results = {
    "LogReg (PA)": logreg_pa_results,
    "LightGBM (PA)": lgb_pa_results,
    "LightGBM Cal (PA)": lgb_cal_pa_results,
    "LogReg (Child)": logreg_ch_results,
    "LightGBM (Child)": lgb_ch_results,
    "RandomForest (Child)": rf_ch_results,
}

# The calibrated child-level results are None when that model was skipped
if lgb_cal_ch_results is not None:
    all_results["LightGBM Cal (Child)"] = lgb_cal_ch_results

print(f"\n{'Model':<25} {'ROC-AUC':>10} {'F1-macro':>10} {'ECE':>10}")
print("-"*57)

for name, res in all_results.items():
    # Compare against None so a legitimate AUC of 0.0 is not shown as N/A
    auc = f"{res['roc_auc_macro']:.4f}" if res['roc_auc_macro'] is not None else "N/A"
    print(f"{name:<25} {auc:>10} {res['f1_macro']:>10.4f} {res['ece']:>10.4f}")

## 10. Per-Bin Analysis

In [None]:
# Best per-analysis model, chosen by macro F1 on the test split
best_pa_name = max(['LogReg (PA)', 'LightGBM (PA)', 'LightGBM Cal (PA)'],
                   key=lambda k: all_results[k]['f1_macro'])
best_pa_results = all_results[best_pa_name]

print(f"\nBest Per-Analysis Model: {best_pa_name}")
print(f"\n{'Bin':<15} {'AUC':>8} {'Prec':>8} {'Recall':>8} {'F1':>8} {'Support':>8}")
print("-"*58)

for bin_name in BINS:
    # Bins without positives in the test split have no per-bin entry
    if bin_name in best_pa_results['per_bin']:
        m = best_pa_results['per_bin'][bin_name]
        # Compare against None so a legitimate AUC of 0.0 is not shown as N/A
        auc = f"{m['auc']:.3f}" if m['auc'] is not None else "N/A"
        print(f"{bin_name:<15} {auc:>8} {m['precision']:>8.3f} {m['recall']:>8.3f} {m['f1']:>8.3f} {m['support']:>8}")

## 11. SHAP Analysis

In [None]:
# Rank features by mean |SHAP value| for each bin's per-analysis LightGBM model.
print("="*70)
print("SHAP ANALYSIS (Per-Analysis LightGBM)")
print("="*70)

shap_results = {}
X_shap = X_test_pa[:100]  # Sample: first 100 test rows

for i, bin_name in enumerate(BINS):
    print(f"\n  SHAP for {bin_name}...")
    estimator = lgb_pa.estimators_[i]
    explainer = shap.TreeExplainer(estimator)
    shap_values = explainer.shap_values(X_shap)

    # The return format varies across SHAP releases: a [negative, positive]
    # list, a 2-D (samples, features) array, or a 3-D
    # (samples, features, classes) array. Always reduce to the positive
    # class so the mean below has one value per feature.
    if isinstance(shap_values, list):
        shap_vals = shap_values[1]
    else:
        shap_vals = np.asarray(shap_values)
        if shap_vals.ndim == 3:
            shap_vals = shap_vals[:, :, 1]
    mean_abs_shap = np.abs(shap_vals).mean(axis=0)

    # Ten largest mean |SHAP| values, most important first
    top_idx = np.argsort(mean_abs_shap)[-10:][::-1]
    top_features = [(feature_names[j], mean_abs_shap[j]) for j in top_idx]
    shap_results[bin_name] = top_features

    print(f"    Top 3: {[f[0][:30] for f in top_features[:3]]}")

In [None]:
# SHAP plot for academic
print("\nGenerating SHAP plot for 'academic'...")
bin_idx = BINS.index('academic')
estimator = lgb_pa.estimators_[bin_idx]
explainer = shap.TreeExplainer(estimator)
shap_values = explainer.shap_values(X_shap)

# The return format varies across SHAP releases: a [negative, positive] list,
# a 2-D (samples, features) array, or a 3-D (samples, features, classes)
# array. Always reduce to the positive class before plotting.
if isinstance(shap_values, list):
    shap_vals = shap_values[1]
else:
    shap_vals = np.asarray(shap_values)
    if shap_vals.ndim == 3:
        shap_vals = shap_vals[:, :, 1]

# Shorten feature names so the plot labels stay readable (max 30 characters)
short_names = [n.replace('intellectual.', 'int.').replace('gardner.', 'g.').replace('creative.', 'cr.')[:30] for n in feature_names]
X_shap_df = pd.DataFrame(X_shap, columns=short_names)

plt.figure(figsize=(12, 8))
shap.summary_plot(shap_vals, X_shap_df, show=False, max_display=20)
plt.title("SHAP - Academic (Per-Analysis Features)")
plt.tight_layout()
plt.savefig('shap_academic_v3.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: shap_academic_v3.png")

## 12. Visualizations

In [None]:
print("\nGenerating comparison plots...")

# One horizontal bar chart per metric, models on the y axis
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

model_names = list(all_results.keys())
metrics = ['roc_auc_macro', 'f1_macro', 'ece']
metric_labels = ['ROC-AUC', 'F1-macro', 'ECE']

# One colour per model, shared by all three panels
colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(model_names)))

for ax, metric, label in zip(axes, metrics, metric_labels):
    # A missing (None) metric is drawn as a zero-length bar
    values = [all_results[m][metric] or 0 for m in model_names]

    bars = ax.barh(model_names, values, color=colors)
    ax.set_xlabel(label)
    ax.set_title(label)

    # Numeric label just right of each bar
    for bar, val in zip(bars, values):
        y_mid = bar.get_y() + bar.get_height()/2
        ax.text(val + 0.01, y_mid, f'{val:.3f}', va='center', fontsize=8)

plt.suptitle('Model Comparison: All Analyses Dataset (v3)', fontsize=14)
plt.tight_layout()
plt.savefig('model_comparison_v3.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: model_comparison_v3.png")

In [None]:
# Bar chart of analysis counts per type, most frequent first
fig, ax = plt.subplots(figsize=(10, 6))
types_sorted = sorted(analysis_types.items(), key=lambda x: -x[1])
type_labels = [name for name, _ in types_sorted]
type_totals = [count for _, count in types_sorted]
ax.bar(type_labels, type_totals, color='steelblue')
ax.set_xlabel('Analysis Type')
ax.set_ylabel('Count')
ax.set_title('Distribution of Analysis Types in Dataset')
# Annotate each bar with its count and its share of all analyses
for i, v in enumerate(type_totals):
    ax.text(i, v + 50, f'{v}\n({v/len(all_data)*100:.1f}%)', ha='center', fontsize=9)
plt.tight_layout()
plt.savefig('analysis_types_v3.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: analysis_types_v3.png")

## 13. Save Results

In [None]:
print("="*70)
print("SAVING RESULTS")
print("="*70)

# Dataset / feature metadata describing this run
experiment_info = {
    "version": "v3_all_analyses",
    "date": pd.Timestamp.now().isoformat(),
    "n_analyses_total": len(all_data),
    "n_analyses_train": len(train_data),
    "n_analyses_val": len(val_data),
    "n_analyses_test": len(test_data),
    "n_children_train": len(train_cids),
    "n_children_test": len(test_cids),
    "n_features_per_analysis": len(feature_names),
    "n_features_child_level": len(feature_names_ch),
    "n_category_scores": len(all_category_keys),
    "n_key_talents": len(all_key_talents),
    "analysis_types": dict(analysis_types),
    "bins": BINS
}

# Keep only the five strongest SHAP features per bin, as plain floats
shap_top_features = {}
for bin_name, features in shap_results.items():
    shap_top_features[bin_name] = [(f, float(s)) for f, s in features[:5]]

final_results = {
    "experiment_info": experiment_info,
    "models": all_results,
    "shap_top_features": shap_top_features
}

# default=float converts values json cannot serialise natively
with open('arxiv_ml_results_v3.json', 'w') as f:
    json.dump(final_results, f, indent=2, default=float)

print("Saved: arxiv_ml_results_v3.json")

In [None]:
# Download
# Sends the result JSON and the three PNGs written by the cells above to the
# browser.
# NOTE(review): presumably requires a Google Colab runtime, since the helper
# comes from google.colab - confirm before running elsewhere.
# NOTE: the temporal section below rewrites arxiv_ml_results_v3.json, and the
# final cell downloads all files again.
from google.colab import files

print("\nDownloading files...")
files.download('arxiv_ml_results_v3.json')
files.download('shap_academic_v3.png')
files.download('model_comparison_v3.png')
files.download('analysis_types_v3.png')

print("\nDone!")

## 14. Summary

In [None]:
# Recap of dataset size, feature counts and the best model by test macro F1.
print("="*70)
print("SUMMARY")
print("="*70)

# Count children with the same id lookup as the child-level aggregation
# (child_hash first, then child_id); reading child_id alone would collapse
# every child to None when only child_hash is present.
n_unique_children = len({r.get('child_hash', r.get('child_id')) for r in all_data})

print(f"\nDataset v3 Statistics:")
print(f"  Total analyses: {len(all_data)}")
print(f"  Unique children: {n_unique_children}")
print(f"  Analysis types: {len(ANALYSIS_TYPES)} ({', '.join(ANALYSIS_TYPES)})")
print(f"  Category scores: {len(all_category_keys)}")
print(f"  Key talents: {len(all_key_talents)}")

print(f"\nFeatures:")
print(f"  Per-analysis: {len(feature_names)}")
print(f"  Child-level: {len(feature_names_ch)}")

# Find best model (highest macro F1 across per-analysis and child-level models)
best_name = max(all_results.keys(), key=lambda k: all_results[k]['f1_macro'])
best_results = all_results[best_name]

print(f"\nBest Model: {best_name}")
# Compare against None so a legitimate AUC of 0.0 is not shown as N/A
auc_str = f"{best_results['roc_auc_macro']:.4f}" if best_results['roc_auc_macro'] is not None else "N/A"
print(f"  ROC-AUC: {auc_str}")
print(f"  F1-macro: {best_results['f1_macro']:.4f}")
print(f"  ECE: {best_results['ece']:.4f}")

print("\n" + "="*70)
print("Proceed to Section 15 for Temporal Evaluation")
print("="*70)

## 15. Temporal Evaluation (S1 → S2 Prediction)

**Purpose:** Evaluate predictive validity - can we predict a child's future talents (S2) from their earlier assessments (S1)?

**Data structure:**
- For each child with 2+ analyses, analyses are split chronologically:
  - **S1 (first half):** Features for prediction
  - **S2 (second half):** Labels to predict
  
**Files needed:** `train_temporal.jsonl`, `test_temporal.jsonl`

In [None]:
# Load the optional temporal (S1 -> S2) files and set TEMPORAL_AVAILABLE
print("="*70)
print("TEMPORAL EVALUATION: S1 → S2 Prediction")
print("="*70)

temporal_train_path = os.path.join(DATA_DIR, 'train_temporal.jsonl')
temporal_test_path = os.path.join(DATA_DIR, 'test_temporal.jsonl')

# Both files are needed; later temporal cells are skipped when either is missing
temporal_files_present = (
    os.path.exists(temporal_train_path) and os.path.exists(temporal_test_path)
)

if temporal_files_present:
    temporal_train = load_jsonl(temporal_train_path)
    temporal_test = load_jsonl(temporal_test_path)
    TEMPORAL_AVAILABLE = True

    print(f"\nTemporal dataset loaded:")
    print(f"  Train: {len(temporal_train)} children")
    print(f"  Test: {len(temporal_test)} children")

    # Statistics: S1 / S2 analysis counts summed over both temporal splits
    temporal_all = temporal_train + temporal_test
    total_s1 = sum(rec['n_s1_analyses'] for rec in temporal_all)
    total_s2 = sum(rec['n_s2_analyses'] for rec in temporal_all)
    print(f"  S1 analyses (features): {total_s1}")
    print(f"  S2 analyses (labels): {total_s2}")
else:
    print("\nWARNING: Temporal files not found!")
    print("Please upload train_temporal.jsonl and test_temporal.jsonl")
    print("Skipping temporal evaluation...")
    TEMPORAL_AVAILABLE = False

In [None]:
def extract_temporal_features(records):
    """
    Build the S1 feature matrix and S2 label matrix for temporal prediction.

    Features: each record's ``category_scores`` values followed by its
    ``bin_scores_s1`` values (one column per entry of all_category_keys, then
    one per entry of BINS). Labels: bins whose ``bin_scores_s2`` value is
    >= SCORE_THRESHOLD.

    Returns:
        (X, y, feature_names_temp) with X of dtype float32 and y a binary
        indicator matrix over BINS.
    """
    n_cat = len(all_category_keys)

    # Column lookup: category columns first, S1 bin columns right after them
    cat_col = {cat: col for col, cat in enumerate(all_category_keys)}
    bin_col = {b: n_cat + offset for offset, b in enumerate(BINS)}

    X = np.zeros((len(records), n_cat + len(BINS)), dtype=np.float32)
    y_labels = []

    for row, rec in enumerate(records):
        # Both feature groups are filled the same way; names that are not in
        # the lookup are ignored.
        for source_key, col_lookup in (
            ('category_scores', cat_col),
            ('bin_scores_s1', bin_col),
        ):
            for name, score in rec.get(source_key, {}).items():
                col = col_lookup.get(name)
                if col is not None:
                    X[row, col] = score

        # S2 bin scores become the labels to predict
        y_labels.append([
            b for b, s in rec.get('bin_scores_s2', {}).items()
            if s >= SCORE_THRESHOLD
        ])

    y = MultiLabelBinarizer(classes=BINS).fit_transform(y_labels)

    feature_names_temp = all_category_keys + [f's1_{b}' for b in BINS]

    return X, y, feature_names_temp

if TEMPORAL_AVAILABLE:
    print("\nExtracting temporal features...")
    # Feature names do not depend on the records, so keep them from train only
    X_temp_train, y_temp_train, feat_names_temp = extract_temporal_features(temporal_train)
    X_temp_test, y_temp_test = extract_temporal_features(temporal_test)[:2]

    print(f"  Train: X={X_temp_train.shape}, y={y_temp_train.shape}")
    print(f"  Test: X={X_temp_test.shape}, y={y_temp_test.shape}")
    print(f"  Features: {len(feat_names_temp)}")

    # Number of positive S2 labels per bin in the temporal train split
    print("\nS2 label distribution (train):")
    for bin_name, label_col in zip(BINS, y_temp_train.T):
        pos = label_col.sum()
        print(f"  {bin_name}: {pos}")

In [None]:
# Train temporal models
# Fits three one-vs-rest classifiers on S1 features to predict S2 labels and
# stores each model's test-split results in temporal_results, keyed by name.
if TEMPORAL_AVAILABLE:
    print("="*70)
    print("TEMPORAL MODELS (S1 → S2)")
    print("="*70)
    
    # Scale features: statistics are fitted on train only and reused for test.
    # Only LogReg receives the scaled matrices; tree models get raw features.
    scaler_temp = StandardScaler()
    X_temp_train_scaled = scaler_temp.fit_transform(X_temp_train)
    X_temp_test_scaled = scaler_temp.transform(X_temp_test)
    
    temporal_results = {}
    
    # LogReg
    print("\nTraining LogReg (temporal)...")
    logreg_temp = OneVsRestClassifier(
        LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE),
        n_jobs=-1
    )
    logreg_temp.fit(X_temp_train_scaled, y_temp_train)
    temporal_results["LogReg (Temporal)"] = evaluate_model(logreg_temp, X_temp_test_scaled, y_temp_test)
    print_results("LogReg (Temporal)", temporal_results["LogReg (Temporal)"])
    
    # LightGBM (its results are also read by the temporal comparison plot)
    print("\nTraining LightGBM (temporal)...")
    lgb_temp = OneVsRestClassifier(
        lgb.LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1,
                           class_weight='balanced', random_state=RANDOM_STATE, verbose=-1),
        n_jobs=-1
    )
    lgb_temp.fit(X_temp_train, y_temp_train)
    temporal_results["LightGBM (Temporal)"] = evaluate_model(lgb_temp, X_temp_test, y_temp_test)
    print_results("LightGBM (Temporal)", temporal_results["LightGBM (Temporal)"])
    
    # Random Forest
    print("\nTraining Random Forest (temporal)...")
    rf_temp = OneVsRestClassifier(
        RandomForestClassifier(n_estimators=100, max_depth=8, class_weight='balanced',
                               random_state=RANDOM_STATE, n_jobs=-1),
        n_jobs=-1
    )
    rf_temp.fit(X_temp_train, y_temp_train)
    temporal_results["RandomForest (Temporal)"] = evaluate_model(rf_temp, X_temp_test, y_temp_test)
    print_results("RandomForest (Temporal)", temporal_results["RandomForest (Temporal)"])

In [None]:
# Temporal comparison table
if TEMPORAL_AVAILABLE:
    print("="*70)
    print("TEMPORAL MODEL COMPARISON")
    print("="*70)

    print(f"\n{'Model':<25} {'ROC-AUC':>10} {'F1-macro':>10} {'ECE':>10}")
    print("-"*57)

    for name, res in temporal_results.items():
        # Compare against None so a legitimate AUC of 0.0 is not shown as N/A
        auc = f"{res['roc_auc_macro']:.4f}" if res['roc_auc_macro'] is not None else "N/A"
        print(f"{name:<25} {auc:>10} {res['f1_macro']:>10.4f} {res['ece']:>10.4f}")

    # Best temporal model (by macro F1); best_temp_name / best_temp are reused
    # by the plotting, saving and final-summary cells.
    best_temp_name = max(temporal_results.keys(), key=lambda k: temporal_results[k]['f1_macro'])
    best_temp = temporal_results[best_temp_name]

    print(f"\n\nBest Temporal Model: {best_temp_name}")
    print(f"\n{'Bin':<15} {'AUC':>8} {'Prec':>8} {'Recall':>8} {'F1':>8} {'Support':>8}")
    print("-"*58)

    for bin_name in BINS:
        # Bins without positives in the test labels have no per-bin entry
        if bin_name in best_temp['per_bin']:
            m = best_temp['per_bin'][bin_name]
            auc = f"{m['auc']:.3f}" if m['auc'] is not None else "N/A"
            print(f"{bin_name:<15} {auc:>8} {m['precision']:>8.3f} {m['recall']:>8.3f} {m['f1']:>8.3f} {m['support']:>8}")

In [None]:
# Temporal visualization: child-level vs temporal metrics, and per-bin F1
if TEMPORAL_AVAILABLE:
    print("Generating temporal comparison plot...")

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ax1, ax2 = axes

    # Left panel: grouped bars, child-level LightGBM next to temporal LightGBM
    comparison_data = {
        "LightGBM (Child)": lgb_ch_results,
        "LightGBM (Temporal)": temporal_results["LightGBM (Temporal)"]
    }

    metrics = ['roc_auc_macro', 'f1_macro', 'ece']
    x = np.arange(len(metrics))
    width = 0.35

    for offset, (name, res) in enumerate(comparison_data.items()):
        # A missing (None) metric is drawn as a zero-height bar
        heights = [res[m] or 0 for m in metrics]
        ax1.bar(x + offset*width, heights, width, label=name)

    ax1.set_ylabel('Score')
    ax1.set_title('Child-Level vs Temporal (S1→S2)')
    ax1.set_xticks(x + width/2)
    ax1.set_xticklabels(['ROC-AUC', 'F1-macro', 'ECE'])
    ax1.legend()
    ax1.set_ylim(0, 1)

    # Right panel: per-bin F1 of the best temporal model (0 where a bin has no entry)
    per_bin = best_temp['per_bin']
    bin_f1 = [per_bin[b]['f1'] if b in per_bin else 0 for b in BINS]

    colors = plt.cm.Set2(np.arange(len(BINS)))
    bars = ax2.barh(BINS, bin_f1, color=colors)
    ax2.set_xlabel('F1 Score')
    ax2.set_title(f'Temporal Per-Bin F1 ({best_temp_name})')
    ax2.set_xlim(0, 1)

    # Numeric label just right of each bar
    for bar, val in zip(bars, bin_f1):
        y_mid = bar.get_y() + bar.get_height()/2
        ax2.text(val + 0.02, y_mid, f'{val:.3f}', va='center', fontsize=9)

    plt.tight_layout()
    plt.savefig('temporal_comparison_v3.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Saved: temporal_comparison_v3.png")

In [None]:
# Save temporal results
# Adds a "temporal" section to final_results and rewrites the results JSON.
if TEMPORAL_AVAILABLE:
    temporal_summary = {
        "temporal_dataset": {
            "n_children_train": len(temporal_train),
            "n_children_test": len(temporal_test),
            "total_s1_analyses": total_s1,
            "total_s2_analyses": total_s2
        },
        "temporal_models": temporal_results,
        # NOTE(review): the "moderate predictive validity" wording below is
        # fixed text and does not depend on the F1 value - confirm it matches
        # the observed score before reporting.
        "interpretation": {
            "purpose": "Predict child's future talent profile (S2) from earlier assessments (S1)",
            "findings": f"Best temporal model: {best_temp_name}",
            "predictive_validity": f"F1-macro={best_temp['f1_macro']:.4f} shows moderate predictive validity"
        }
    }
    
    # Update final results
    final_results["temporal"] = temporal_summary
    
    # Overwrites the file written by the "Save Results" cell
    with open('arxiv_ml_results_v3.json', 'w') as f:
        json.dump(final_results, f, indent=2, default=float)
    
    print("Updated: arxiv_ml_results_v3.json with temporal results")

## 16. Final Summary

In [None]:
# Final recap: dataset, best per-analysis / child-level models, temporal results.
print("="*70)
print("FINAL SUMMARY - ArXiv ML Baseline Study v3")
print("="*70)

# Count children with the same id lookup as the child-level aggregation
# (child_hash first, then child_id); reading child_id alone would collapse
# every child to None when only child_hash is present.
n_unique_children = len({r.get('child_hash', r.get('child_id')) for r in all_data})

print(f"\n1. DATASET")
print(f"   Total analyses: {len(all_data)}")
print(f"   Unique children: {n_unique_children}")
print(f"   Analysis types: {len(ANALYSIS_TYPES)} (text, image, musical, audio, video, pdf, ...)")
print(f"   Feature dimensions: {len(all_category_keys)} category scores + {len(all_key_talents)} key talents")

print(f"\n2. PER-ANALYSIS MODELS")
# Best by macro F1 on the test split
best_pa = max(['LogReg (PA)', 'LightGBM (PA)', 'LightGBM Cal (PA)'], key=lambda k: all_results[k]['f1_macro'])
print(f"   Best: {best_pa}")
print(f"   F1-macro: {all_results[best_pa]['f1_macro']:.4f}")

print(f"\n3. CHILD-LEVEL MODELS")
# Child-level entries are identified by the 'Child' tag in their result key
child_models = [k for k in all_results if 'Child' in k]
best_ch = max(child_models, key=lambda k: all_results[k]['f1_macro'])
print(f"   Best: {best_ch}")
print(f"   F1-macro: {all_results[best_ch]['f1_macro']:.4f}")

if TEMPORAL_AVAILABLE:
    print(f"\n4. TEMPORAL EVALUATION (S1 → S2)")
    print(f"   Children with 2+ analyses: {len(temporal_train) + len(temporal_test)}")
    print(f"   Best model: {best_temp_name}")
    print(f"   Predictive F1-macro: {best_temp['f1_macro']:.4f}")

    # NOTE(review): the interpretation lines below are fixed text and do not
    # depend on the measured F1 - confirm they match the observed score.
    print("\n   Interpretation:")
    print("   - S1 features (first half of analyses) predict S2 labels (second half)")
    print("   - This demonstrates temporal generalization of talent profiles")
    print(f"   - Moderate predictive validity suggests talents are reasonably stable")

print("\n" + "="*70)
print("Files generated:")
print("  - arxiv_ml_results_v3.json")
print("  - shap_academic_v3.png")
print("  - model_comparison_v3.png")
print("  - analysis_types_v3.png")
if TEMPORAL_AVAILABLE:
    print("  - temporal_comparison_v3.png")
print("="*70)

In [None]:
# Download all files
# Sends every generated artifact to the browser; the temporal plot is included
# only when the temporal evaluation ran.
# NOTE(review): presumably requires a Google Colab runtime, since the helper
# comes from google.colab - confirm before running elsewhere.
from google.colab import files

print("Downloading files...")
files.download('arxiv_ml_results_v3.json')
files.download('shap_academic_v3.png')
files.download('model_comparison_v3.png')
files.download('analysis_types_v3.png')

if TEMPORAL_AVAILABLE:
    files.download('temporal_comparison_v3.png')

print("\nAll files downloaded!")