# Individual Risk & Profile Calculator (RSF + Cluster/Subcluster)

This notebook loads **already trained** artifacts exported from your training notebooks and provides:

- **Clinical profile**: `cluster_pred` (1/2) and, if `cluster_pred==1`, `subcluster_pred` (e.g., 11/12) + `p_subcluster11` + `threshold_used`.
- **Individual survival** (RSF): `S_1y`, `S_3y`, `S_5y` at 365/1095/1825 days, plus `risk_3y = 1 - S_3y`.
- **Risk group**: `bajo/medio/alto`, assigned by comparing `risk_3y` against the training-set tercile cutpoints (`q33`, `q66`) loaded from `artifact/risk_cutpoints.json`.
- **Quality/confidence warnings**: `many_missing_flag`, `low_confidence_cluster`, `low_confidence_flag`, `unknown_category_flag`.

**No retraining and no recalibration** happens here.


In [1]:
# Imports & environment info
import json
import sys
import platform
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

import sklearn

# Fixed seed so that any sampling done later in the notebook is repeatable.
RANDOM_STATE = 42

# Let wide prediction frames render without column truncation.
pd.set_option('display.max_columns', 200)

# Print the runtime environment so the saved outputs record which versions produced them.
for _env_label, _env_value in (
    ('Python:', sys.version),
    ('Platform:', platform.platform()),
    ('numpy:', np.__version__),
    ('pandas:', pd.__version__),
    ('sklearn:', sklearn.__version__),
    ('joblib:', joblib.__version__),
):
    print(_env_label, _env_value)


Python: 3.10.12 (main, Nov  4 2025, 08:48:33) [GCC 11.4.0]
Platform: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35
numpy: 2.2.6
pandas: 2.3.3
sklearn: 1.7.2
joblib: 1.5.2


In [2]:
# Load artifacts

# Directory holding every exported artifact; all paths below are relative to it.
ARTIFACT_DIR = Path('artifact')

# Logical artifact name -> file on disk.
PATHS = {
    # Required
    'feature_schema': ARTIFACT_DIR / 'feature_schema.json',
    'risk_model_rsf': ARTIFACT_DIR / 'risk_model_rsf.joblib',
    'cluster_assigner': ARTIFACT_DIR / 'cluster_assigner.joblib',
    'subcluster1_assigner': ARTIFACT_DIR / 'subcluster1_assigner.joblib',
    'subcluster1_label_map': ARTIFACT_DIR / 'subcluster1_label_map.json',
    # Optional
    'risk_model_cox': ARTIFACT_DIR / 'risk_model_cox.joblib',
    'subcluster1_summary': ARTIFACT_DIR / 'subcluster1_summary.json',
    'train_summary': ARTIFACT_DIR / 'train_summary.json',
    # Recommended (needed for risk_group)
    'risk_cutpoints': ARTIFACT_DIR / 'risk_cutpoints.json',
}

# Artifacts without which the notebook cannot produce any prediction.
required_keys = [
    'feature_schema',
    'risk_model_rsf',
    'cluster_assigner',
    'subcluster1_assigner',
    'subcluster1_label_map',
]

# Fail fast, listing every absent required file at once.
missing_required = []
for _required_key in required_keys:
    if not PATHS[_required_key].exists():
        missing_required.append(_required_key)

if missing_required:
    _missing_listing = ', '.join(f'{k} -> {PATHS[k]}' for k in missing_required)
    raise FileNotFoundError(
        'Missing required artifacts: '
        + _missing_listing
        + '. Run model.ipynb export cells (and subgroups.ipynb export cells) first.'
    )

# Feature schema exported at training time: column order plus the role of each column.
FEATURE_SCHEMA = json.loads(PATHS['feature_schema'].read_text(encoding='utf-8'))

EXPECTED_COLUMNS, NUM_COLS, CAT_COLS, FLAG_COLS = (
    FEATURE_SCHEMA[_schema_key]
    for _schema_key in ('expected_columns', 'num_cols', 'cat_cols', 'flag_cols')
)

# Prediction horizons in days; falls back to 365/1095/1825 when the schema omits them.
HORIZONS_DAYS = FEATURE_SCHEMA.get('horizons_days', [365, 1095, 1825])

# NOTE(review): joblib artifacts are pickles and can execute arbitrary code on load;
# only load files produced by your own training notebooks.
risk_model_rsf, cluster_assigner, subcluster1_assigner = (
    joblib.load(PATHS[_model_key])
    for _model_key in ('risk_model_rsf', 'cluster_assigner', 'subcluster1_assigner')
)

# The Cox model is an optional baseline: a missing or unreadable file only disables it.
risk_model_cox = None
_cox_path = PATHS['risk_model_cox']
if _cox_path.exists():
    try:
        risk_model_cox = joblib.load(_cox_path)
    except Exception as load_error:
        print('Warning: could not load risk_model_cox.joblib:', repr(load_error))
    else:
        print('Loaded optional:', _cox_path)

# Mapping between original subcluster labels and the labels used by the subcluster model.
SUBCLUSTER1_LABEL_MAP = json.loads(
    PATHS['subcluster1_label_map'].read_text(encoding='utf-8')
)

# Confirm which required artifacts were read.
for _loaded_key in (
    'feature_schema',
    'risk_model_rsf',
    'cluster_assigner',
    'subcluster1_assigner',
    'subcluster1_label_map',
):
    print('Loaded:', PATHS[_loaded_key])

# The cutpoints file is only checked for presence here; a later cell parses it.
if not PATHS['risk_cutpoints'].exists():
    print('Warning: missing', PATHS['risk_cutpoints'], '-> risk_group will be None')
else:
    print('Loaded:', PATHS['risk_cutpoints'])

# Optional sanity check: warn when numpy/sklearn differ from the versions recorded
# in the saved subcluster summary. Any problem reading the file is only reported.
_summary_path = PATHS['subcluster1_summary']
if _summary_path.exists():
    try:
        _sub_sum = json.loads(_summary_path.read_text(encoding='utf-8'))
        saved = _sub_sum.get('versions', {}) or {}

        _saved_numpy = saved.get('numpy')
        if _saved_numpy and _saved_numpy != np.__version__:
            print(f"Warning: numpy version differs (saved={saved['numpy']} current={np.__version__})")

        _saved_sklearn = saved.get('sklearn')
        if _saved_sklearn and _saved_sklearn != sklearn.__version__:
            print(f"Warning: sklearn version differs (saved={saved['sklearn']} current={sklearn.__version__})")
    except Exception as parse_error:
        print('Warning: could not parse subcluster1_summary.json:', repr(parse_error))

# Quick overview of the loaded schema.
print()
print('Schema summary:')
for _summary_label, _summary_value in (
    ('  n_expected_columns:', len(EXPECTED_COLUMNS)),
    ('  n_num_cols:', len(NUM_COLS)),
    ('  n_cat_cols:', len(CAT_COLS)),
    ('  n_flag_cols:', len(FLAG_COLS)),
    ('  horizons_days:', HORIZONS_DAYS),
):
    print(_summary_label, _summary_value)


Loaded optional: artifact/risk_model_cox.joblib
Loaded: artifact/feature_schema.json
Loaded: artifact/risk_model_rsf.joblib
Loaded: artifact/cluster_assigner.joblib
Loaded: artifact/subcluster1_assigner.joblib
Loaded: artifact/subcluster1_label_map.json
Loaded: artifact/risk_cutpoints.json

Schema summary:
  n_expected_columns: 22
  n_num_cols: 6
  n_cat_cols: 10
  n_flag_cols: 6
  horizons_days: [365, 1095, 1825]


In [3]:
# Preprocessor introspection utilities
# - Decide defaults for missing categorical columns ("Missing" if seen by OHE, otherwise "Unknown")
# - Detect unknown categories (optional warning)

def _get_preprocess_from_pipeline(pipeline):
    if not hasattr(pipeline, 'named_steps'):
        return None
    if 'preprocess' in pipeline.named_steps:
        return pipeline.named_steps['preprocess']
    return None


def _get_ohe_from_preprocess(preprocess):
    """Try to retrieve the fitted OneHotEncoder used for categorical columns."""
    if preprocess is None:
        return None

    # ColumnTransformer usually has named_transformers_ after fit.
    if hasattr(preprocess, 'named_transformers_') and isinstance(getattr(preprocess, 'named_transformers_'), dict):
        return preprocess.named_transformers_.get('cat')

    return None


def _build_known_categories_map(preprocess, cat_cols):
    """Map each categorical column to the set of categories the encoder has seen.

    Categories are stringified. Columns and ``categories_`` entries are paired
    by position; if their counts differ a warning is printed and only the
    overlapping prefix is mapped.

    Returns:
        ``{column: set_of_category_strings}``, or None when no encoder exposing
        ``categories_`` can be found.
    """
    encoder = _get_ohe_from_preprocess(preprocess)
    if encoder is None or not hasattr(encoder, 'categories_'):
        return None

    fitted_categories = list(encoder.categories_)
    if len(fitted_categories) != len(cat_cols):
        print('Warning: OHE categories_ length != len(cat_cols). Unknown-category checks may be unreliable.')

    # zip stops at the shorter sequence, which matches pairing by position.
    return {
        column: {str(value) for value in column_categories.tolist()}
        for column, column_categories in zip(cat_cols, fitted_categories)
    }


# Category inspection uses the cluster_assigner preprocessor (same schema).
_preprocess_for_checks = _get_preprocess_from_pipeline(cluster_assigner)
KNOWN_CATEGORIES = _build_known_categories_map(_preprocess_for_checks, CAT_COLS)

# Default used for an absent categorical value: "Missing" when the encoder has
# seen that category for the column, otherwise "Unknown".
if KNOWN_CATEGORIES is None:
    print(
        'Note: could not inspect OneHotEncoder categories_. '
        'unknown_category_flag will be always False. '
        'This is safe because the encoder is configured with handle_unknown="ignore".'
    )
    CAT_DEFAULT_BY_COL = {c: 'Unknown' for c in CAT_COLS}
else:
    CAT_DEFAULT_BY_COL = {
        c: ('Missing' if 'Missing' in KNOWN_CATEGORIES.get(c, set()) else 'Unknown')
        for c in CAT_COLS
    }

print('Categorical default values (first 5):')
for c in CAT_COLS[:5]:
    print(f'  {c}: {CAT_DEFAULT_BY_COL[c]}')


Categorical default values (first 5):
  asa: Missing
  histo_defin: Unknown
  grado_histologi: Unknown
  FIGO2023: Missing
  afectacion_linf: Missing


In [4]:
# Input alignment helpers

def build_dataframe_from_dict(patient_dict: dict) -> pd.DataFrame:
    """Wrap one patient's dict into a single-row DataFrame (no schema validation).

    Raises:
        TypeError: if ``patient_dict`` is not a dict.
    """
    if isinstance(patient_dict, dict):
        return pd.DataFrame([patient_dict])
    raise TypeError('patient_dict must be a dict')


def align_dataframe(
    df_in: pd.DataFrame,
    expected_columns: list[str] = EXPECTED_COLUMNS,
    num_cols: list[str] = NUM_COLS,
    cat_cols: list[str] = CAT_COLS,
    flag_cols: list[str] = FLAG_COLS,
) -> pd.DataFrame:
    """Return a copy of ``df_in`` reshaped to the expected schema.

    - Columns absent from the input are created with a default:
      numeric -> 0.0, categorical -> the column's entry in CAT_DEFAULT_BY_COL
      (else "Unknown"), flags and any other column -> 0.
    - Only ``expected_columns`` are kept, in that order; extra columns are dropped.
    - Numeric columns are coerced to float and flag columns to int, with
      unparseable or missing values becoming 0.
    - Categorical columns become strings, with missing values replaced by the
      column's categorical default.

    Raises:
        TypeError: if ``df_in`` is not a pandas DataFrame.
    """
    if not isinstance(df_in, pd.DataFrame):
        raise TypeError('df_in must be a pandas DataFrame')

    def _fill_value(column):
        # Default for a column that the input does not provide at all.
        if column in num_cols:
            return 0.0
        if column in cat_cols:
            return CAT_DEFAULT_BY_COL.get(column, 'Unknown')
        return 0  # flag columns and anything unclassified

    aligned = df_in.copy()
    for column in expected_columns:
        if column not in aligned.columns:
            aligned[column] = _fill_value(column)

    # Drop extras and enforce the training-time column order.
    aligned = aligned.loc[:, expected_columns]

    for column in num_cols:
        parsed = pd.to_numeric(aligned[column], errors='coerce')
        aligned[column] = parsed.fillna(0.0).astype(float)

    for column in flag_cols:
        parsed = pd.to_numeric(aligned[column], errors='coerce')
        aligned[column] = parsed.fillna(0).astype(int)

    for column in cat_cols:
        raw = aligned[column].astype(object)
        filled = raw.where(raw.notna(), CAT_DEFAULT_BY_COL.get(column, 'Unknown'))
        aligned[column] = filled.astype(str)

    return aligned


In [5]:
# RSF survival prediction helpers

def _step_fn_value_at(step_fn, t_days: float) -> float:
    """Evaluate a scikit-survival StepFunction at horizon t_days.

    Strategy:
    - If possible, clamp t to the maximum supported time in the step function
    - Fall back to a manual 'last time <= t' rule using (x,y)
    """
    t = float(t_days)

    if hasattr(step_fn, 'x'):
        try:
            x = np.asarray(step_fn.x, dtype=float)
            if x.size > 0:
                t = min(t, float(x.max()))
        except Exception:
            pass

    try:
        return float(step_fn(t))
    except Exception:
        if hasattr(step_fn, 'x') and hasattr(step_fn, 'y'):
            x = np.asarray(step_fn.x, dtype=float)
            y = np.asarray(step_fn.y, dtype=float)
            if x.size == 0 or y.size == 0:
                return float('nan')
            idx = int(np.searchsorted(x, t, side='right') - 1)
            idx = max(0, min(idx, y.size - 1))
            return float(y[idx])
        raise


def rsf_predict_survival_at_horizons(
    X_aligned: pd.DataFrame,
    horizons_days: list[int] = HORIZONS_DAYS,
) -> pd.DataFrame:
    """Evaluate the RSF survival curve of every row at three horizons.

    Args:
        X_aligned: Features already aligned to the expected schema.
        horizons_days: Exactly three horizons; they populate ``S_1y``, ``S_3y``
            and ``S_5y`` in that order.

    Returns:
        DataFrame indexed like ``X_aligned`` with columns ``S_1y``, ``S_3y``,
        ``S_5y`` and ``risk_3y`` (= 1 - ``S_3y``).
    """
    rsf_steps = risk_model_rsf.named_steps
    preprocess_step = rsf_steps['preprocess']
    survival_model = rsf_steps['model']

    curves = survival_model.predict_survival_function(preprocess_step.transform(X_aligned))

    t1, t3, t5 = horizons_days

    # One (S_1y, S_3y, S_5y) triple per patient, in row order.
    per_row = [
        (
            _step_fn_value_at(curve, t1),
            _step_fn_value_at(curve, t3),
            _step_fn_value_at(curve, t5),
        )
        for curve in curves
    ]

    return pd.DataFrame(
        {
            'S_1y': [triple[0] for triple in per_row],
            'S_3y': [triple[1] for triple in per_row],
            'S_5y': [triple[2] for triple in per_row],
            'risk_3y': [float(1.0 - triple[1]) for triple in per_row],
        },
        index=X_aligned.index,
    )


def cox_predict_survival_at_horizons(
    X_aligned: pd.DataFrame,
    horizons_days: list[int] = HORIZONS_DAYS,
) -> pd.DataFrame | None:
    """Optional Cox baseline: survival at three horizons for every row.

    Returns:
        None when no Cox model was loaded; otherwise a DataFrame indexed like
        ``X_aligned`` with columns ``S_1y_cox``, ``S_3y_cox`` and ``S_5y_cox``.
    """
    if risk_model_cox is None:
        return None

    cox_steps = risk_model_cox.named_steps
    preprocess_step = cox_steps['preprocess']
    survival_model = cox_steps['model']

    curves = survival_model.predict_survival_function(preprocess_step.transform(X_aligned))

    t1, t3, t5 = horizons_days

    # One (1y, 3y, 5y) triple per patient, in row order.
    per_row = [
        (
            _step_fn_value_at(curve, t1),
            _step_fn_value_at(curve, t3),
            _step_fn_value_at(curve, t5),
        )
        for curve in curves
    ]

    return pd.DataFrame(
        {
            'S_1y_cox': [triple[0] for triple in per_row],
            'S_3y_cox': [triple[1] for triple in per_row],
            'S_5y_cox': [triple[2] for triple in per_row],
        },
        index=X_aligned.index,
    )


In [6]:
# Risk-group thresholds (terciles) from precomputed cutpoints (recommended for production)
#
# In production you should NOT load any patient CSV to compute these.
# Instead, model.ipynb exports artifact/risk_cutpoints.json with two numbers:
#   - q33: 33% quantile of risk_3y on the training set
#   - q66: 66% quantile of risk_3y on the training set

# (q33, q66) cutpoints for risk_3y, or None when the cutpoints file is absent.
RISK_TERCILES: tuple[float, float] | None = None

cutpoints_path = PATHS.get('risk_cutpoints', ARTIFACT_DIR / 'risk_cutpoints.json')
if not cutpoints_path.exists():
    print('Warning: risk_cutpoints.json not found -> risk_group will be None.')
    print('Generate it by running model.ipynb export cells to create:', cutpoints_path)
else:
    cutpoints = json.loads(cutpoints_path.read_text(encoding='utf-8'))

    q33, q66 = float(cutpoints['q33']), float(cutpoints['q66'])
    RISK_TERCILES = (q33, q66)

    print('Loaded risk cutpoints from:', cutpoints_path)
    print('  q33:', q33)
    print('  q66:', q66)


def risk_group_from_risk_3y(risk_3y: float, terciles: tuple[float, float] | None = RISK_TERCILES) -> str | None:
    """Translate a 3-year risk into 'bajo', 'medio' or 'alto'.

    Both cutpoints are inclusive upper bounds: ``risk_3y <= q33`` is 'bajo',
    otherwise ``risk_3y <= q66`` is 'medio', and anything above is 'alto'.

    Returns:
        The group label, or None when no cutpoints are available or the risk
        is not a finite number.
    """
    if terciles is None:
        return None
    if not np.isfinite(risk_3y):
        return None

    lower_cut, upper_cut = terciles
    return 'bajo' if risk_3y <= lower_cut else ('medio' if risk_3y <= upper_cut else 'alto')


Loaded risk cutpoints from: artifact/risk_cutpoints.json
  q33: 0.10676842322737941
  q66: 0.32160908694597884


In [7]:
# Warning helpers

# Expected columns whose name ends in "_miss"; compute_many_missing_flag sums them per row.
# NOTE(review): presumably 0/1 missing-value indicators — confirm against the training notebook.
MISS_COLS = [c for c in EXPECTED_COLUMNS if c.endswith('_miss')]


def compute_many_missing_flag(
    X_aligned: pd.DataFrame,
    missing_threshold: int = 2,
    ratio_threshold: float = 0.67,
) -> pd.Series:
    """Flag rows whose ``*_miss`` columns add up to "many".

    A row is flagged when the sum of its MISS_COLS values is at least
    ``missing_threshold`` OR that sum divided by the number of MISS_COLS is at
    least ``ratio_threshold``.

    Returns:
        Boolean Series indexed like ``X_aligned``; all False when the schema
        has no ``*_miss`` columns.
    """
    if len(MISS_COLS) == 0:
        return pd.Series(False, index=X_aligned.index)

    n_missing = X_aligned[MISS_COLS].sum(axis=1)
    share_missing = n_missing / float(len(MISS_COLS))

    too_many = n_missing >= missing_threshold
    too_large_share = share_missing >= ratio_threshold
    return too_many | too_large_share


def compute_unknown_category_flag(X_aligned: pd.DataFrame) -> pd.Series:
    """Flag rows holding at least one categorical value the encoder never saw.

    Values are compared as strings against KNOWN_CATEGORIES. Columns with no
    (or an empty) known-category set are skipped.

    Returns:
        Boolean Series indexed like ``X_aligned``; all False when the encoder
        categories could not be inspected.
    """
    has_unknown = pd.Series(False, index=X_aligned.index)
    if KNOWN_CATEGORIES is None:
        return has_unknown

    for column in CAT_COLS:
        allowed = KNOWN_CATEGORIES.get(column)
        if allowed:
            column_values = X_aligned[column].astype(str)
            has_unknown = has_unknown | ~column_values.isin(allowed)

    return has_unknown


def compute_low_confidence_flag(p_subcluster11: float | None, margin: float = 0.10) -> bool:
    if p_subcluster11 is None or not np.isfinite(p_subcluster11):
        return False
    return abs(float(p_subcluster11) - 0.5) < float(margin)


In [8]:
# Cluster + subcluster assignment

# Half-width of the band around 0.5 inside which p_cluster1 is flagged as low confidence.
CLUSTER_CONFIDENCE_MARGIN = 0.10


def _get_final_estimator(pipeline):
    if hasattr(pipeline, 'named_steps'):
        for key in ['clf', 'model', 'classifier', 'estimator']:
            if key in pipeline.named_steps:
                return pipeline.named_steps[key]
        return pipeline.steps[-1][1]
    return pipeline


def assign_cluster_and_subcluster(
    X_aligned: pd.DataFrame,
    threshold: float = 0.5,
    margin: float = 0.10,
) -> pd.DataFrame:
    """Assign cluster (1/2) and, if cluster==1, assign subcluster within cluster 1.

    Output columns (indexed like ``X_aligned``):
    - ``cluster_pred``: predicted cluster as int.
    - ``p_cluster1``: P(cluster==1), NaN when it cannot be computed.
    - ``low_confidence_cluster``: |p_cluster1 - 0.5| < CLUSTER_CONFIDENCE_MARGIN.
    - ``subcluster_pred``: original subcluster label for cluster-1 rows, else None.
    - ``p_subcluster11``: P(original label 11) for cluster-1 rows, else NaN.
    - ``threshold_used``: ``threshold`` for cluster-1 rows, else NaN.
    - ``low_confidence_flag``: True for cluster-1 rows whose ``p_subcluster11``
      is within ``margin`` of 0.5 or, when the label is decided by
      ``threshold``, within ``margin`` of that threshold. With the default
      threshold of 0.5 both conditions coincide.

    Args:
        X_aligned: Features already aligned to the expected schema.
        threshold: In the binary case, rows with ``p_subcluster11 >= threshold``
            get label 11 and the rest get the other label.
        margin: Half-width of the subcluster low-confidence band. It does not
            affect the cluster flag.

    Raises:
        ValueError: if the label map has no entry for original label "11", the
            subcluster classifier does not expose ``classes_`` or lacks the
            mapped label, or label 11 is not among the original labels.
    """

    out = pd.DataFrame(index=X_aligned.index)

    # --- Cluster prediction ---
    out['cluster_pred'] = cluster_assigner.predict(X_aligned).astype(int)

    # --- Cluster confidence (probability of cluster==1) ---
    out['p_cluster1'] = np.nan
    out['low_confidence_cluster'] = False

    if hasattr(cluster_assigner, 'predict_proba'):
        # Best effort: a failure here only leaves the confidence columns at their defaults.
        try:
            cluster_clf = _get_final_estimator(cluster_assigner)
            if not hasattr(cluster_clf, 'classes_'):
                raise ValueError('cluster_assigner classifier does not expose classes_.')

            classes = np.asarray(cluster_clf.classes_)
            idx_1 = np.where(classes == 1)[0]
            if idx_1.size == 0:
                raise ValueError(f'cluster_assigner classes_ does not include label 1: {classes.tolist()}')

            proba = cluster_assigner.predict_proba(X_aligned)
            p_cluster1 = proba[:, int(idx_1[0])].astype(float)
            out['p_cluster1'] = p_cluster1

            p = out['p_cluster1'].astype(float)
            out['low_confidence_cluster'] = p.notna() & (np.abs(p - 0.5) < CLUSTER_CONFIDENCE_MARGIN)
        except Exception as e:
            print('Warning: could not compute p_cluster1 / low_confidence_cluster:', repr(e))
    else:
        print(
            'Note: cluster_assigner has no predict_proba(); '
            'p_cluster1 will be NaN and low_confidence_cluster will be False.'
        )

    # --- Default subcluster outputs ---
    out['subcluster_pred'] = pd.Series([None] * len(out), index=out.index, dtype=object)
    out['p_subcluster11'] = np.nan
    out['threshold_used'] = np.nan
    out['low_confidence_flag'] = False

    # If not in cluster 1, subcluster assignment is not applicable.
    idx_cluster1 = out['cluster_pred'] == 1
    if not idx_cluster1.any():
        return out

    # Identify which model class corresponds to original label 11
    o2m = SUBCLUSTER1_LABEL_MAP.get('original_to_model', {})
    orig_labels_sorted = SUBCLUSTER1_LABEL_MAP.get('original_labels_sorted')

    if '11' not in o2m:
        raise ValueError('subcluster1_label_map.json does not contain a mapping for original label "11".')

    model_label_11 = int(o2m['11'])

    sub_clf = _get_final_estimator(subcluster1_assigner)
    if not hasattr(sub_clf, 'classes_'):
        raise ValueError('subcluster1_assigner classifier does not expose classes_.')

    sub_classes = np.asarray(sub_clf.classes_)
    label_positions = np.where(sub_classes == model_label_11)[0]
    if label_positions.size == 0:
        # Descriptive error instead of an IndexError from indexing an empty match.
        raise ValueError(
            f'subcluster1_assigner classes_ does not include model label {model_label_11}: '
            f'{sub_classes.tolist()}'
        )
    col_idx = int(label_positions[0])

    sub_proba = subcluster1_assigner.predict_proba(X_aligned.loc[idx_cluster1])
    p11 = sub_proba[:, col_idx].astype(float)

    out.loc[idx_cluster1, 'p_subcluster11'] = p11
    out.loc[idx_cluster1, 'threshold_used'] = float(threshold)

    # Determine the alternative original label (assumes binary within cluster 1)
    if orig_labels_sorted is None:
        orig_labels_sorted = sorted(int(k) for k in o2m.keys())

    orig_labels_sorted = [int(x) for x in orig_labels_sorted]
    if 11 not in orig_labels_sorted:
        raise ValueError(f'Expected original label 11 in original_labels_sorted; got {orig_labels_sorted}')

    is_binary = len(orig_labels_sorted) == 2
    if not is_binary:
        # Fallback to argmax if not binary
        pred_model = subcluster1_assigner.predict(X_aligned.loc[idx_cluster1]).astype(int)
        m2o = SUBCLUSTER1_LABEL_MAP.get('model_to_original', {})
        out.loc[idx_cluster1, 'subcluster_pred'] = [int(m2o.get(str(v), v)) for v in pred_model]
    else:
        other_label = next(lbl for lbl in orig_labels_sorted if lbl != 11)
        out.loc[idx_cluster1, 'subcluster_pred'] = np.where(p11 >= float(threshold), 11, other_label)

    # Confidence flag for subcluster assignment. The band around 0.5 is kept, and
    # when the label is decided by `threshold` a band around that threshold is
    # added so that near-boundary assignments are flagged for custom thresholds too.
    band = float(margin)
    near_boundary = np.abs(p11 - 0.5) < band
    if is_binary:
        near_boundary = near_boundary | (np.abs(p11 - float(threshold)) < band)
    out.loc[idx_cluster1, 'low_confidence_flag'] = np.isfinite(p11) & near_boundary

    return out


In [9]:
# Main: predict_one

def predict_one(
    patient_dict: dict,
    threshold: float = 0.5,
    margin: float = 0.10,
    missing_threshold: int = 3,
) -> dict:
    """Predict profile + RSF survival for a single patient dict.

    The patient is aligned to the expected schema, assigned a cluster and (for
    cluster 1) a subcluster, scored with the RSF model and mapped to a risk
    group. A human-readable summary is printed as a side effect.

    Args:
        patient_dict: Raw feature values keyed by column name.
        threshold: Probability cut-off forwarded to the subcluster assignment.
        margin: Low-confidence margin forwarded to the subcluster assignment.
        missing_threshold: Forwarded to ``compute_many_missing_flag``.

    Returns:
        Dict with ``cluster_pred``, ``p_cluster1``, ``low_confidence_cluster``,
        ``subcluster_pred``, ``p_subcluster11``, ``threshold_used``, ``S_1y``,
        ``S_3y``, ``S_5y``, ``risk_3y``, ``risk_group`` and a nested
        ``warnings`` dict. Values that do not apply are None.
    """

    df_one = build_dataframe_from_dict(patient_dict)
    X_one = align_dataframe(df_one)

    # Warnings based on inputs only
    many_missing_flag = bool(compute_many_missing_flag(X_one, missing_threshold=missing_threshold).iloc[0])
    unknown_category_flag = bool(compute_unknown_category_flag(X_one).iloc[0])

    profile = assign_cluster_and_subcluster(X_one, threshold=threshold, margin=margin).iloc[0].to_dict()
    surv = rsf_predict_survival_at_horizons(X_one).iloc[0].to_dict()

    risk_group = risk_group_from_risk_3y(float(surv['risk_3y']))

    # Normalise the subcluster label to a plain int (or None when not applicable)
    # so it is typed like the other fields instead of a numpy scalar or NaN.
    raw_subcluster = profile['subcluster_pred']
    subcluster_pred = None if pd.isna(raw_subcluster) else int(raw_subcluster)

    result = {
        'cluster_pred': int(profile['cluster_pred']),
        'p_cluster1': None if pd.isna(profile.get('p_cluster1', np.nan)) else float(profile['p_cluster1']),
        'low_confidence_cluster': bool(profile.get('low_confidence_cluster', False)),
        'subcluster_pred': subcluster_pred,
        'p_subcluster11': None if pd.isna(profile['p_subcluster11']) else float(profile['p_subcluster11']),
        'threshold_used': None if pd.isna(profile['threshold_used']) else float(profile['threshold_used']),
        'S_1y': float(surv['S_1y']),
        'S_3y': float(surv['S_3y']),
        'S_5y': float(surv['S_5y']),
        'risk_3y': float(surv['risk_3y']),
        'risk_group': risk_group,
        'warnings': {
            'many_missing_flag': many_missing_flag,
            'low_confidence_cluster': bool(profile.get('low_confidence_cluster', False)),
            'low_confidence_flag': bool(profile.get('low_confidence_flag', False)),
            'unknown_category_flag': unknown_category_flag,
        },
    }

    # Human-readable output
    print()
    print('=== Patient prediction ===')
    print('cluster_pred:', result['cluster_pred'])
    print('p_cluster1:', result['p_cluster1'])
    print('low_confidence_cluster:', result['low_confidence_cluster'])

    # Subcluster fields only apply to cluster 1.
    if result['cluster_pred'] == 1:
        print('subcluster_pred:', result['subcluster_pred'])
        print('p_subcluster11:', result['p_subcluster11'])
        print('threshold_used:', result['threshold_used'])

    print(f"S(1y)={result['S_1y']:.3f} | S(3y)={result['S_3y']:.3f} | S(5y)={result['S_5y']:.3f}")
    print(f"risk_3y={result['risk_3y']:.3f} | risk_group={result['risk_group']}")

    if any(result['warnings'].values()):
        print('Warnings:', result['warnings'])
    else:
        print('Warnings: none')

    if unknown_category_flag:
        print('Note: OneHotEncoder is configured with handle_unknown="ignore" (unknown categories are safely ignored).')

    return result


In [10]:
# Main: predict_batch

def predict_batch(
    input_csv_path: str | Path,
    output_csv_path: str | Path = 'predicciones_calculadora.csv',
    threshold: float = 0.5,
    margin: float = 0.10,
    missing_threshold: int = 3,
) -> pd.DataFrame:
    """Predict for a batch CSV (one row per patient) and save the result.

    Args:
        input_csv_path: CSV to score. A unique ``Unnamed: 0`` column, if
            present, is used as the row index.
        output_csv_path: Destination CSV; missing parent folders are created.
        threshold: Probability cut-off forwarded to the subcluster assignment.
        margin: Low-confidence margin forwarded to the subcluster assignment.
        missing_threshold: Forwarded to ``compute_many_missing_flag``.

    Returns:
        DataFrame with the profile columns, the survival columns, ``risk_group``,
        ``many_missing_flag`` and ``unknown_category_flag``. ``subcluster_pred``
        uses the nullable ``Int64`` dtype (``<NA>`` outside cluster 1).

    Raises:
        FileNotFoundError: if ``input_csv_path`` does not exist.
    """

    input_csv_path = Path(input_csv_path)
    if not input_csv_path.exists():
        raise FileNotFoundError(f'Input CSV not found: {input_csv_path}')

    df_in = pd.read_csv(input_csv_path)
    if 'Unnamed: 0' in df_in.columns and df_in['Unnamed: 0'].is_unique:
        df_in = df_in.set_index('Unnamed: 0')

    X = align_dataframe(df_in)

    # Profile (includes p_cluster1 + low_confidence_cluster)
    profile_df = assign_cluster_and_subcluster(X, threshold=threshold, margin=margin)

    # Survival
    surv_df = rsf_predict_survival_at_horizons(X)

    # Warnings
    many_missing = compute_many_missing_flag(X, missing_threshold=missing_threshold)
    unknown_cat = compute_unknown_category_flag(X)

    # Risk groups
    risk_groups = surv_df['risk_3y'].apply(lambda v: risk_group_from_risk_3y(float(v)))

    out = pd.concat([profile_df, surv_df], axis=1)

    # Subcluster labels are integers, but mixing them with "not applicable" rows
    # can leave the column as float (12.0) or object. A nullable integer dtype
    # keeps the labels as 11/12 both in the frame and in the CSV.
    out['subcluster_pred'] = pd.to_numeric(out['subcluster_pred']).astype('Int64')

    out['risk_group'] = risk_groups
    out['many_missing_flag'] = many_missing.astype(bool)
    out['unknown_category_flag'] = unknown_cat.astype(bool)

    # Persist (create the destination folder first so a new path does not fail)
    output_csv_path = Path(output_csv_path)
    output_csv_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(output_csv_path, index=True)
    print('Saved:', output_csv_path)

    return out


In [11]:
# Demo

from IPython.display import display

# Candidate demo files, tried in order; the first one that exists is used.
demo_candidates = [
    Path('cluster/df_con_subclusters.csv'),
    Path('df_con_subclusters.csv'),
    Path('cluster/df_con_clusters.csv'),
    Path('df_con_clusters.csv'),
]

demo_path = None
for _candidate in demo_candidates:
    if _candidate.exists():
        demo_path = _candidate
        break

if demo_path is not None:
    print('Running demo using:', demo_path)
    df_demo = pd.read_csv(demo_path)
    if 'Unnamed: 0' in df_demo.columns and df_demo['Unnamed: 0'].is_unique:
        df_demo = df_demo.set_index('Unnamed: 0')

    # Single-patient demo on (at most) two rows sampled with the fixed seed.
    n_demo_rows = min(2, df_demo.shape[0])
    df_demo_2 = df_demo.sample(n=n_demo_rows, random_state=RANDOM_STATE)

    for idx, row in df_demo_2.iterrows():
        patient_dict = row.to_dict()
        print('\n--- Demo patient index:', idx, '---')
        _ = predict_one(patient_dict)

    # Batch demo on the whole file.
    print('\nBatch demo: saving predicciones_calculadora.csv')
    pred_batch = predict_batch(demo_path, output_csv_path='predicciones_calculadora.csv')
    print('Preview:')
    display(pred_batch.head())
else:
    print('No demo CSV found. To demo, provide df_con_subclusters.csv or df_con_clusters.csv.')


Running demo using: cluster/df_con_subclusters.csv

--- Demo patient index: 125 ---

=== Patient prediction ===
cluster_pred: 1
p_cluster1: 0.7505195278116013
low_confidence_cluster: False
subcluster_pred: 11
p_subcluster11: 0.9936728944993931
threshold_used: 0.5
S(1y)=0.759 | S(3y)=0.363 | S(5y)=0.240
risk_3y=0.637 | risk_group=alto

--- Demo patient index: 51 ---

=== Patient prediction ===
cluster_pred: 1
p_cluster1: 0.6909448148222535
low_confidence_cluster: False
subcluster_pred: 12
p_subcluster11: 0.024528676267898897
threshold_used: 0.5
S(1y)=0.976 | S(3y)=0.888 | S(5y)=0.859
risk_3y=0.112 | risk_group=medio

Batch demo: saving predicciones_calculadora.csv
Saved: predicciones_calculadora.csv
Preview:


Unnamed: 0_level_0,cluster_pred,p_cluster1,low_confidence_cluster,subcluster_pred,p_subcluster11,threshold_used,low_confidence_flag,S_1y,S_3y,S_5y,risk_3y,risk_group,many_missing_flag,unknown_category_flag
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2,0.052351,False,,,,False,0.837742,0.591839,0.403103,0.408161,alto,False,False
1,2,0.091775,False,,,,False,0.932035,0.793764,0.697461,0.206236,medio,False,False
2,1,0.996905,False,12.0,0.042484,0.5,False,0.849486,0.557211,0.43305,0.442789,alto,False,False
3,2,0.013298,False,,,,False,0.975269,0.878956,0.854979,0.121044,medio,False,False
4,2,0.010296,False,,,,False,0.990009,0.959361,0.940761,0.040639,bajo,False,False
