In [None]:
import pandas as pd
import numpy as np

# Tab-separated patient clinical table read by main().
IN_PATH = "nationwidechildrens.org_clinical_patient_cesc.txt"
# CSV of normalized per-patient features written by main().
OUT_PATH = "hersalud_features_only.csv"

# --- Helpers ---
def clean_str(x) -> str:
    """Return ``x`` as a stripped string, or ``"unknown"`` when it is missing.

    A value counts as missing when ``pd.isna`` reports it as null, when it is
    empty after stripping, or when it equals one of the placeholder tokens
    below (compared case-insensitively).
    """
    placeholders = ("na", "n/a", "not available", "[not available]")
    if not pd.isna(x):
        text = str(x).strip()
        if text and text.lower() not in placeholders:
            return text
    return "unknown"

def norm_yesno(x: str) -> str:
    """Map a raw yes/no-like value to ``"yes"``, ``"no"`` or ``"unknown"``.

    The value is cleaned with ``clean_str`` and lower-cased before matching.
    """
    token = clean_str(x).lower()
    spellings = {
        "yes": ("yes", "y", "true", "1"),
        "no": ("no", "n", "false", "0"),
    }
    for canonical, accepted in spellings.items():
        if token in accepted:
            return canonical
    return "unknown"

def norm_hpv(x: str) -> str:
    """Normalize a free-text HPV result to ``"positive"``, ``"negative"`` or ``"unknown"``.

    Common patterns: Positive/Negative, Detected/Not Detected.

    Precedence:
      1. any "pos" substring -> positive
      2. any "neg" substring -> negative
      3. "detect" with a negated form ("not detected", "undetected",
         "non-detected", "none detected", ...) -> negative
      4. "detect" with no "not" anywhere in the text -> positive
      5. everything else -> unknown
    """
    s = clean_str(x).lower()
    if "pos" in s:
        return "positive"
    # Checked before the bare "detect" rule so that e.g. "negative, none
    # detected" is not misread as a positive result.
    if "neg" in s:
        return "negative"
    if "detect" in s:
        negated_forms = (
            "not detect",
            "undetect",
            "non-detect",
            "nondetect",
            "non detect",
            "none detect",
        )
        if any(form in s for form in negated_forms):
            return "negative"
        # A stray "not" elsewhere in the text is ambiguous: leave as unknown.
        if "not" not in s:
            return "positive"
    return "unknown"

def to_float(x):
    """Parse ``x`` as a number, giving NaN when it is missing or unparseable."""
    text = clean_str(x)
    return np.nan if text == "unknown" else pd.to_numeric(text, errors="coerce")

def to_int(x):
    """Parse ``x`` as a whole number.

    Returns:
        A Python ``int`` when the cleaned value is an integral number
        (``"3"`` and ``"3.0"`` both give ``3``); ``np.nan`` when the value is
        missing, unparseable, infinite or has a fractional part.
    """
    text = clean_str(x)
    if text == "unknown":
        return np.nan
    # pd.to_numeric on a scalar returns a NumPy scalar, which cannot be cast
    # with .astype("Int64") (a pandas-only dtype), so convert by hand.
    value = pd.to_numeric(text, errors="coerce")
    if pd.isna(value) or not float(value).is_integer():
        return np.nan
    return int(value)

def build_immuno(history_immuno: str, hiv_status: str) -> str:
    """Combine immunosuppression history and HIV status into yes/no/unknown.

    "yes" when the history says yes or the HIV status reads as positive;
    "no" when the history says no and HIV is not positive (including when
    HIV is missing); "unknown" otherwise.
    """
    history = norm_yesno(history_immuno)
    hiv_text = clean_str(hiv_status).lower()
    hiv_positive = hiv_text in ("pos", "reactive") or "positive" in hiv_text

    if hiv_positive or history == "yes":
        return "yes"
    # HIV is known not to be positive at this point, so an explicit "no"
    # history is sufficient whether HIV is negative or missing.
    return "no" if history == "no" else "unknown"

def pick_col(df: pd.DataFrame, candidates: list[str]) -> str | None:
    """Return the first candidate column that exists."""
    for c in candidates:
        if c in df.columns:
            return c
    return None

def main():
    """Build the normalized per-patient feature table and write it to OUT_PATH.

    Reads the tab-separated patient file at IN_PATH, normalizes the core
    features (age, HPV status, smoking, immunosuppression), attaches optional
    fields when they exist, adds an empty ``label`` placeholder column and
    saves the result as CSV.

    Raises:
        ValueError: if a required column is absent from the patient file.
    """
    # Patient files usually have:
    # row0: header
    # row1: duplicate header
    # row2: CDE_ID row
    # then data
    df = pd.read_csv(IN_PATH, sep="\t", header=0, skiprows=[1, 2], dtype=str, low_memory=False)

    # --- Required / Core ---
    # (Sometimes names vary slightly; we handle HPV variants below)
    required = ["bcr_patient_barcode", "age_at_diagnosis", "tobacco_smoking_history_indicator",
                "history_immunosuppressive_disease"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns in patient file: {missing}")

    # HPV: try a few likely names
    hpv_col = pick_col(df, [
        "hpv_test_results",
        "hpv_status",
        "hpv_result",
        "hpv_test_result"
    ])
    if hpv_col is None:
        # If the file truly lacks an HPV column, we still proceed with unknown
        print("WARNING: No HPV column found. hpv_status will be 'unknown' for all rows.")

    # HIV: optional but helpful
    hiv_col = pick_col(df, ["hiv_status", "hiv_status_at_diagnosis"])

    # --- Optional "future use" patient-level fields ---
    # (all may not exist; we include if present)
    optional_cols = {
        # demographics
        "gender": ["gender"],
        "race": ["race"],
        "ethnicity": ["ethnicity"],

        # body / menopause
        "menopause_status": ["menopause_status"],
        "height_cm_at_diagnosis": ["height_cm_at_diagnosis"],
        "weight_kg_at_diagnosis": ["weight_kg_at_diagnosis"],

        # reproductive / contraceptive
        "history_hormonal_contraceptives_use": ["history_hormonal_contraceptives_use"],
        "live_birth_pregnancy_count": ["live_birth_pregnancy_count"],
        "total_pregnancy_count": ["total_pregnancy_count"],

        # immune context extras (optional)
        "cd4_counts_at_diagnosis": ["cd4_counts_at_diagnosis"],
    }
    # Output columns that hold numbers; a missing source column becomes NaN
    # for all of them so the column type does not depend on the input file.
    float_fields = {"height_cm_at_diagnosis", "weight_kg_at_diagnosis", "cd4_counts_at_diagnosis"}
    count_fields = {"live_birth_pregnancy_count", "total_pregnancy_count"}

    out = pd.DataFrame()
    out["patient_id"] = df["bcr_patient_barcode"].astype(str)

    # Core normalized features (HerSalud MVP)
    out["age_years"] = pd.to_numeric(df["age_at_diagnosis"], errors="coerce")
    out["hpv_status"] = df[hpv_col].apply(norm_hpv) if hpv_col else "unknown"
    out["ever_smoked"] = df["tobacco_smoking_history_indicator"].apply(norm_yesno)

    # When there is no HIV column, every row is treated as "unknown".
    if hiv_col:
        hiv_values = df[hiv_col]
    else:
        hiv_values = pd.Series("unknown", index=df.index)
    out["immunosuppressed"] = [
        build_immuno(history, hiv)
        for history, hiv in zip(df["history_immunosuppressive_disease"], hiv_values)
    ]

    # Extra columns (kept raw-ish but cleaned)
    out["hiv_status_raw"] = hiv_values.apply(clean_str)

    # HPV extras if present in the file
    hpv_types_col = pick_col(df, ["hpv_types_positive"])
    hpv_method_col = pick_col(df, ["hpv_typing_method"])
    if hpv_types_col:
        out["hpv_types_positive_raw"] = df[hpv_types_col].apply(clean_str)
    if hpv_method_col:
        out["hpv_typing_method_raw"] = df[hpv_method_col].apply(clean_str)

    # Attach optional fields if present
    for out_name, candidates in optional_cols.items():
        col = pick_col(df, candidates)
        is_numeric = out_name in float_fields or out_name in count_fields
        if col is None:
            out[out_name] = np.nan if is_numeric else "unknown"
        elif out_name in float_fields:
            out[out_name] = df[col].apply(to_float)
        elif out_name in count_fields:
            out[out_name] = pd.to_numeric(df[col], errors="coerce")
        else:
            out[out_name] = df[col].apply(clean_str)

    # Label placeholder (filled later)
    out["label"] = np.nan

    out.to_csv(OUT_PATH, index=False)
    print(f"Saved: {OUT_PATH} (rows={len(out)})")

if __name__ == "__main__":
    main()


Saved: hersalud_features_only.csv (rows=307)


In [None]:
import pandas as pd
import numpy as np

# Feature table read by main() (must contain a patient_id column).
FEATURES_CSV = "hersalud_features_only.csv"
# Tab-separated follow-up table read by load_followup_table().
FOLLOWUP_TXT = "nationwidechildrens.org_clinical_follow_up_v2.0_cesc.txt"
# Labeled training table written by main().
OUT_CSV = "clinical_only_training.csv"

def clean_str(x) -> str:
    """Return ``x`` as a stripped string, or ``""`` when it is missing.

    Nulls (per ``pd.isna``), blank strings and the placeholder tokens below
    (case-insensitive) all collapse to the empty string.
    """
    if pd.isna(x):
        return ""
    text = str(x).strip()
    missing_tokens = ("", "na", "n/a", "not available", "[not available]")
    return "" if text.lower() in missing_tokens else text

def norm_yesno_unknown(x: str) -> str:
    """Map a raw indicator to ``"yes"``, ``"no"`` or ``"unknown"``.

    Anything that is not a recognised yes/no spelling, including blanks and
    text mentioning "unknown", is reported as ``"unknown"``.
    """
    lookup = {
        "yes": "yes", "y": "yes", "true": "yes", "1": "yes",
        "no": "no", "n": "no", "false": "no", "0": "no",
    }
    return lookup.get(clean_str(x).lower(), "unknown")

def tumor_status_to_signal(x: str) -> str:
    """Translate free-text tumor status to ``"with_tumor"``, ``"tumor_free"`` or ``""``.

    Rules are evaluated in order and the first match wins.
    """
    text = clean_str(x).lower()
    ordered_rules = (
        (("with tumor", "with_tumor"), "with_tumor"),
        (("tumor free", "tumor_free"), "tumor_free"),
        (("recur", "progress"), "with_tumor"),
        (("no evidence",), "tumor_free"),
    )
    for needles, signal in ordered_rules:
        if any(needle in text for needle in needles):
            return signal
    # "ned" only counts as an exact match, not as a substring
    return "tumor_free" if text == "ned" else ""

def load_followup_table(path: str) -> pd.DataFrame:
    """Read the tab-separated follow-up file at ``path`` with every column as text.

    Rows 1 and 2 (the duplicate header and the CDE row) are skipped.
    """
    read_options = dict(sep="\t", header=0, skiprows=[1, 2], dtype=str, low_memory=False)
    return pd.read_csv(path, **read_options)

def main():
    """Derive a per-patient label from the follow-up file and save labeled rows.

    A follow-up row is labeled 1 when it reports a new tumor event or a
    "with tumor" status, 0 when it reports no event or a tumor-free status,
    and is left unlabeled otherwise. A patient is 1 if any row is 1, else 0 if
    any row is 0. Feature rows without a label are dropped before saving.

    Raises:
        ValueError: if the features file lacks ``patient_id`` or the
            follow-up file lacks a required column.
    """
    features = pd.read_csv(FEATURES_CSV, dtype=str, low_memory=False)
    if "patient_id" not in features.columns:
        raise ValueError("features file must contain patient_id")

    # Everything was read as text; restore age to a numeric column if present
    if "age_years" in features.columns:
        features["age_years"] = pd.to_numeric(features["age_years"], errors="coerce")

    followup = load_followup_table(FOLLOWUP_TXT)

    expected = {"bcr_patient_barcode", "new_tumor_event_dx_indicator", "tumor_status"}
    absent = expected.difference(followup.columns)
    if absent:
        raise ValueError(f"Follow-up file missing required columns: {sorted(absent)}")

    followup["patient_id"] = followup["bcr_patient_barcode"].astype(str)
    followup["nte_ind"] = followup["new_tumor_event_dx_indicator"].apply(norm_yesno_unknown)
    followup["tumor_sig"] = followup["tumor_status"].apply(tumor_status_to_signal)

    def row_label(nte: str, tumor_sig: str):
        # Any positive evidence wins over any negative evidence
        if nte == "yes" or tumor_sig == "with_tumor":
            return 1
        if nte == "no" or tumor_sig == "tumor_free":
            return 0
        return np.nan

    followup["row_label"] = [
        row_label(nte, sig) for nte, sig in zip(followup["nte_ind"], followup["tumor_sig"])
    ]

    def collapse_patient(series: pd.Series):
        # 1 takes priority over 0; no labeled rows at all -> NaN
        observed = set(series.dropna().astype(int))
        for outcome in (1, 0):
            if outcome in observed:
                return outcome
        return np.nan

    per_patient = followup.groupby("patient_id")["row_label"].apply(collapse_patient)
    labels = per_patient.reset_index(name="label_from_followup")

    merged = features.merge(labels, on="patient_id", how="left")

    # Final label: use follow-up label (preferred), else keep existing placeholder
    merged["label"] = pd.to_numeric(merged.get("label", np.nan), errors="coerce")
    merged["label_from_followup"] = pd.to_numeric(merged["label_from_followup"], errors="coerce")
    merged["label"] = merged["label_from_followup"].combine_first(merged["label"])
    merged = merged.drop(columns=["label_from_followup"])

    # Only labeled rows are kept for training
    train = merged.dropna(subset=["label"]).copy()
    train["label"] = train["label"].astype(int)

    train.to_csv(OUT_CSV, index=False)
    print(f"Saved: {OUT_CSV}")
    print(f"Rows total features: {len(features)}")
    print(f"Rows with labels:    {len(train)}")
    print("Label distribution:")
    print(train["label"].value_counts(dropna=False))

if __name__ == "__main__":
    main()


Saved: clinical_only_training.csv
Rows total features: 307
Rows with labels:    50
Label distribution:
label
0    32
1    18
Name: count, dtype: int64


In [None]:
import pandas as pd

# Load the files directly
# Feature table, raw follow-up table and labeled training table.
feats = pd.read_csv("hersalud_features_only.csv")
# Skip rows 1 and 2 (extra header rows) and keep every column as text.
fu = pd.read_csv(
    "nationwidechildrens.org_clinical_follow_up_v2.0_cesc.txt",
    sep="\t",
    header=0,
    skiprows=[1, 2],
    dtype=str,
    low_memory=False
)
merged = pd.read_csv("clinical_only_training.csv")

# Compare distinct-patient counts across the three tables to see how many
# patients the follow-up file actually covers.
print("Patients in patient file:", feats["patient_id"].nunique())
print("Patients in follow-up file:", fu["bcr_patient_barcode"].nunique())
print("Patients in final training table:", merged["patient_id"].nunique())
print("Label distribution:")
print(merged["label"].value_counts())



Patients in patient file: 307
Patients in follow-up file: 50
Patients in final training table: 50
Label distribution:
label
0    32
1    18
Name: count, dtype: int64


In [None]:
import pandas as pd

# Re-read the raw follow-up table (rows 1 and 2 skipped, all columns as text).
fu = pd.read_csv(
    "nationwidechildrens.org_clinical_follow_up_v2.0_cesc.txt",
    sep="\t", header=0, skiprows=[1,2], dtype=str, low_memory=False
)

# Row count vs. distinct patients vs. distinct follow-up forms.
print("Rows in follow-up file:", len(fu))
print("Unique patients:", fu["bcr_patient_barcode"].nunique())
print("Unique followup forms:", fu["bcr_followup_uuid"].nunique() if "bcr_followup_uuid" in fu.columns else "N/A")
# NOTE(review): notna() only detects real nulls; placeholder text such as
# "[Not Available]" would still be counted as non-missing here - confirm.
print("Non-missing new_tumor_event_dx_indicator:", fu["new_tumor_event_dx_indicator"].notna().sum())


Rows in follow-up file: 52
Unique patients: 50
Unique followup forms: 52
Non-missing new_tumor_event_dx_indicator: 52


In [None]:
# build_survival_labels_from_patient.py
# Creates a ~300-row survival-labeled training table using ONLY clinical_patient_cesc.txt
# Inputs:
#   - hersalud_features_only.csv (your normalized features + extra fields)
#   - nationwidechildrens.org_clinical_patient_cesc.txt (for vital_status / survival times)
# Output:
#   - clinical_only_training_survival.csv

import pandas as pd
import numpy as np

# Feature table read by main() (must contain a patient_id column).
FEATURES_CSV = "hersalud_features_only.csv"
# Tab-separated patient table supplying vital_status and the day counts.
PATIENT_TXT  = "nationwidechildrens.org_clinical_patient_cesc.txt"
# Survival-labeled training table written by main().
OUT_CSV      = "clinical_only_training_survival.csv"

def clean_str(x) -> str:
    """Strip ``x`` to text; nulls, blanks and placeholder tokens become ``""``."""
    blank_like = {"", "na", "n/a", "not available", "[not available]"}
    text = "" if pd.isna(x) else str(x).strip()
    if text.lower() in blank_like:
        return ""
    return text

def norm_vital_status(x: str) -> str:
    """Reduce a vital-status value to ``"dead"``, ``"alive"`` or ``""``.

    Matching is by substring on the cleaned, lower-cased text.
    """
    text = clean_str(x).lower()
    # "dead" is tested first, so it wins if both words appear
    for status in ("dead", "alive"):
        if status in text:
            return status
    return ""

def main():
    """Attach a vital-status label (dead=1, alive=0) to the features and save.

    Also carries ``death_days_to`` and ``last_contact_days_to`` over as
    numeric columns. Rows that end up without a label are dropped.

    Raises:
        ValueError: if the features file lacks ``patient_id`` or the patient
            file lacks a required column.
    """
    # 1) Features produced earlier (normalized + extra fields)
    feats = pd.read_csv(FEATURES_CSV, low_memory=False)
    if "patient_id" not in feats.columns:
        raise ValueError("hersalud_features_only.csv must contain patient_id")

    # 2) Patient file, skipping rows 1 and 2 (duplicate header + CDE row)
    read_options = dict(sep="\t", header=0, skiprows=[1, 2], dtype=str, low_memory=False)
    pat = pd.read_csv(PATIENT_TXT, **read_options)

    needed = ["bcr_patient_barcode", "vital_status", "death_days_to", "last_contact_days_to"]
    missing = [name for name in needed if name not in pat.columns]
    if missing:
        raise ValueError(f"Missing required columns in patient file: {missing}")

    pat_small = pat[needed].copy()
    pat_small["patient_id"] = pat_small["bcr_patient_barcode"].astype(str)

    # 3) Survival label: dead -> 1, alive -> 0, anything else -> NaN
    status = pat_small["vital_status"].apply(norm_vital_status)
    pat_small["label_survival"] = np.select(
        [status == "dead", status == "alive"], [1, 0], default=np.nan
    )

    # Day counts as numbers (unparseable text becomes NaN)
    for days_col in ("death_days_to", "last_contact_days_to"):
        pat_small[days_col] = pd.to_numeric(pat_small[days_col], errors="coerce")

    # 4) Left-merge so every feature row is kept
    carry = ["patient_id", "label_survival", "death_days_to", "last_contact_days_to"]
    merged = feats.merge(pat_small[carry], on="patient_id", how="left")

    # 5) Survival label takes precedence over any existing placeholder label
    merged["label"] = pd.to_numeric(merged.get("label", np.nan), errors="coerce")
    merged["label_survival"] = pd.to_numeric(merged["label_survival"], errors="coerce")
    merged["label"] = merged["label_survival"].combine_first(merged["label"])
    merged = merged.drop(columns=["label_survival"])

    # 6) Keep labeled rows only
    before = len(merged)
    train = merged.dropna(subset=["label"]).copy()
    train["label"] = train["label"].astype(int)
    after = len(train)

    # 7) Save and report
    train.to_csv(OUT_CSV, index=False)

    print("Done.")
    print(f"Patients in features file: {feats['patient_id'].nunique()}")
    print(f"Patients in patient file:  {pat_small['patient_id'].nunique()}")
    print(f"Rows before drop unlabeled: {before}")
    print(f"Rows after drop unlabeled:  {after}")
    print("Label distribution:")
    print(train["label"].value_counts())

if __name__ == "__main__":
    main()


Done.
Patients in features file: 307
Patients in patient file:  307
Rows before drop unlabeled: 307
Rows after drop unlabeled:  307
Label distribution:
label
0    247
1     60
Name: count, dtype: int64


In [None]:
import json
from dataclasses import dataclass
from typing import Dict, List, Tuple, Any

# Scoring definition loaded by main(); score_user() reads its
# "trained_features", "rules" and "risk_buckets" keys.
SCORING_JSON_PATH = "hersalud_scoring.json"  # produced by build_scoring_json.py


# -----------------------------
# 1) Hardcoded user inputs
# -----------------------------
# Example user scored by main(); values outside the listed choices are
# treated as "unknown" by norm_choice().
USER_INPUT = {
    "age_years": 34,
    "hpv_status": "positive",         # positive|negative|unknown
    "ever_smoked": "no",              # yes|no|unknown
    "immunosuppressed": "unknown",    # yes|no|unknown
}


# -----------------------------
# 2) Feature engineering (must match build_scoring_json.py)
# -----------------------------
def bucket_age(age_years: Any) -> str:
    """Return the age-bucket feature name for ``age_years``.

    Args:
        age_years: Age as a number or numeric string; may be None.

    Returns:
        One of ``"age_<25"``, ``"age_25_34"``, ``"age_35_44"``,
        ``"age_45_54"``, ``"age_55_plus"``, or ``"age_unknown"`` when the
        value is None, NaN, or cannot be converted to a float.
    """
    if age_years is None:
        return "age_unknown"
    try:
        a = float(age_years)
    except (TypeError, ValueError, OverflowError):
        return "age_unknown"
    # NaN compares False against every threshold and would otherwise fall
    # through to the oldest bucket; NaN is the only value unequal to itself.
    if a != a:
        return "age_unknown"

    if a < 25:
        return "age_<25"
    if a < 35:
        return "age_25_34"
    if a < 45:
        return "age_35_44"
    if a < 55:
        return "age_45_54"
    return "age_55_plus"


def norm_choice(v: Any, allowed: List[str]) -> str:
    """Return the stripped, lower-cased ``v`` if it is in ``allowed``, else ``"unknown"``."""
    if v is not None:
        candidate = str(v).strip().lower()
        if candidate in allowed:
            return candidate
    return "unknown"


def build_onehot_features(user: Dict[str, Any], trained_features: List[str]) -> Dict[str, int]:
    """
    Build a {feature_name: 0/1} dict keyed by exactly ``trained_features``.

    One feature name is derived per input (age bucket, hpv_*, smoke_*,
    immuno_*); a derived name is set to 1 only if it is one of the trained
    features, otherwise it is ignored.
    """
    onehot = dict.fromkeys(trained_features, 0)

    # (feature prefix, key in ``user``, accepted values)
    categorical = (
        ("hpv", "hpv_status", ["positive", "negative", "unknown"]),
        ("smoke", "ever_smoked", ["yes", "no", "unknown"]),
        ("immuno", "immunosuppressed", ["yes", "no", "unknown"]),
    )

    active = [bucket_age(user.get("age_years"))]
    for prefix, field, options in categorical:
        active.append(f"{prefix}_{norm_choice(user.get(field), options)}")

    for name in active:
        if name in onehot:
            onehot[name] = 1

    return onehot


# -----------------------------
# 3) Score + explain
# -----------------------------
@dataclass
class ScoreResult:
    """Result of score_user(): the point total, its bucket and the explanation."""

    # Sum of the points of every active feature.
    total_points: int
    # Name of the matching risk bucket ("Unknown" when no bucket matches).
    bucket: str
    # (min_points, max_points) of the matching bucket.
    bucket_range: Tuple[int, int]
    # Active features with their points, largest absolute value first.
    contributions: List[Tuple[str, int]]  # (feature, points)
    # Full one-hot dict ({feature_name: 0/1}) the score was computed from.
    active_features: Dict[str, int]


def choose_bucket(points: int, buckets: List[Dict[str, Any]]) -> Tuple[str, Tuple[int, int]]:
    """Return (name, (min_points, max_points)) of the first bucket containing ``points``.

    Bounds are inclusive. When no bucket matches, the fallback
    ``("Unknown", (-999999, 999999))`` is returned.
    """
    match = next(
        (b for b in buckets if b["min_points"] <= points <= b["max_points"]),
        None,
    )
    if match is None:
        return "Unknown", (-999999, 999999)
    return match["name"], (match["min_points"], match["max_points"])


def score_user(user: Dict[str, Any], scoring: Dict[str, Any]) -> ScoreResult:
    """Score ``user`` against the ``scoring`` definition.

    Reads ``trained_features``, ``rules`` and ``risk_buckets`` from
    ``scoring``. Each active (value 1) one-hot feature contributes its rule
    points (0 when the feature has no rule).
    """
    trained_features = scoring["trained_features"]
    rules: Dict[str, int] = scoring["rules"]
    buckets = scoring["risk_buckets"]

    onehot = build_onehot_features(user, trained_features)

    contributions = [
        (name, int(rules.get(name, 0)))
        for name, flag in onehot.items()
        if flag == 1
    ]
    total = sum(points for _, points in contributions)

    # Largest absolute impact first (the sort is stable for ties)
    contributions.sort(key=lambda pair: abs(pair[1]), reverse=True)

    bucket_name, bucket_range = choose_bucket(total, buckets)

    return ScoreResult(
        total_points=total,
        bucket=bucket_name,
        bucket_range=bucket_range,
        contributions=contributions,
        active_features=onehot,
    )


# -----------------------------
# 4) Pretty printing
# -----------------------------
def humanize_feature_name(f: str) -> str:
    """Return a display label for feature ``f``; unknown names are returned unchanged."""
    labels = dict((
        ("age_<25", "Age < 25"),
        ("age_25_34", "Age 25–34"),
        ("age_35_44", "Age 35–44"),
        ("age_45_54", "Age 45–54"),
        ("age_55_plus", "Age ≥ 55"),
        ("age_unknown", "Age unknown"),
        ("hpv_positive", "HPV positive"),
        ("hpv_negative", "HPV negative"),
        ("hpv_unknown", "HPV status unknown"),
        ("smoke_yes", "Ever smoked: yes"),
        ("smoke_no", "Ever smoked: no"),
        ("smoke_unknown", "Smoking history unknown"),
        ("immuno_yes", "Immunosuppressed: yes"),
        ("immuno_no", "Immunosuppressed: no"),
        ("immuno_unknown", "Immunosuppression unknown"),
    ))
    if f in labels:
        return labels[f]
    return f


def main():
    """Load the scoring JSON, score USER_INPUT and print the result with its explanation."""
    with open(SCORING_JSON_PATH, "r", encoding="utf-8") as handle:
        scoring = json.load(handle)

    result = score_user(USER_INPUT, scoring)

    print("\n=== Hardcoded User Input ===")
    for field, value in USER_INPUT.items():
        print(f"- {field}: {value}")

    print("\n=== Score ===")
    lower, upper = result.bucket_range
    print(f"Total points: {result.total_points}")
    print(f"Risk bucket:  {result.bucket}  (range {lower} to {upper})")

    print("\n=== Why (feature contributions) ===")
    if result.contributions:
        for feature, points in result.contributions:
            # "+d" prints an explicit "+" for zero and positive point values
            print(f"- {humanize_feature_name(feature)}: {points:+d} points")
    else:
        print("(No active features matched the scoring JSON. Check trained_features names.)")

if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'hersalud_scoring.json'

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# Labeled training table read into `df` below.
CSV_PATH = "clinical_only_training_survival.csv"

# -----------------------------
# 1) Hardcoded user input
# -----------------------------
# Example user encoded with one_hot() and scored by the fitted model.
# one_hot() compares these strings exactly, so they must use the listed spellings.
USER_INPUT = {
    "age_years": 54,
    "hpv_status": "positive",       # positive|negative|unknown
    "ever_smoked": "yes",            # yes|no|unknown
    "immunosuppressed": "yes",  # yes|no|unknown
}

# -----------------------------
# 2) Feature engineering (must match training)
# -----------------------------
def bucket_age(age):
    """Return the age-bucket feature name for a numeric ``age`` (null -> "age_unknown")."""
    if pd.isna(age):
        return "age_unknown"
    # Exclusive upper bounds, checked in ascending order
    upper_bounds = (
        (25, "age_<25"),
        (35, "age_25_34"),
        (45, "age_35_44"),
        (55, "age_45_54"),
    )
    for upper, name in upper_bounds:
        if age < upper:
            return name
    return "age_55_plus"

def one_hot(df):
    """One-hot encode age bucket, HPV, smoking and immunosuppression columns.

    Returns a 0/1 DataFrame on ``df``'s index with columns in the order:
    six age buckets, hpv_*, smoke_*, immuno_*. Categorical values are
    compared exactly against the listed levels.
    """
    X = pd.DataFrame(index=df.index)

    # Age
    buckets = df["age_years"].apply(bucket_age)
    age_names = ["age_<25", "age_25_34", "age_35_44", "age_45_54", "age_55_plus", "age_unknown"]
    for name in age_names:
        X[name] = (buckets == name).astype(int)

    # HPV, smoking, immuno: (feature prefix, source column, levels)
    categorical = (
        ("hpv", "hpv_status", ["positive", "negative", "unknown"]),
        ("smoke", "ever_smoked", ["yes", "no", "unknown"]),
        ("immuno", "immunosuppressed", ["yes", "no", "unknown"]),
    )
    for prefix, column, levels in categorical:
        for level in levels:
            X[f"{prefix}_{level}"] = (df[column] == level).astype(int)

    return X

# -----------------------------
# 3) Load + train
# -----------------------------
# Fit a logistic regression on the one-hot features of the labeled table.
df = pd.read_csv(CSV_PATH)

X = one_hot(df)
y = df["label"].astype(int)

model = LogisticRegression(max_iter=2000)
model.fit(X, y)

# -----------------------------
# 4) Encode user input
# -----------------------------
user_df = pd.DataFrame([USER_INPUT])
user_X = one_hot(user_df)
# Align the user row to the training column order.
user_X = user_X.reindex(columns=X.columns, fill_value=0)

# -----------------------------
# 5) Predict
# -----------------------------
# Column 1 of predict_proba is the probability of label 1.
prob = model.predict_proba(user_X)[0,1]

# Simple risk buckets (you can tune later)
if prob < 0.3:
    bucket = "Low"
elif prob < 0.6:
    bucket = "Moderate"
else:
    bucket = "High"

# -----------------------------
# 6) Explanation (coeff × value)
# -----------------------------
# Per-feature contribution = coefficient * one-hot value, largest magnitude first.
coefs = pd.Series(model.coef_[0], index=X.columns)
contribs = (coefs * user_X.iloc[0]).sort_values(key=abs, ascending=False)

print("\n=== Simulated HerSalud Run ===")
print("User input:", USER_INPUT)
print(f"Predicted risk probability: {prob:.3f}")
print("Risk bucket:", bucket)

print("\nTop contributing factors:")
# Only nonzero contributions are listed, so an active feature whose
# coefficient is exactly 0 does not appear.
for feat, val in contribs.items():
    if val != 0:
        sign = "+" if val > 0 else ""
        print(f"  {feat}: {sign}{val:.3f}")



=== Simulated HerSalud Run ===
User input: {'age_years': 54, 'hpv_status': 'positive', 'ever_smoked': 'yes', 'immunosuppressed': 'yes'}
Predicted risk probability: 0.167
Risk bucket: Low

Top contributing factors:
  smoke_yes: -0.191
  age_45_54: -0.032


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# Labeled training table read into `df` below.
CSV_PATH = "clinical_only_training_survival.csv"

# ---- Hardcoded user ----
# Example user scored below. norm_yes_unknown() recognises only "yes"-like
# values, so "no" is treated the same as "unknown" (smoke_yes = 0).
USER = {
    "age_years": 56,
    "ever_smoked": "no",   # effectively yes|unknown: anything not yes-like counts as unknown
}

def norm_yes_unknown(v):
    """Return ``"yes"`` for a yes-like value, ``"unknown"`` for anything else (including "no")."""
    affirmative = ("yes", "y", "1", "true")
    if str(v).strip().lower() in affirmative:
        return "yes"
    return "unknown"

# 1) load
df = pd.read_csv(CSV_PATH, low_memory=False)
df["age_years"] = pd.to_numeric(df["age_years"], errors="coerce")
# Lower-case the text form; the string "nan" (from missing values) becomes "unknown".
df["ever_smoked"] = df["ever_smoked"].astype(str).str.strip().str.lower().replace({"nan": "unknown"})
# Collapses every non-yes value (including "no") into "unknown".
df["ever_smoked"] = df["ever_smoked"].apply(norm_yes_unknown)

y = df["label"].astype(int)

# 2) build a simple, stable feature set (avoid dummy-trap)
# Missing ages are imputed with the median age; smoking is a single 0/1 flag.
X = pd.DataFrame({
    "age_years": df["age_years"].fillna(df["age_years"].median()),
    "smoke_yes": (df["ever_smoked"] == "yes").astype(int),
})

# 3) train
model = LogisticRegression(max_iter=2000)
model.fit(X, y)

# 4) derive bucket thresholds from model score distribution
# Quantiles are taken over the predicted probabilities of the training rows.
p_all = model.predict_proba(X)[:, 1]
q20, q80 = np.quantile(p_all, [0.20, 0.80])  # low=bottom 20%, high=top 20%

def bucket(p):
    """Bucket probability ``p`` using the module-level ``q20``/``q80`` cut-offs.

    Below ``q20`` is "Low", below ``q80`` is "Moderate", anything else "High".
    """
    for label, upper in (("Low", q20), ("Moderate", q80)):
        if p < upper:
            return label
    return "High"

# 5) score hardcoded user
# Build the user row with the same two columns the model was trained on.
user_X = pd.DataFrame([{
    "age_years": float(USER["age_years"]),
    "smoke_yes": 1 if norm_yes_unknown(USER["ever_smoked"]) == "yes" else 0,
}])
p = model.predict_proba(user_X)[0, 1]

# 6) explanation (linear contributions)
# Contribution = coefficient * feature value, largest magnitude first.
coefs = pd.Series(model.coef_[0], index=X.columns)
contrib = (coefs * user_X.iloc[0]).sort_values(key=abs, ascending=False)

print("=== Simulated HerSalud Run ===")
print("User input:", USER)
print(f"Predicted probability (label=1): {p:.3f}")
print(f"Buckets (data-driven): Low < {q20:.3f}, High ≥ {q80:.3f}")
print("Risk bucket:", bucket(p))

print("\nContributions (coef * value):")
for name, val in contrib.items():
    # A negative coefficient times 0 is IEEE negative zero, which used to
    # print as "+-0.000". Adding 0.0 turns -0.0 into +0.0; the "+" format
    # flag then emits exactly one sign for every value.
    print(f"  {name}: {val + 0.0:+.3f}")


=== Simulated HerSalud Run ===
User input: {'age_years': 56, 'ever_smoked': 'no'}
Predicted probability (label=1): 0.251
Buckets (data-driven): Low < 0.145, High ≥ 0.231
Risk bucket: High

Contributions (coef * value):
  age_years: +1.244
  smoke_yes: +-0.000


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# Labeled training table read into `df` below.
CSV_PATH = "clinical_only_training_survival.csv"

def bucket_age(a):
    """Return the age-bucket feature name for ``a`` (null -> "age_unknown").

    ``a`` is converted with ``float`` before being compared to the bounds.
    """
    if pd.isna(a):
        return "age_unknown"
    years = float(a)
    # Exclusive upper bounds, checked in ascending order
    upper_bounds = (
        (25, "age_<25"),
        (35, "age_25_34"),
        (45, "age_35_44"),
        (55, "age_45_54"),
        (65, "age_55_64"),
    )
    for upper, name in upper_bounds:
        if years < upper:
            return name
    return "age_65_plus"

def norm_smoke(v):
    """Normalize a smoking answer to ``"yes"``, ``"no"`` or ``"unknown"``."""
    lookup = {
        "yes": "yes", "y": "yes", "1": "yes", "true": "yes",
        "no": "no", "n": "no", "0": "no", "false": "no",
    }
    return lookup.get(str(v).strip().lower(), "unknown")

def make_X(df):
    """Build the 0/1 design matrix from ``age_years`` and ``ever_smoked``.

    The baseline levels ``age_35_44`` and ``smoke_no`` get no column, so a row
    at baseline is all zeros.
    """
    design = pd.DataFrame(index=df.index)

    age_labels = df["age_years"].apply(bucket_age)
    smoke_labels = df["ever_smoked"].apply(norm_smoke)

    # Every age bucket except the baseline age_35_44
    age_columns = ["age_<25", "age_25_34", "age_45_54", "age_55_64", "age_65_plus", "age_unknown"]
    for column in age_columns:
        design[column] = (age_labels == column).astype(int)

    # Every smoking level except the baseline "no"
    for level in ("yes", "unknown"):
        design[f"smoke_{level}"] = (smoke_labels == level).astype(int)

    return design

def explain(feat):
    """Return a display label for feature ``feat``; unknown names are returned unchanged."""
    labels = dict((
        ("age_<25", "Age < 25"),
        ("age_25_34", "Age 25–34"),
        ("age_35_44", "Age 35–44"),
        ("age_45_54", "Age 45–54"),
        ("age_55_64", "Age 55–64"),
        ("age_65_plus", "Age ≥ 65"),
        ("age_unknown", "Age unknown"),
        ("smoke_yes", "Ever smoked: yes"),
        ("smoke_no", "Ever smoked: no"),
        ("smoke_unknown", "Smoking history unknown"),
    ))
    if feat in labels:
        return labels[feat]
    return feat

# Train
# Fit a logistic regression on the age-bucket / smoking design matrix.
df = pd.read_csv(CSV_PATH, low_memory=False)
df["age_years"] = pd.to_numeric(df["age_years"], errors="coerce")
y = df["label"].astype(int)

X = make_X(df)

model = LogisticRegression(max_iter=2000)
model.fit(X, y)

# Bucket thresholds from distribution
# 20th / 80th percentiles of the predicted probabilities of the training rows.
p_all = model.predict_proba(X)[:, 1]
q20, q80 = np.quantile(p_all, [0.20, 0.80])

def bucket(p):
    """Bucket probability ``p`` using the module-level ``q20``/``q80`` cut-offs.

    Below ``q20`` is "Low", below ``q80`` is "Moderate", anything else "High".
    """
    for label, upper in (("Low", q20), ("Moderate", q80)):
        if p < upper:
            return label
    return "High"

def run_user(age_years, ever_smoked):
    """Score one user with the fitted ``model`` and print the result.

    Prints the predicted probability of label 1, its risk bucket and every
    nonzero contribution (coefficient * feature value), largest magnitude
    first.
    """
    inputs = {"age_years": age_years, "ever_smoked": ever_smoked}
    # Encode like the training data and align to the training column order
    encoded = make_X(pd.DataFrame([inputs])).reindex(columns=X.columns, fill_value=0)

    prob = model.predict_proba(encoded)[0, 1]

    weights = pd.Series(model.coef_[0], index=X.columns)
    ranked = (weights * encoded.iloc[0]).sort_values(key=abs, ascending=False)

    print("\n=== Simulated HerSalud Run ===")
    print("User input:", inputs)
    print(f"Predicted probability (label=1): {prob:.3f}")
    print(f"Buckets (data-driven): Low < {q20:.3f}, High ≥ {q80:.3f}")
    print("Risk bucket:", bucket(prob))

    print("\nTop contributing factors:")
    for feat, val in ranked.items():
        if val == 0:
            continue
        sign = "+" if val >= 0 else ""
        print(f"  {explain(feat)}: {sign}{val:.3f}")

# Override smoking effect for demo/deployment safety.
# run_user() predicts with `model` and rebuilds its coefficient Series from
# model.coef_ on every call, so the override has to be written into the fitted
# model itself; zeroing an entry of a detached Series changes nothing.
if "smoke_yes" in X.columns:
    model.coef_[0, X.columns.get_loc("smoke_yes")] = 0.0

# Snapshot of the (overridden) coefficients, kept for inspection.
coefs = pd.Series(model.coef_[0], index=X.columns)

# Try a few
run_user(34, "yes")
run_user(64, "yes")
run_user(76, "yes")
run_user(76, "no")



=== Simulated HerSalud Run ===
User input: {'age_years': 34, 'ever_smoked': 'yes'}
Predicted probability (label=1): 0.107
Buckets (data-driven): Low < 0.145, High ≥ 0.224
Risk bucket: Low

Top contributing factors:
  Age 25–34: -0.528

=== Simulated HerSalud Run ===
User input: {'age_years': 64, 'ever_smoked': 'yes'}
Predicted probability (label=1): 0.175
Buckets (data-driven): Low < 0.145, High ≥ 0.224
Risk bucket: Moderate

Top contributing factors:
  Age 55–64: +0.042

=== Simulated HerSalud Run ===
User input: {'age_years': 76, 'ever_smoked': 'yes'}
Predicted probability (label=1): 0.335
Buckets (data-driven): Low < 0.145, High ≥ 0.224
Risk bucket: High

Top contributing factors:
  Age ≥ 65: +0.910

=== Simulated HerSalud Run ===
User input: {'age_years': 76, 'ever_smoked': 'no'}
Predicted probability (label=1): 0.335
Buckets (data-driven): Low < 0.145, High ≥ 0.224
Risk bucket: High

Top contributing factors:
  Age ≥ 65: +0.910


In [None]:
# Dump the fitted intercept and one coefficient per design-matrix column.
# Relies on `model` and `X` still being defined from the training cell.
print("Intercept:", model.intercept_[0])
for name, coef in zip(X.columns, model.coef_[0]):
    print(name, coef)


Intercept: -1.5947042833574547
age_<25 0.8709860932903981
age_25_34 -0.5280550619294986
age_45_54 0.16559527730715695
age_55_64 0.041937330172220066
age_65_plus 0.9099480412922175
age_unknown 0.0
smoke_yes 0.0
smoke_unknown 0.1860335355470585


In [None]:
import math

# ---- Coefficients you printed ----
# Hard-coded copy of the intercept and coefficients printed by the previous
# cell. score_user() uses them only for the reference probability; the points
# come from POINTS.
INTERCEPT = -1.5947042833574547
COEFS = {
    "age_<25": 0.8709860932903981,
    "age_25_34": -0.5280550619294986,
    "age_45_54": 0.16559527730715695,
    "age_55_64": 0.041937330172220066,
    "age_65_plus": 0.9099480412922175,
    "age_unknown": 0.0,
    "smoke_yes": 0.0,
    "smoke_unknown": 0.1860335355470585,
}

# ---- MVP safety constraints (recommended) ----
# 1) Don't allow clinically weird directions in MVP
#    (TCGA survival artifact: age_<25 positive)
# NOTE(review): this set is not referenced anywhere else in the visible code;
# the same features are zeroed by explicit POINTS assignments below.
CLAMP_TO_ZERO = {"age_<25", "smoke_unknown", "smoke_yes"}  # treat smoking as guidance-only

# 2) Convert coefs -> points (key fields are overridden afterwards for the MVP)
POINT_SCALE = 0.25


def coef_to_points(c):
    """Divide coefficient ``c`` by POINT_SCALE and round to a whole number of points."""
    scaled = c / POINT_SCALE
    return int(round(scaled))

# Points obtained directly from the coefficients, before any manual override.
RAW_POINTS = {k: coef_to_points(v) for k, v in COEFS.items()}

# --- HARD OVERRIDES for MVP safety ---
POINTS = RAW_POINTS.copy()

# Age overrides (MVP locked)
POINTS["age_<25"] = 0
POINTS["age_25_34"] = 0
POINTS["age_45_54"] = 2
POINTS["age_55_64"] = 3
POINTS["age_65_plus"] = 4
POINTS["age_unknown"] = 0  # optional

# Smoking guidance-only
POINTS["smoke_yes"] = 0
POINTS["smoke_unknown"] = 0

# Printed only after every override so the map shown is the one actually used
# for scoring (previously smoke_unknown was still shown as 1).
print("FINAL POINTS MAP:", POINTS)

# ---- Risk buckets (points) ----
# Points at or below 1 are Low, at or below 3 are Moderate, everything above is High.
def bucket(points):
    """Return the risk bucket ("Low", "Moderate" or "High") for a point total."""
    for ceiling, label in ((1, "Low"), (3, "Moderate")):
        if points <= ceiling:
            return label
    return "High"

def sigmoid(z):
    """Return the logistic function 1 / (1 + e**-z) without overflowing.

    For negative ``z`` the algebraically equal form e**z / (1 + e**z) is
    used, so ``math.exp`` is only ever called with a non-positive argument and
    cannot raise OverflowError for large-magnitude inputs.
    """
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    e = math.exp(z)
    return e / (1 + e)

def bucket_age(age):
    """Return the age-bucket feature name for ``age``.

    Args:
        age: Age as a number or numeric string; may be None.

    Returns:
        ``"age_unknown"`` for None or NaN, otherwise one of the bucket names
        below. ``"age_35_44"`` is the baseline bucket (not in COEFS).

    Raises:
        ValueError / TypeError: if ``age`` cannot be converted with ``float``.
    """
    if age is None:
        return "age_unknown"
    a = float(age)
    # NaN compares False against every bound and would otherwise land in the
    # oldest (highest-scoring) bucket; NaN is the only value unequal to itself.
    if a != a:
        return "age_unknown"
    if a < 25:
        return "age_<25"
    if a < 35:
        return "age_25_34"
    if a < 45:
        return "age_35_44"  # baseline (not in COEFS)
    if a < 55:
        return "age_45_54"
    if a < 65:
        return "age_55_64"
    return "age_65_plus"

def norm_smoke(v):
    """Normalize a smoking answer to ``"yes"``, ``"no"`` or ``"unknown"``."""
    token = str(v).strip().lower()
    for answer, spellings in (
        ("yes", ("yes", "y", "1", "true")),
        ("no", ("no", "n", "0", "false")),
    ):
        if token in spellings:
            return answer
    return "unknown"

def score_user(age_years, ever_smoked):
    """Score one user from age and smoking history.

    Returns a 4-tuple ``(total_points, bucket_label, p, contrib)``:
    total_points sums POINTS over the active features, bucket_label is
    ``bucket(total_points)``, p is the sigmoid of INTERCEPT plus the original
    COEFS of the active features, and contrib is a list of
    ``(feature, points, coef)`` sorted by descending absolute coefficient.
    """
    active = []

    # Age: the baseline band (age_35_44) has no coefficient and is skipped.
    age_key = bucket_age(age_years)
    if age_key in COEFS:
        active.append(age_key)

    # Smoking: "no" is the baseline and contributes no feature.
    smoke_key = {"yes": "smoke_yes", "unknown": "smoke_unknown"}.get(
        norm_smoke(ever_smoked)
    )
    if smoke_key is not None:
        active.append(smoke_key)

    # Single pass: accumulate points, the logit, and the explanation rows.
    total_points = 0
    logit = INTERCEPT
    rows = []
    for feat in active:
        feat_points = POINTS.get(feat, 0)
        feat_coef = COEFS.get(feat, 0.0)
        total_points += feat_points
        logit += feat_coef
        rows.append((feat, feat_points, feat_coef))

    # Largest-magnitude coefficients first.
    contrib = sorted(rows, key=lambda row: abs(row[2]), reverse=True)

    return total_points, bucket(total_points), sigmoid(logit), contrib

def human(f):
    """Return the display label for feature key *f*, or *f* itself if unmapped."""
    labels = {
        "age_<25": "Age < 25",
        "age_25_34": "Age 25–34",
        "age_45_54": "Age 45–54",
        "age_55_64": "Age 55–64",
        "age_65_plus": "Age ≥ 65",
        "age_unknown": "Age unknown",
        "smoke_yes": "Ever smoked: yes",
        "smoke_unknown": "Smoking history unknown",
    }
    return labels[f] if f in labels else f

if __name__ == "__main__":
    # Demo inputs as (age_years, ever_smoked) pairs.
    tests = [(34, "yes"), (64, "yes"), (76, "yes"), (76, "no"), (22, "no"), (50, "unknown")]

    print("POINTS MAP (MVP):", POINTS)

    for age, smoke in tests:
        pts, b, p, contrib = score_user(age, smoke)

        # Header and the raw inputs being scored.
        print("\n=== HerSalud Points Simulation ===")
        print(dict(age_years=age, ever_smoked=smoke))

        # Deployed points result, then the reference probability.
        print(f"Total points: {pts}  -> {b}")
        print(f"(Reference logistic probability, not deployed): {p:.3f}")

        # Per-feature breakdown in the order returned by score_user.
        print("Why:")
        for f, fp, fc in contrib:
            print(f" - {human(f)}: {fp} pts  (coef {fc:+.3f})")




FINAL POINTS MAP: {'age_<25': 0, 'age_25_34': 0, 'age_45_54': 2, 'age_55_64': 3, 'age_65_plus': 4, 'age_unknown': 0, 'smoke_yes': 0, 'smoke_unknown': 1}
POINTS MAP (MVP): {'age_<25': 0, 'age_25_34': 0, 'age_45_54': 2, 'age_55_64': 3, 'age_65_plus': 4, 'age_unknown': 0, 'smoke_yes': 0, 'smoke_unknown': 0}

=== HerSalud Points Simulation ===
{'age_years': 34, 'ever_smoked': 'yes'}
Total points: 0  -> Low
(Reference logistic probability, not deployed): 0.107
Why:
 - Age 25–34: 0 pts  (coef -0.528)
 - Ever smoked: yes: 0 pts  (coef +0.000)

=== HerSalud Points Simulation ===
{'age_years': 64, 'ever_smoked': 'yes'}
Total points: 3  -> Moderate
(Reference logistic probability, not deployed): 0.175
Why:
 - Age 55–64: 3 pts  (coef +0.042)
 - Ever smoked: yes: 0 pts  (coef +0.000)

=== HerSalud Points Simulation ===
{'age_years': 76, 'ever_smoked': 'yes'}
Total points: 4  -> High
(Reference logistic probability, not deployed): 0.335
Why:
 - Age ≥ 65: 4 pts  (coef +0.910)
 - Ever smoked: yes: 0 

In [None]:
import math

# -----------------------------
# Frozen logistic model params (from your trained run)
# -----------------------------
INTERCEPT = -1.5947042833574547
COEFS = {
    "age_<25": 0.8709860932903981,
    "age_25_34": -0.5280550619294986,
    "age_45_54": 0.16559527730715695,
    "age_55_64": 0.041937330172220066,
    "age_65_plus": 0.9099480412922175,
    "age_unknown": 0.0,
    "smoke_yes": 0.0,
    "smoke_unknown": 0.1860335355470585,
}

# -----------------------------
# MVP points map (single source of truth)
# -----------------------------
POINTS = {
    "age_<25": 0,        # clamp TCGA artifact
    "age_25_34": 0,
    "age_45_54": 2,
    "age_55_64": 3,
    "age_65_plus": 4,
    "age_unknown": 0,
    "smoke_yes": 0,      # guidance-only
    "smoke_unknown": 0,  # don't penalize missingness
}

# Coefficients behind the debug probability: the trained values with the
# clamped features zeroed, so the debug number agrees with the MVP points
# logic (no confusion for age < 25).
MVP_COEFS = {
    **COEFS,
    "age_<25": 0.0,
    "smoke_yes": 0.0,
    "smoke_unknown": 0.0,
}

# When true, the demo also prints the reference probability.
DEBUG = True

print("POINTS MAP (MVP):", POINTS)

# -----------------------------
# Risk buckets (points)
# -----------------------------
def bucket(points: int) -> str:
    """Return the risk tier for a point total: <=1 Low, <=3 Moderate, else High."""
    return "Low" if points <= 1 else "Moderate" if points <= 3 else "High"

def next_steps(bucket_name: str) -> str:
    """Return the recommendation text for a risk tier.

    "Low" and "Moderate" have dedicated messages; any other value receives
    the strongest (prompt follow-up) recommendation.
    """
    if bucket_name == "Low":
        advice = "Keep up routine cervical cancer screening based on local guidelines."
    elif bucket_name == "Moderate":
        advice = "Consider scheduling a screening soon and reviewing risk factors with a clinician."
    else:
        advice = "Strongly consider prompt screening or medical follow-up, especially if symptoms are present."
    return advice

# -----------------------------
# Helpers
# -----------------------------
def sigmoid(z: float) -> float:
    """Return the logistic function 1 / (1 + e**-z) of *z*.

    Negative inputs use the equivalent form e**z / (1 + e**z) so math.exp is
    only ever called with a non-positive argument; the naive form raises
    OverflowError for very negative *z*.
    """
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)

def bucket_age(age):
    """Return the age-band feature name for *age* (in years).

    None, NaN, and values that cannot be converted to a float all map to
    "age_unknown". NaN needs its own check: every ``<`` comparison against
    NaN is false, so it would otherwise land in "age_65_plus".

    "age_35_44" is the baseline band (not in COEFS / not scored).
    """
    if age is None:
        return "age_unknown"
    try:
        a = float(age)
    except (TypeError, ValueError):
        # Non-numeric input is treated as missing rather than raising.
        return "age_unknown"
    if math.isnan(a):
        return "age_unknown"
    if a < 25:
        return "age_<25"
    if a < 35:
        return "age_25_34"
    if a < 45:
        return "age_35_44"  # baseline (not in COEFS / not scored)
    if a < 55:
        return "age_45_54"
    if a < 65:
        return "age_55_64"
    return "age_65_plus"

def norm_smoke(v):
    """Collapse a smoking answer into "yes", "no", or "unknown".

    Matching is done on the stripped, lower-cased string form of *v*.
    """
    answer = str(v).strip().lower()
    for label, tokens in (
        ("yes", ("yes", "y", "1", "true")),
        ("no", ("no", "n", "0", "false")),
    ):
        if answer in tokens:
            return label
    return "unknown"

def human(f: str) -> str:
    """Translate a feature key into its display label; unmapped keys pass through."""
    display_names = {
        "age_<25": "Age < 25",
        "age_25_34": "Age 25–34",
        "age_45_54": "Age 45–54",
        "age_55_64": "Age 55–64",
        "age_65_plus": "Age ≥ 65",
        "age_unknown": "Age unknown",
        "smoke_yes": "Ever smoked: yes",
        "smoke_unknown": "Smoking history unknown",
    }
    if f in display_names:
        return display_names[f]
    return f

# -----------------------------
# Scoring
# -----------------------------
def score_user(age_years, ever_smoked):
    """Score one user from age and smoking history.

    Returns ``(total_points, risk_bucket, p, reasons)``: total_points sums
    POINTS over the active features, risk_bucket is ``bucket(total_points)``,
    p is the debug probability computed from INTERCEPT and MVP_COEFS (not
    deployed), and reasons lists ``(feature, points)`` for the active
    features that contribute more than zero points.
    """
    active = []

    # Age: the baseline band (age_35_44) is absent from COEFS and is skipped.
    age_key = bucket_age(age_years)
    if age_key in COEFS:
        active.append(age_key)

    # Smoking: "no" is the baseline and adds nothing.
    smoke_answer = norm_smoke(ever_smoked)
    if smoke_answer in ("yes", "unknown"):
        active.append("smoke_" + smoke_answer)

    # Points scoring (deployed behavior).
    scored = [(feat, POINTS.get(feat, 0)) for feat in active]
    total_points = sum(feat_points for _, feat_points in scored)
    risk_bucket = bucket(total_points)

    # Debug probability that matches MVP constraints (NOT deployed).
    p = sigmoid(INTERCEPT + sum(MVP_COEFS.get(feat, 0.0) for feat in active))

    # Only positive-point features are surfaced as reasons, for clean UX.
    reasons = [pair for pair in scored if pair[1] > 0]

    return total_points, risk_bucket, p, reasons

# -----------------------------
# Demo runs (app-style output)
# -----------------------------
if __name__ == "__main__":
    # Demo inputs as (age_years, ever_smoked) pairs.
    tests = [(34, "yes"), (64, "yes"), (76, "yes"), (76, "no"), (22, "no"), (50, "unknown")]

    for age, smoke in tests:
        pts, b, p, reasons = score_user(age, smoke)

        # Report header and echo of the inputs.
        print("\n==============================")
        print("HerSalud Risk Assessment")
        print("==============================")
        print(f"Age: {age}")
        print(f"Smoking history: {smoke}\n")

        # Deployed result.
        print(f"Risk Level: {b}")
        print(f"Score: {pts} points\n")

        # Explanation: positive-point reasons, or a fallback line if none.
        print("Why this result:")
        if not reasons:
            print("• Based on the limited inputs provided, no elevated-risk factors were detected")
        else:
            for f, fp in reasons:
                print(f"• {human(f)} (+{fp})")

        print("\nRecommended next step:")
        print(next_steps(b))

        # Internal reference value, printed only when DEBUG is on.
        if DEBUG:
            print(f"\n[Debug] Reference probability (not shown to user): {p:.3f}")


POINTS MAP (MVP): {'age_<25': 0, 'age_25_34': 0, 'age_45_54': 2, 'age_55_64': 3, 'age_65_plus': 4, 'age_unknown': 0, 'smoke_yes': 0, 'smoke_unknown': 0}

HerSalud Risk Assessment
Age: 34
Smoking history: yes

Risk Level: Low
Score: 0 points

Why this result:
• Based on the limited inputs provided, no elevated-risk factors were detected

Recommended next step:
Keep up routine cervical cancer screening based on local guidelines.

[Debug] Reference probability (not shown to user): 0.107

HerSalud Risk Assessment
Age: 64
Smoking history: yes

Risk Level: Moderate
Score: 3 points

Why this result:
• Age 55–64 (+3)

Recommended next step:
Consider scheduling a screening soon and reviewing risk factors with a clinician.

[Debug] Reference probability (not shown to user): 0.175

HerSalud Risk Assessment
Age: 76
Smoking history: yes

Risk Level: High
Score: 4 points

Why this result:
• Age ≥ 65 (+4)

Recommended next step:
Strongly consider prompt screening or medical follow-up, especially if s

In [None]:
import math

# -----------------------------
# Frozen logistic model params (from your trained run)
# -----------------------------
INTERCEPT = -1.5947042833574547
COEFS = {
    "age_<25": 0.8709860932903981,
    "age_25_34": -0.5280550619294986,
    "age_45_54": 0.16559527730715695,
    "age_55_64": 0.041937330172220066,
    "age_65_plus": 0.9099480412922175,
    "age_unknown": 0.0,
    "smoke_yes": 0.0,
    "smoke_unknown": 0.1860335355470585,
}

# -----------------------------
# MVP points map (single source of truth)
# -----------------------------
POINTS = {
    "age_<25": 0,        # clamp TCGA artifact
    "age_25_34": 0,
    "age_45_54": 2,
    "age_55_64": 3,
    "age_65_plus": 4,
    "age_unknown": 0,
    "smoke_yes": 0,      # guidance-only
    "smoke_unknown": 0,  # don't penalize missingness
}

# Coefficients for the (optional, not deployed) reference probability: start
# from the trained values and zero the clamped features so it matches the
# MVP logic.
MVP_COEFS = dict(COEFS)
MVP_COEFS.update({"age_<25": 0.0, "smoke_yes": 0.0, "smoke_unknown": 0.0})

# Debug output is OFF: the demo does not print the reference probability.
DEBUG = False

print("POINTS MAP (MVP):", POINTS)

# -----------------------------
# Risk buckets (points)
# -----------------------------
def bucket(points: int) -> str:
    """Classify a point total as "Low" (<=1), "Moderate" (<=3), or "High"."""
    tier = "High"
    if points <= 1:
        tier = "Low"
    elif points <= 3:
        tier = "Moderate"
    return tier

def next_steps(bucket_name: str) -> str:
    """Return the recommended next step for a risk tier.

    Values other than "Low" and "Moderate" fall through to the prompt
    follow-up recommendation.
    """
    routine = "Keep up routine cervical cancer screening based on local guidelines."
    soon = "Consider scheduling a screening soon and reviewing risk factors with a clinician."
    prompt = "Strongly consider prompt screening or medical follow-up, especially if symptoms are present."

    if bucket_name == "Low":
        return routine
    return soon if bucket_name == "Moderate" else prompt

# -----------------------------
# Helpers
# -----------------------------
def sigmoid(z: float) -> float:
    """Return the logistic function 1 / (1 + e**-z) of *z*.

    The branch on the sign of *z* keeps the argument to math.exp
    non-positive, which avoids the OverflowError the single-expression form
    raises for very negative *z*.
    """
    if z >= 0:
        return 1 / (1 + math.exp(-z))
    exp_z = math.exp(z)
    return exp_z / (1 + exp_z)

def bucket_age(age):
    """Return the age-band feature name for *age* (in years).

    None, NaN, and values that cannot be converted to a float all map to
    "age_unknown". NaN is checked explicitly because it fails every ``<``
    comparison and would otherwise be returned as "age_65_plus".

    "age_35_44" is the baseline band (not in COEFS / not scored).
    """
    if age is None:
        return "age_unknown"
    try:
        a = float(age)
    except (TypeError, ValueError):
        # Unparseable input counts as a missing age.
        return "age_unknown"
    if math.isnan(a):
        return "age_unknown"
    if a < 25:
        return "age_<25"
    if a < 35:
        return "age_25_34"
    if a < 45:
        return "age_35_44"  # baseline (not in COEFS / not scored)
    if a < 55:
        return "age_45_54"
    if a < 65:
        return "age_55_64"
    return "age_65_plus"

def norm_smoke(v):
    """Normalize a smoking answer to "yes", "no", or "unknown".

    *v* is converted with str(), stripped, and lower-cased before it is
    compared against the accepted yes/no tokens.
    """
    cleaned = str(v).strip().lower()
    result = "unknown"
    if cleaned in {"yes", "y", "1", "true"}:
        result = "yes"
    elif cleaned in {"no", "n", "0", "false"}:
        result = "no"
    return result

def human(f: str) -> str:
    """Return the user-facing label for feature key *f* (or *f* unchanged if unknown)."""
    label_for = {
        "age_<25": "Age < 25",
        "age_25_34": "Age 25–34",
        "age_45_54": "Age 45–54",
        "age_55_64": "Age 55–64",
        "age_65_plus": "Age ≥ 65",
        "age_unknown": "Age unknown",
        "smoke_yes": "Ever smoked: yes",
        "smoke_unknown": "Smoking history unknown",
    }
    try:
        return label_for[f]
    except KeyError:
        # Unmapped keys are shown as-is.
        return f

# -----------------------------
# Scoring
# -----------------------------
def score_user(age_years, ever_smoked):
    """Score one user from age and smoking history.

    Returns ``(total_points, risk_bucket, p, reasons)``: total_points sums
    POINTS over the active features, risk_bucket is ``bucket(total_points)``,
    p is the internal reference probability computed from INTERCEPT and
    MVP_COEFS (not deployed), and reasons lists ``(feature, points)`` for the
    active features with a positive point value.
    """
    age_key = bucket_age(age_years)
    smoke_answer = norm_smoke(ever_smoked)

    # Baselines (age_35_44, smoke "no") produce no active feature.
    active = [age_key] if age_key in COEFS else []
    if smoke_answer == "yes":
        active += ["smoke_yes"]
    elif smoke_answer == "unknown":
        active += ["smoke_unknown"]

    # Points scoring (deployed behavior).
    points_by_feature = [(feat, POINTS.get(feat, 0)) for feat in active]
    total_points = sum(value for _, value in points_by_feature)
    risk_bucket = bucket(total_points)

    # Reference probability (kept internal; not deployed).
    z = INTERCEPT + sum(MVP_COEFS.get(feat, 0.0) for feat in active)
    p = sigmoid(z)

    # Reasons: only positive-point features, for clean UX.
    reasons = [(feat, value) for feat, value in points_by_feature if value > 0]

    return total_points, risk_bucket, p, reasons

# -----------------------------
# Demo runs (app-style output)
# -----------------------------
if __name__ == "__main__":
    # Demo inputs as (age_years, ever_smoked) pairs.
    tests = [(34, "yes"), (64, "yes"), (76, "yes"), (76, "no"), (22, "no"), (50, "unknown")]

    for age, smoke in tests:
        pts, b, p, reasons = score_user(age, smoke)

        # Report header and echo of the inputs.
        print("\n==============================")
        print("HerSalud Risk Assessment")
        print("==============================")
        print(f"Age: {age}")
        print(f"Smoking history: {smoke}\n")

        # Deployed result.
        print(f"Risk Level: {b}")
        print(f"Score: {pts} points\n")

        # Explanation: positive-point reasons, or a fallback line if none.
        print("Why this result:")
        if not reasons:
            print("• Based on the inputs provided, your clinical risk signals are low.")
        else:
            for f, fp in reasons:
                print(f"• {human(f)} (+{fp})")

        print("\nRecommended next step:")
        print(next_steps(b))

        # Internal reference value, printed only when DEBUG is on.
        if DEBUG:
            print(f"\n[Debug] Reference probability (not shown to user): {p:.3f}")


POINTS MAP (MVP): {'age_<25': 0, 'age_25_34': 0, 'age_45_54': 2, 'age_55_64': 3, 'age_65_plus': 4, 'age_unknown': 0, 'smoke_yes': 0, 'smoke_unknown': 0}

HerSalud Risk Assessment
Age: 34
Smoking history: yes

Risk Level: Low
Score: 0 points

Why this result:
• Based on the inputs provided, your clinical risk signals are low.

Recommended next step:
Keep up routine cervical cancer screening based on local guidelines.

HerSalud Risk Assessment
Age: 64
Smoking history: yes

Risk Level: Moderate
Score: 3 points

Why this result:
• Age 55–64 (+3)

Recommended next step:
Consider scheduling a screening soon and reviewing risk factors with a clinician.

HerSalud Risk Assessment
Age: 76
Smoking history: yes

Risk Level: High
Score: 4 points

Why this result:
• Age ≥ 65 (+4)

Recommended next step:
Strongly consider prompt screening or medical follow-up, especially if symptoms are present.

HerSalud Risk Assessment
Age: 76
Smoking history: no

Risk Level: High
Score: 4 points

Why this result:
