# ACC Survival Analysis: Univariate Cox Screening (Track A)

**Goal**: Screen candidate predictors via univariate Cox regression; keep variables with p < 0.05

**Cohort**: Track A (primary analysis)

**Endpoints**: OS and CSS

**Output**: Univariate table (HR, 95% CI, p-value) for each predictor

## 1. Setup and Import Libraries

In [27]:
import pandas as pd
import numpy as np
from pathlib import Path
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
import warnings
import json

warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_rows", 100)

print("Libraries loaded successfully!")

Libraries loaded successfully!


## 2. Load Track A Training Data

In [28]:
# Read the Track A training split from the processed-data directory
data_dir = Path("../data/processed")
train = pd.read_pickle(data_dir / "trackA_train.pkl")

print(f"Track A Training data loaded: {train.shape}")

# Report the event count and event rate for each endpoint
for endpoint, indicator in (("OS", "event_os"), ("CSS", "event_css")):
    events = train[indicator]
    print(f"  {endpoint} events: {events.sum()} ({events.mean()*100:.1f}%)")

print(f"\nColumns: {train.columns.tolist()}")

Track A Training data loaded: (926, 16)
  OS events: 384 (41.5%)
  CSS events: 246 (26.6%)

Columns: ['ID', 'age', 'sex', 'grade', 'radiotherapy', 'chemotherapy', 'tumor_number', 'race', 'marital_status', 'urban_rural', 'time_os', 'event_os', 'TNMstage', 'site', 'event_css', 'time_css']


In [29]:
# Candidate predictors for screening, kept in two groups: the staging
# variable is fitted separately from the non-staging ones.
non_staging_vars = [
    "age", "sex", "site", "grade",
    "radiotherapy", "chemotherapy", "tumor_number",
    "race", "marital_status", "urban_rural",
]

# Staging is represented by the combined TNMstage only, as in the reference
# R code (绘制诺模图代码.txt, the nomogram-plotting script), whose Cox model
# uses just TNMstage.
staging_vars = ["TNMstage"]

# Per-variable level count and missing count
print(f"Non-staging variables: {len(non_staging_vars)}")
for name in non_staging_vars:
    column = train[name]
    print(f"  {name}: {column.nunique()} levels, {column.isna().sum()} missing")

stage = train["TNMstage"]
print(f"\nStaging variable: {staging_vars}")
print(f"  TNMstage levels: {stage.dropna().unique().tolist()}")
print(f"  TNMstage missing: {stage.isna().sum()}")

Non-staging variables: 10
  age: 3 levels, 0 missing
  sex: 2 levels, 0 missing
  site: 4 levels, 0 missing
  grade: 5 levels, 0 missing
  radiotherapy: 2 levels, 0 missing
  chemotherapy: 2 levels, 0 missing
  tumor_number: 2 levels, 0 missing
  race: 4 levels, 0 missing
  marital_status: 7 levels, 0 missing
  urban_rural: 6 levels, 0 missing

Staging variable: ['TNMstage']
  TNMstage levels: [1, '4B', '4A', 2, 3, '4C', 4, '4NOS']
  TNMstage missing: 0


## 3. Univariate Cox Regression Function

In [30]:
def fit_univariate_cox(df, var, time_col, event_col):
    """
    Fit a univariate Cox model for a single categorical predictor.

    The predictor is dummy-coded and its first level (first category for a
    categorical column, otherwise the first level in pandas' dummy-column
    order) is used as the reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``var``, ``time_col`` and ``event_col``.
    var : str
        Name of the categorical predictor column.
    time_col : str
        Name of the duration column passed to the Cox model.
    event_col : str
        Name of the event column passed to the Cox model.

    Returns
    -------
    tuple
        ``(results, overall_p)``. ``results`` has one row per level, the
        reference row first (HR 1.0, NaN CI and p), with columns
        ``variable``, ``level``, ``exp(coef)``, ``exp(coef) lower 95%``,
        ``exp(coef) upper 95%``, ``p`` and ``coef``. ``overall_p`` is the
        likelihood-ratio-test p-value of the fitted model.
        ``(None, None)`` is returned when fewer than 50 complete rows remain,
        when the predictor has a single level, or when the fit fails.
    """
    prefix = f"{var}_"

    # Drop incomplete rows BEFORE dummy coding: get_dummies encodes a missing
    # predictor as an all-zero row, which would silently count those rows as
    # the reference level instead of excluding them.
    complete = df[[time_col, event_col, var]].dropna()

    if len(complete) < 50:
        return None, None

    # Encode every level first so the level that gets dropped (the first
    # column) is known exactly and can be labelled as the reference.
    all_dummies = pd.get_dummies(complete[var], prefix=var, dtype=float)
    ref_level = all_dummies.columns[0][len(prefix):]
    dummies = all_dummies.iloc[:, 1:]

    if dummies.shape[1] == 0:
        print(f"    Warning: {var} failed - only one level")
        return None, None

    # Prepare data for Cox model
    cox_data = pd.concat(
        [
            complete[[time_col, event_col]].reset_index(drop=True),
            dummies.reset_index(drop=True),
        ],
        axis=1,
    )

    # Fit Cox model; a failed fit is reported and skipped, not raised
    cph = CoxPHFitter()
    try:
        cph.fit(cox_data, duration_col=time_col, event_col=event_col)
    except Exception as e:
        print(f"    Warning: {var} failed - {e}")
        return None, None

    # Extract results; strip only the leading "<var>_" from each covariate
    # name so level names that contain the same text are left intact
    summary = cph.summary.copy()
    summary["variable"] = var
    summary["level"] = [name[len(prefix):] for name in summary.index]

    result_cols = [
        "variable",
        "level",
        "exp(coef)",
        "exp(coef) lower 95%",
        "exp(coef) upper 95%",
        "p",
        "coef",
    ]

    # Reference row: HR fixed at 1 with no CI or p-value
    ref_row = pd.DataFrame(
        {
            "variable": [var],
            "level": [f"{ref_level} (ref)"],
            "exp(coef)": [1.0],
            "exp(coef) lower 95%": [np.nan],
            "exp(coef) upper 95%": [np.nan],
            "p": [np.nan],
            "coef": [0.0],
        }
    )

    results = pd.concat([ref_row, summary[result_cols]], ignore_index=True)

    # Overall p-value (likelihood ratio test)
    overall_p = cph.log_likelihood_ratio_test().p_value

    return results, overall_p

## 4. Univariate Cox Screening: OS Endpoint

In [31]:
banner = "=" * 70
print(banner)
print("UNIVARIATE COX SCREENING: OVERALL SURVIVAL (OS)")
print(banner)
print("\n--- Non-staging Variables ---")

# Per-variable result tables and overall (likelihood-ratio) p-values
os_results = []
os_pvalues = {}

for var in non_staging_vars:
    print(f"\nFitting: {var}")
    results, overall_p = fit_univariate_cox(train, var, "time_os", "event_os")

    if results is None:
        print(f"  Skipped (insufficient data)")
        continue

    os_results.append(results)
    os_pvalues[var] = overall_p

    # Significance marker: first cutoff the p-value falls below, else none
    stars = next(
        (
            mark
            for cutoff, mark in ((0.001, "***"), (0.01, "**"), (0.05, "*"))
            if overall_p < cutoff
        ),
        "",
    )
    print(f"  Overall p-value: {overall_p:.4f} {stars}")

# Stack the per-variable tables into one frame
os_univariate_df = pd.concat(os_results, ignore_index=True)

UNIVARIATE COX SCREENING: OVERALL SURVIVAL (OS)

--- Non-staging Variables ---

Fitting: age
  Overall p-value: 0.0000 ***

Fitting: sex
  Overall p-value: 0.2181 

Fitting: site
  Overall p-value: 0.0022 **

Fitting: grade
  Overall p-value: 0.0001 ***

Fitting: radiotherapy
  Overall p-value: 0.0006 ***

Fitting: chemotherapy
  Overall p-value: 0.0000 ***

Fitting: tumor_number
  Overall p-value: 0.0017 **

Fitting: race
  Overall p-value: 0.3818 

Fitting: marital_status
  Overall p-value: 0.0008 ***

Fitting: urban_rural
  Overall p-value: 0.7331 


In [32]:
# Print the OS univariate table with formatted HR / CI / p columns
print("\n" + "=" * 70)
print("OS UNIVARIATE RESULTS (HR, 95% CI, p-value)")
print("=" * 70)

os_display = os_univariate_df.copy()

# Reference rows have no CI, so they are shown as "1.00 (ref)"
os_display["HR (95% CI)"] = [
    f"{hr:.2f} ({lower:.2f}-{upper:.2f})" if pd.notna(lower) else "1.00 (ref)"
    for hr, lower, upper in zip(
        os_display["exp(coef)"],
        os_display["exp(coef) lower 95%"],
        os_display["exp(coef) upper 95%"],
    )
]
os_display["p-value"] = [
    "-" if pd.isna(p_val) else f"{p_val:.4f}" for p_val in os_display["p"]
]

shown_cols = ["variable", "level", "HR (95% CI)", "p-value"]
print(os_display[shown_cols].to_string(index=False))


OS UNIVARIATE RESULTS (HR, 95% CI, p-value)
      variable                     level       HR (95% CI) p-value
           age               45-59 (ref)        1.00 (ref)       -
           age                       ＜45  0.80 (0.56-1.14)  0.2154
           age                       ＞60  2.14 (1.69-2.72)  0.0000
           sex                   女 (ref)        1.00 (ref)       -
           sex                         男  1.14 (0.93-1.39)  0.2169
          site              口腔口咽其它 (ref)        1.00 (ref)       -
          site                      喉和下咽  0.72 (0.36-1.42)  0.3393
          site                      大唾液腺  0.80 (0.63-1.01)  0.0576
          site                 鼻腔鼻窦副鼻窦鼻咽  1.33 (1.00-1.78)  0.0487
         grade                1分化好 (ref)        1.00 (ref)       -
         grade                      2中分化  1.64 (0.92-2.92)  0.0954
         grade                      3分化差  3.08 (1.70-5.61)  0.0002
         grade                   4未分化间变性  3.78 (1.95-7.35)  0.0001
         grade   

In [33]:
# Split the OS variables at the p < 0.05 screening threshold
os_significant, os_nonsignificant = [], []
for name, p_val in os_pvalues.items():
    if p_val < 0.05:
        os_significant.append(name)
    elif p_val >= 0.05:
        os_nonsignificant.append(name)

print("\n" + "=" * 70)
print("OS VARIABLE SCREENING SUMMARY")
print("=" * 70)

# One heading per group, then one line per variable with its p-value
for heading, names in (
    (f"\nSignificant (p < 0.05) - {len(os_significant)} variables:", os_significant),
    (
        f"\nNon-significant (p >= 0.05) - {len(os_nonsignificant)} variables:",
        os_nonsignificant,
    ),
):
    print(heading)
    for name in names:
        print(f"  {name}: p = {os_pvalues[name]:.4f}")


OS VARIABLE SCREENING SUMMARY

Significant (p < 0.05) - 7 variables:
  age: p = 0.0000
  site: p = 0.0022
  grade: p = 0.0001
  radiotherapy: p = 0.0006
  chemotherapy: p = 0.0000
  tumor_number: p = 0.0017
  marital_status: p = 0.0008

Non-significant (p >= 0.05) - 3 variables:
  sex: p = 0.2181
  race: p = 0.3818
  urban_rural: p = 0.7331


## 4b. OS Staging Variable (TNMstage)

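The staging variable gets special handling so that missing or unknown values are excluded before fitting (in the Track A training set no TNMstage values are missing).
We fit **TNMstage (combined)**: overall stage, with observed levels 1, 2, 3, 4, 4A, 4B, 4C and 4NOS; missing/unknown values are excluded.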

**Note**: Following reference R code (绘制诺模图代码.txt), we use only TNMstage as the staging variable, not separate T/N/M components.

In [34]:
def fit_univariate_cox_staging(df, var, time_col, event_col):
    """
    Fit a univariate Cox model for a staging variable, excluding
    missing/unknown values.

    Staging values are compared as strings; the first level in sorted order
    is the reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``var``, ``time_col`` and ``event_col``.
    var : str
        Name of the staging column. For ``"T"``, ``"N"`` and ``"M"`` values
        ending in ``"X"`` are also excluded.
    time_col : str
        Name of the duration column passed to the Cox model.
    event_col : str
        Name of the event column passed to the Cox model.

    Returns
    -------
    tuple
        ``(results, overall_p, n_used)``. ``results`` has the same layout as
        the non-staging screening table (reference row first). ``n_used`` is
        the number of rows that entered the analysis. ``(None, None, 0)`` is
        returned with fewer than 50 usable rows or fewer than 10 events, and
        ``(None, None, n_used)`` when only one level remains or the fit
        fails.
    """
    prefix = f"{var}_"

    # Keep only the needed columns and drop rows missing any of them up
    # front, so the N printed and returned is the N that is actually fitted.
    # The explicit copy makes the column assignment below safe.
    df_subset = df[[time_col, event_col, var]].dropna().copy()

    # Compare staging values as strings (the column may mix ints and strings)
    df_subset[var] = df_subset[var].astype(str)

    # Exclude unknown values
    unknown_patterns = ["Unknown", "UNK Stage", "nan", "NaN", "NA"]
    df_subset = df_subset[~df_subset[var].isin(unknown_patterns)]

    # For T, N, M: exclude TX, NX, MX (unknown)
    if var in ["T", "N", "M"]:
        df_subset = df_subset[~df_subset[var].str.endswith("X", na=False)]

    print(f"  N after excluding missing/unknown: {len(df_subset)} (from {len(df)})")

    if len(df_subset) < 50:
        print(f"  Skipped: insufficient data")
        return None, None, 0

    # Check number of events
    n_events = df_subset[event_col].sum()
    if n_events < 10:
        print(f"  Skipped: too few events ({n_events})")
        return None, None, 0

    levels = sorted(df_subset[var].unique())
    print(f"  Levels: {levels}")

    # A level without any event has no finite hazard ratio; flag it so the
    # resulting "0.00 (0.00-inf)" row is not mistaken for a real estimate.
    events_per_level = df_subset.groupby(var)[event_col].sum()
    no_event_levels = events_per_level[events_per_level == 0].index.tolist()
    if no_event_levels:
        print(
            f"  Warning: no events in level(s) {no_event_levels}; "
            "their HR is not estimable"
        )

    # Create dummy variables (drop first as reference)
    dummies = pd.get_dummies(df_subset[var], prefix=var, drop_first=True, dtype=float)

    if dummies.shape[1] == 0:
        print(f"  Skipped: only one level")
        return None, None, len(df_subset)

    # Prepare data for Cox model
    cox_data = pd.concat(
        [
            df_subset[[time_col, event_col]].reset_index(drop=True),
            dummies.reset_index(drop=True),
        ],
        axis=1,
    )

    # Fit Cox model; a failed fit is reported and skipped, not raised
    cph = CoxPHFitter()
    try:
        cph.fit(cox_data, duration_col=time_col, event_col=event_col)
    except Exception as e:
        print(f"  Warning: failed - {e}")
        return None, None, len(df_subset)

    # Extract results; strip only the leading "<var>_" from covariate names
    summary = cph.summary.copy()
    summary["variable"] = var
    summary["level"] = [name[len(prefix):] for name in summary.index]

    # Reference level is the first in sorted order (the dummy that was dropped)
    ref_level = levels[0]

    result_cols = [
        "variable",
        "level",
        "exp(coef)",
        "exp(coef) lower 95%",
        "exp(coef) upper 95%",
        "p",
        "coef",
    ]

    # Reference row: HR fixed at 1 with no CI or p-value
    ref_row = pd.DataFrame(
        {
            "variable": [var],
            "level": [f"{ref_level} (ref)"],
            "exp(coef)": [1.0],
            "exp(coef) lower 95%": [np.nan],
            "exp(coef) upper 95%": [np.nan],
            "p": [np.nan],
            "coef": [0.0],
        }
    )

    results = pd.concat([ref_row, summary[result_cols]], ignore_index=True)

    # Overall p-value (likelihood ratio test)
    overall_p = cph.log_likelihood_ratio_test().p_value

    return results, overall_p, len(df_subset)


print("Staging Cox function defined (excludes missing/unknown values)")

Staging Cox function defined (excludes missing/unknown values)


In [35]:
print("\n" + "=" * 70)
print("OS STAGING ANALYSIS (missing values excluded)")
print("=" * 70)

# Containers for the staging fit: result tables, overall p-values, N used
os_staging_results = []
os_staging_pvalues = {}
os_staging_n = {}

# Only the combined TNMstage is fitted, following the reference R code
print("\n--- TNMstage (Combined Stage) ---")
print("Fitting: TNMstage")
results, overall_p, n_used = fit_univariate_cox_staging(
    train, "TNMstage", "time_os", "event_os"
)

if results is None:
    print("  Skipped (insufficient data or convergence failed)")
else:
    os_staging_results.append(results)
    os_staging_pvalues["TNMstage"] = overall_p
    os_staging_n["TNMstage"] = n_used

    # Significance marker: first cutoff the p-value falls below, else none
    stars = next(
        (
            mark
            for cutoff, mark in ((0.001, "***"), (0.01, "**"), (0.05, "*"))
            if overall_p < cutoff
        ),
        "",
    )
    print(f"  Overall p-value: {overall_p:.4f} {stars}")

# Stack staging tables (left undefined when nothing was fitted)
if os_staging_results:
    os_staging_df = pd.concat(os_staging_results, ignore_index=True)


OS STAGING ANALYSIS (missing values excluded)

--- TNMstage (Combined Stage) ---
Fitting: TNMstage
  N after excluding missing/unknown: 926 (from 926)
  Levels: ['1', '2', '3', '4', '4A', '4B', '4C', '4NOS']
  Overall p-value: 0.0000 ***


In [36]:
# Display OS staging results and merge them into the overall OS tables
if os_staging_results:
    print("\n" + "=" * 70)
    print("OS STAGING RESULTS (HR, 95% CI, p-value)")
    print("=" * 70)

    os_staging_display = os_staging_df.copy()
    # Reference rows have no CI, so they are shown as "1.00 (ref)"
    os_staging_display["HR (95% CI)"] = os_staging_display.apply(
        lambda x: (
            f"{x['exp(coef)']:.2f} ({x['exp(coef) lower 95%']:.2f}-{x['exp(coef) upper 95%']:.2f})"
            if pd.notna(x["exp(coef) lower 95%"])
            else "1.00 (ref)"
        ),
        axis=1,
    )
    os_staging_display["p-value"] = os_staging_display["p"].apply(
        lambda x: f"{x:.4f}" if pd.notna(x) else "-"
    )

    print(
        os_staging_display[["variable", "level", "HR (95% CI)", "p-value"]].to_string(
            index=False
        )
    )

    # Add staging results to overall OS results. Rows for the same variable
    # left behind by an earlier run of this cell are removed first, so
    # re-executing the cell does not duplicate the staging rows.
    already_merged = os_univariate_df["variable"].isin(os_staging_df["variable"])
    os_univariate_df = pd.concat(
        [os_univariate_df[~already_merged], os_staging_df], ignore_index=True
    )
    os_pvalues.update(os_staging_pvalues)

    print("\n--- Staging Summary (OS) ---")
    if "TNMstage" in os_staging_pvalues:
        sig = (
            "***"
            if os_staging_pvalues["TNMstage"] < 0.001
            else (
                "**"
                if os_staging_pvalues["TNMstage"] < 0.01
                else "*" if os_staging_pvalues["TNMstage"] < 0.05 else ""
            )
        )
        print(
            f"  TNMstage: p = {os_staging_pvalues['TNMstage']:.4f} {sig} (N = {os_staging_n['TNMstage']})"
        )


OS STAGING RESULTS (HR, 95% CI, p-value)
variable   level       HR (95% CI) p-value
TNMstage 1 (ref)        1.00 (ref)       -
TNMstage       2  1.51 (1.04-2.18)  0.0303
TNMstage       3  2.01 (1.39-2.90)  0.0002
TNMstage       4   0.00 (0.00-inf)  0.9925
TNMstage      4A  2.95 (2.10-4.15)  0.0000
TNMstage      4B  4.03 (2.70-6.02)  0.0000
TNMstage      4C 8.57 (5.99-12.25)  0.0000
TNMstage    4NOS 5.15 (1.86-14.25)  0.0016

--- Staging Summary (OS) ---
  TNMstage: p = 0.0000 *** (N = 926)


## 5. Univariate Cox Screening: CSS Endpoint

In [37]:
banner = "=" * 70
print(banner)
print("UNIVARIATE COX SCREENING: CANCER-SPECIFIC SURVIVAL (CSS)")
print(banner)
print("\n--- Non-staging Variables ---")

# Per-variable result tables and overall (likelihood-ratio) p-values
css_results = []
css_pvalues = {}

for var in non_staging_vars:
    print(f"\nFitting: {var}")
    results, overall_p = fit_univariate_cox(train, var, "time_css", "event_css")

    if results is None:
        print(f"  Skipped (insufficient data)")
        continue

    css_results.append(results)
    css_pvalues[var] = overall_p

    # Significance marker: first cutoff the p-value falls below, else none
    stars = next(
        (
            mark
            for cutoff, mark in ((0.001, "***"), (0.01, "**"), (0.05, "*"))
            if overall_p < cutoff
        ),
        "",
    )
    print(f"  Overall p-value: {overall_p:.4f} {stars}")

# Stack the per-variable tables into one frame
css_univariate_df = pd.concat(css_results, ignore_index=True)

UNIVARIATE COX SCREENING: CANCER-SPECIFIC SURVIVAL (CSS)

--- Non-staging Variables ---

Fitting: age
  Overall p-value: 0.0136 *

Fitting: sex
  Overall p-value: 0.3010 

Fitting: site
  Overall p-value: 0.0528 

Fitting: grade
  Overall p-value: 0.0000 ***

Fitting: radiotherapy
  Overall p-value: 0.0664 

Fitting: chemotherapy
  Overall p-value: 0.0000 ***

Fitting: tumor_number
  Overall p-value: 0.0264 *

Fitting: race
  Overall p-value: 0.3829 

Fitting: marital_status
  Overall p-value: 0.6106 

Fitting: urban_rural
  Overall p-value: 0.5919 


In [38]:
# Print the CSS univariate table with formatted HR / CI / p columns
print("\n" + "=" * 70)
print("CSS UNIVARIATE RESULTS (HR, 95% CI, p-value)")
print("=" * 70)

css_display = css_univariate_df.copy()

# Reference rows have no CI, so they are shown as "1.00 (ref)"
css_display["HR (95% CI)"] = [
    f"{hr:.2f} ({lower:.2f}-{upper:.2f})" if pd.notna(lower) else "1.00 (ref)"
    for hr, lower, upper in zip(
        css_display["exp(coef)"],
        css_display["exp(coef) lower 95%"],
        css_display["exp(coef) upper 95%"],
    )
]
css_display["p-value"] = [
    "-" if pd.isna(p_val) else f"{p_val:.4f}" for p_val in css_display["p"]
]

shown_cols = ["variable", "level", "HR (95% CI)", "p-value"]
print(css_display[shown_cols].to_string(index=False))


CSS UNIVARIATE RESULTS (HR, 95% CI, p-value)
      variable                     level       HR (95% CI) p-value
           age               45-59 (ref)        1.00 (ref)       -
           age                       ＜45  0.85 (0.58-1.24)  0.3973
           age                       ＞60  1.35 (1.02-1.79)  0.0364
           sex                   女 (ref)        1.00 (ref)       -
           sex                         男  1.14 (0.89-1.47)  0.2997
          site              口腔口咽其它 (ref)        1.00 (ref)       -
          site                      喉和下咽  0.92 (0.42-2.00)  0.8380
          site                      大唾液腺  0.84 (0.63-1.13)  0.2526
          site                 鼻腔鼻窦副鼻窦鼻咽  1.37 (0.95-1.96)  0.0899
         grade                1分化好 (ref)        1.00 (ref)       -
         grade                      2中分化  1.74 (0.79-3.82)  0.1666
         grade                      3分化差 4.90 (2.26-10.62)  0.0001
         grade                   4未分化间变性 5.29 (2.26-12.38)  0.0001
         grade  

In [39]:
# Split the CSS variables at the p < 0.05 screening threshold
css_significant, css_nonsignificant = [], []
for name, p_val in css_pvalues.items():
    if p_val < 0.05:
        css_significant.append(name)
    elif p_val >= 0.05:
        css_nonsignificant.append(name)

print("\n" + "=" * 70)
print("CSS VARIABLE SCREENING SUMMARY")
print("=" * 70)

# One heading per group, then one line per variable with its p-value
for heading, names in (
    (
        f"\nSignificant (p < 0.05) - {len(css_significant)} variables:",
        css_significant,
    ),
    (
        f"\nNon-significant (p >= 0.05) - {len(css_nonsignificant)} variables:",
        css_nonsignificant,
    ),
):
    print(heading)
    for name in names:
        print(f"  {name}: p = {css_pvalues[name]:.4f}")


CSS VARIABLE SCREENING SUMMARY

Significant (p < 0.05) - 4 variables:
  age: p = 0.0136
  grade: p = 0.0000
  chemotherapy: p = 0.0000
  tumor_number: p = 0.0264

Non-significant (p >= 0.05) - 6 variables:
  sex: p = 0.3010
  site: p = 0.0528
  radiotherapy: p = 0.0664
  race: p = 0.3829
  marital_status: p = 0.6106
  urban_rural: p = 0.5919


## 5b. CSS Staging Variable (TNMstage)

In [40]:
print("\n" + "=" * 70)
print("CSS STAGING ANALYSIS (missing values excluded)")
print("=" * 70)

# Containers for the staging fit: result tables, overall p-values, N used
css_staging_results = []
css_staging_pvalues = {}
css_staging_n = {}

# Only the combined TNMstage is fitted, following the reference R code
print("\n--- TNMstage (Combined Stage) ---")
print("Fitting: TNMstage")
results, overall_p, n_used = fit_univariate_cox_staging(
    train, "TNMstage", "time_css", "event_css"
)

if results is None:
    print("  Skipped (insufficient data or convergence failed)")
else:
    css_staging_results.append(results)
    css_staging_pvalues["TNMstage"] = overall_p
    css_staging_n["TNMstage"] = n_used

    # Significance marker: first cutoff the p-value falls below, else none
    stars = next(
        (
            mark
            for cutoff, mark in ((0.001, "***"), (0.01, "**"), (0.05, "*"))
            if overall_p < cutoff
        ),
        "",
    )
    print(f"  Overall p-value: {overall_p:.4f} {stars}")

# Stack staging tables (left undefined when nothing was fitted)
if css_staging_results:
    css_staging_df = pd.concat(css_staging_results, ignore_index=True)


CSS STAGING ANALYSIS (missing values excluded)

--- TNMstage (Combined Stage) ---
Fitting: TNMstage
  N after excluding missing/unknown: 926 (from 926)
  Levels: ['1', '2', '3', '4', '4A', '4B', '4C', '4NOS']
  Overall p-value: 0.0000 ***


In [41]:
# Display CSS staging results and merge them into the overall CSS tables
if css_staging_results:
    print("\n" + "=" * 70)
    print("CSS STAGING RESULTS (HR, 95% CI, p-value)")
    print("=" * 70)

    css_staging_display = css_staging_df.copy()
    # Reference rows have no CI, so they are shown as "1.00 (ref)"
    css_staging_display["HR (95% CI)"] = css_staging_display.apply(
        lambda x: (
            f"{x['exp(coef)']:.2f} ({x['exp(coef) lower 95%']:.2f}-{x['exp(coef) upper 95%']:.2f})"
            if pd.notna(x["exp(coef) lower 95%"])
            else "1.00 (ref)"
        ),
        axis=1,
    )
    css_staging_display["p-value"] = css_staging_display["p"].apply(
        lambda x: f"{x:.4f}" if pd.notna(x) else "-"
    )

    print(
        css_staging_display[["variable", "level", "HR (95% CI)", "p-value"]].to_string(
            index=False
        )
    )

    # Add staging results to overall CSS results. Rows for the same variable
    # left behind by an earlier run of this cell are removed first, so
    # re-executing the cell does not duplicate the staging rows.
    already_merged = css_univariate_df["variable"].isin(css_staging_df["variable"])
    css_univariate_df = pd.concat(
        [css_univariate_df[~already_merged], css_staging_df], ignore_index=True
    )
    css_pvalues.update(css_staging_pvalues)

    print("\n--- Staging Summary (CSS) ---")
    if "TNMstage" in css_staging_pvalues:
        sig = (
            "***"
            if css_staging_pvalues["TNMstage"] < 0.001
            else (
                "**"
                if css_staging_pvalues["TNMstage"] < 0.01
                else "*" if css_staging_pvalues["TNMstage"] < 0.05 else ""
            )
        )
        print(
            f"  TNMstage: p = {css_staging_pvalues['TNMstage']:.4f} {sig} (N = {css_staging_n['TNMstage']})"
        )


CSS STAGING RESULTS (HR, 95% CI, p-value)
variable   level         HR (95% CI) p-value
TNMstage 1 (ref)          1.00 (ref)       -
TNMstage       2    2.70 (1.51-4.83)  0.0008
TNMstage       3    3.36 (1.88-6.02)  0.0000
TNMstage       4     0.00 (0.00-inf)  0.9927
TNMstage      4A   6.60 (3.86-11.27)  0.0000
TNMstage      4B  10.11 (5.67-18.02)  0.0000
TNMstage      4C 20.38 (11.83-35.11)  0.0000
TNMstage    4NOS  11.95 (3.50-40.85)  0.0001

--- Staging Summary (CSS) ---
  TNMstage: p = 0.0000 *** (N = 926)


## 6. Combined Screening Summary

In [42]:
# Side-by-side OS / CSS screening table for every candidate variable
all_vars = non_staging_vars + ["TNMstage"]

# Re-derive the significant / non-significant lists at p < 0.05; the p-value
# dicts now also hold the staging variable
os_significant, os_nonsignificant = [], []
css_significant, css_nonsignificant = [], []
for pvalues, passed, failed in (
    (os_pvalues, os_significant, os_nonsignificant),
    (css_pvalues, css_significant, css_nonsignificant),
):
    for name, p_val in pvalues.items():
        if p_val < 0.05:
            passed.append(name)
        elif p_val >= 0.05:
            failed.append(name)

# One row per variable; p-values are formatted to 4 decimals, "NA" if absent
summary_rows = []
for name in all_vars:
    os_p = os_pvalues.get(name, np.nan)
    css_p = css_pvalues.get(name, np.nan)
    summary_rows.append(
        {
            "Variable": name,
            "OS p-value": f"{os_p:.4f}" if pd.notna(os_p) else "NA",
            "OS Significant": name in os_significant,
            "CSS p-value": f"{css_p:.4f}" if pd.notna(css_p) else "NA",
            "CSS Significant": name in css_significant,
        }
    )

screening_summary = pd.DataFrame(
    summary_rows,
    columns=[
        "Variable",
        "OS p-value",
        "OS Significant",
        "CSS p-value",
        "CSS Significant",
    ],
)

print("\n" + "=" * 70)
print("UNIVARIATE SCREENING SUMMARY: OS vs CSS (All Variables)")
print("=" * 70)
print(screening_summary.to_string(index=False))


UNIVARIATE SCREENING SUMMARY: OS vs CSS (All Variables)
      Variable OS p-value  OS Significant CSS p-value  CSS Significant
           age     0.0000            True      0.0136             True
           sex     0.2181           False      0.3010            False
          site     0.0022            True      0.0528            False
         grade     0.0001            True      0.0000             True
  radiotherapy     0.0006            True      0.0664            False
  chemotherapy     0.0000            True      0.0000             True
  tumor_number     0.0017            True      0.0264             True
          race     0.3818           False      0.3829            False
marital_status     0.0008            True      0.6106            False
   urban_rural     0.7331           False      0.5919            False
      TNMstage     0.0000            True      0.0000             True


In [43]:
# List the variables that pass screening for each endpoint
print("\n" + "=" * 70)
print("VARIABLES ADVANCING TO MULTIVARIATE ANALYSIS")
print("=" * 70)

for endpoint, chosen in (("OS", os_significant), ("CSS", css_significant)):
    print(f"\n{endpoint} Model Candidates ({len(chosen)} variables):")
    print(f"  {', '.join(chosen)}")

# Variables that pass for both endpoints, listed alphabetically
both_significant = set(os_significant) & set(css_significant)
shared = ", ".join(sorted(both_significant)) if both_significant else "None"
print(f"\nSignificant in BOTH endpoints ({len(both_significant)} variables):")
print(f"  {shared}")

# Reminder of how staging is represented
for line in (
    "\n--- Staging Variable Note ---",
    "Following reference R code (绘制诺模图代码.txt):",
    "  Using TNMstage (combined stage) as the staging variable",
    "  This approach provides better clinical interpretability and model parsimony",
):
    print(line)


VARIABLES ADVANCING TO MULTIVARIATE ANALYSIS

OS Model Candidates (8 variables):
  age, site, grade, radiotherapy, chemotherapy, tumor_number, marital_status, TNMstage

CSS Model Candidates (5 variables):
  age, grade, chemotherapy, tumor_number, TNMstage

Significant in BOTH endpoints (5 variables):
  TNMstage, age, chemotherapy, grade, tumor_number

--- Staging Variable Note ---
Following reference R code (绘制诺模图代码.txt):
  Using TNMstage (combined stage) as the staging variable
  This approach provides better clinical interpretability and model parsimony


## 7. Save Results

In [44]:
# Write the screening outputs next to the processed data
output_dir = Path("../data/processed")

# Detailed per-level tables (staging rows included) and the summary table
os_univariate_df.to_csv(output_dir / "univariate_os_trackA.csv", index=False)
css_univariate_df.to_csv(output_dir / "univariate_css_trackA.csv", index=False)
screening_summary.to_csv(
    output_dir / "univariate_screening_summary_trackA.csv", index=False
)

# Staging sample sizes per endpoint; empty when the staging cell was not run
staging_counts = {}
for endpoint, counts_name in (("os", "os_staging_n"), ("css", "css_staging_n")):
    if counts_name in globals():
        staging_counts[endpoint] = {
            k: int(v) for k, v in globals()[counts_name].items()
        }
    else:
        staging_counts[endpoint] = {}

# Selected variables and settings for the next notebook
selected_vars = {
    "os_significant_vars": os_significant,
    "css_significant_vars": css_significant,
    "os_pvalues": {k: float(v) for k, v in os_pvalues.items()},
    "css_pvalues": {k: float(v) for k, v in css_pvalues.items()},
    "staging_var": "TNMstage",  # Following reference R code
    "staging_n": staging_counts,
    "screening_threshold": 0.05,
    "track": "A",
}

with open(output_dir / "univariate_selected_vars_trackA.json", "w") as f:
    json.dump(selected_vars, f, indent=2)

print("Results saved:")
for saved_name in (
    "univariate_os_trackA.csv",
    "univariate_css_trackA.csv",
    "univariate_screening_summary_trackA.csv",
    "univariate_selected_vars_trackA.json",
):
    print(f"  {saved_name}")

Results saved:
  univariate_os_trackA.csv
  univariate_css_trackA.csv
  univariate_screening_summary_trackA.csv
  univariate_selected_vars_trackA.json


## Summary

### Univariate Cox Screening Results (Track A)

**Variables screened**: 11 total
- Non-staging: 10 variables (age, sex, site, grade, radiotherapy, chemotherapy, tumor_number, race, marital_status, urban_rural)
- Staging: 1 variable (TNMstage combined)

### Staging Analysis
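**TNMstage (combined)**: Overall stage (observed levels: 1, 2, 3, 4, 4A, 4B, 4C, 4NOS)
- Level 4 has a non-estimable hazard ratio for both endpoints (HR 0.00, 95% CI 0.00-inf), so that row should not be interpreted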
- Missing values excluded before fitting
- Following reference R code (绘制诺模图代码.txt): uses TNMstage only, not separate T/N/M

### Screening Criterion
- **Threshold**: p < 0.05 (likelihood ratio test)
- **Method**: Univariate Cox proportional hazards regression
- **Staging**: Missing values excluded before fitting

### Next Steps
- **Notebook 03**: Forward-stepwise multivariate Cox using significant variables
- Use TNMstage (combined) as the staging variable
- OS model: use `os_significant_vars`
- CSS model: use `css_significant_vars`