In [None]:
import anndata as ad
import pandas as pd
import numpy as np
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the annotated dataset and assemble the table used for survival analysis
# from its per-observation metadata (`adata.obs`).
adata = sc.read('../Data/dataset_annotated.h5ad')

# Columns required downstream: follow-up duration, event indicator, cluster label.
survival_columns = ['overall survival days', 'overall survival event', 'classification']

# Keep only observations where all three fields are present.
survival_data = (
    adata.obs[survival_columns]
    .dropna(subset=survival_columns)
    .copy()
)

# Cast the event indicator to integer (presumably coded 0/1 -- confirm against the data).
event_flags = survival_data['overall survival event'].astype(int)
survival_data['overall survival event'] = event_flags

print(f"\nPrepared survival data shape: {survival_data.shape}")
print(f"Number of events in prepared data: {event_flags.sum()}")

In [None]:
# --- 3. Kaplan-Meier Survival Curves ---
print("\nPerforming Kaplan-Meier analysis...")
kmf = KaplanMeierFitter()

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)

# One survival curve per cluster; labels are sorted so that the cluster-to-colour
# assignment is the same on every run.
cluster_labels = sorted(survival_data['classification'].unique())
for cluster_name in cluster_labels:
    in_cluster = survival_data['classification'] == cluster_name
    if not in_cluster.any():
        continue
    cluster_data = survival_data[in_cluster]
    kmf.fit(
        cluster_data['overall survival days'],
        event_observed=cluster_data['overall survival event'],
        label=cluster_name,
    )
    # Confidence bands are hidden to keep the multi-curve plot readable.
    kmf.plot(ax=ax, ci_show=False)

ax.set_title('Overall Survival by Clusters')
ax.set_xlabel('Days')
ax.set_ylabel('Survival probability')
ax.grid(True)
ax.legend(loc='upper right')
# The y-axis is truncated (it does not start at 0) to zoom in on the curves.
ax.set_ylim(0.7, 1.01)
fig.tight_layout()
plt.show()

Figure 1: Estimated Overall Survival Probabilities by Gene Expression Clusters.

This figure displays Kaplan-Meier curves illustrating the estimated overall survival probability over time for patients grouped by the six identified breast cancer gene expression clusters: Basal-G3, LumA-G2, LumB-G3, Her2-G2, Her2-G3, and Normal-G2.

- X-axis: Represents the follow-up time from diagnosis or sample collection, shown in days.

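- Y-axis: Represents the estimated probability of overall survival. The plotted range is restricted to run from 1.0 (100% survival) down to approximately 0.7 (70% survival); because the axis does not start at 0, differences between the curves appear visually magnified.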

- Colored Lines: Each distinct line corresponds to one of the six clusters, differentiated by color and labeled in the legend. The vertical steps in each curve indicate time points at which an event (death) occurred within that specific cluster.

Interpretation: The Kaplan-Meier curves reveal substantial heterogeneity in estimated overall survival across the identified molecular clusters. The Normal-G2 and LumA-G2 clusters demonstrate the most favorable estimated overall survival outcomes, with the highest survival probabilities maintained throughout the observation period. The Basal-G3 cluster shows the least favorable prognosis, with the steepest decline in estimated survival probability. The LumB-G3 cluster exhibits intermediate survival outcomes, better than the Basal and Her2 groups but worse than LumA-G2 and Normal-G2.

Notably, among the Her2-classified clusters, the Her2-G2 cluster displays unexpectedly poor survival, with its curve falling consistently below the Her2-G3 cluster and tracking closely with the curve for the Basal-G3 cluster. This visual observation is counter-intuitive given that Grade 2 tumors typically have a better prognosis than Grade 3, and highlights that the molecular characteristics captured by this Her2-G2 cluster may contribute to a more aggressive clinical behavior than suggested by grade alone. These distinct survival profiles emphasize the prognostic relevance of the identified clusters and warrant further investigation into the biological underpinnings driving the poorer outcomes observed in Basal-G3 and this specific Her2-G2 cluster. Statistical analysis (e.g., Cox regression) is necessary to quantify the hazard differences and assess their significance.

In [None]:
# --- 4. Cox Proportional Hazards Regression ---
# Fits a Cox model with the cluster label as the only covariate, encoded as
# dummy variables relative to a single reference cluster.
print("\nPerforming Cox Regression...")
cox_data = survival_data.copy()

# One-hot encode the cluster labels. Explicit float dummies keep the design
# matrix numeric regardless of pandas' default dummy dtype.
classification_dummies = pd.get_dummies(
    cox_data['classification'], prefix='cluster', dtype=float
)

# Preferred reference level: every hazard ratio is reported relative to it.
ref_col_name = 'cluster_Her2-G3'
if ref_col_name in classification_dummies.columns:
    actual_ref_col = ref_col_name
    print(f"\nDummy variables created with '{ref_col_name}' as the reference.")
else:
    # Exactly one level must still be left out: with all dummy columns present
    # they sum to 1 on every row (perfect collinearity), so the coefficients
    # are not identifiable and the fit is unreliable. Fall back to the first
    # dummy column and report which reference is actually in use.
    actual_ref_col = classification_dummies.columns[0]
    print(f"\nWarning: Reference cluster '{ref_col_name}' not found in the data. "
          f"Using '{actual_ref_col}' as the reference instead.")

# Predictors are all dummy columns except the reference level.
predictor_cols = classification_dummies.columns.drop(actual_ref_col).tolist()
cox_data = cox_data.join(classification_dummies[predictor_cols])

# Define duration and event columns for the fitter
duration_col = 'overall survival days'
event_col = 'overall survival event'

# Instantiate and fit the CoxPHFitter on the predictors plus duration/event only.
cph = CoxPHFitter()
cph.fit(
    cox_data[predictor_cols + [duration_col, event_col]],
    duration_col=duration_col,
    event_col=event_col,
)

# --- 5. Print Cox Model Summary ---
print("\nCox Proportional Hazards Model Summary:")
cph.print_summary()

In [None]:
# Forest plot of the fitted Cox model: one row per cluster dummy variable,
# showing its estimated log(HR) with a 95% confidence interval relative to the
# reference cluster used when the model was fitted.
cph.plot()

Figure 2: Forest Plot of Hazard Ratios for Overall Survival by Cluster, Referenced to Her2-G3.

This figure displays the estimated log of the Hazard Ratio (log(HR)) and corresponding 95% confidence intervals for overall survival, comparing each identified cluster to the Her2-G3 cluster. The results were derived from a Cox Proportional Hazards model fitted to the breast cancer cohort (n=3273 observations, 336 events). The Her2-G3 cluster served as the reference group in this model.

- X-axis: Represents the estimated log(HR) and its 95% confidence interval. The vertical dashed line at 0 on the log(HR) scale corresponds to a Hazard Ratio of 1 (exp(0)=1), indicating no difference in hazard compared to the Her2-G3 reference group. Points or intervals to the right of the line (positive log(HR)) indicate an increased hazard (higher risk of death), while points or intervals to the left (negative log(HR)) indicate a decreased hazard (lower risk of death).

- Y-axis: Lists the other identified clusters, representing the terms in the Cox model (cluster_Basal-G3, cluster_Her2-G2, etc.).

- Square Markers: Indicate the point estimate of the log(HR) for each cluster relative to the Her2-G3 reference group.

- Horizontal Lines: Represent the 95% confidence interval for the estimated log(HR). Confidence intervals that do not cross the vertical line at 0 indicate a statistically significant difference in hazard (typically at alpha=0.05) compared to the reference group.

Interpretation: The Cox model results show significant differences in overall survival hazard between several clusters and the Her2-G3 reference. The Basal-G3 cluster exhibits a significantly increased hazard (log(HR) 0.53, 95% CI: 0.18 to 0.88, p<0.005), indicating poorer overall survival compared to Her2-G3. Conversely, the LumA-G2 (log(HR) -0.52, 95% CI: -0.83 to -0.20, p<0.005), LumB-G3 (log(HR) -0.41, 95% CI: -0.78 to -0.04, p=0.03), and Normal-G2 (log(HR) -0.72, 95% CI: -1.09 to -0.35, p<0.005) clusters all show significantly decreased hazard (better overall survival) compared to Her2-G3. The Her2-G2 cluster shows an estimated log(HR) of 0.41 (95% CI: -0.19 to 1.01), indicating a trend towards higher hazard than Her2-G3, but the 95% confidence interval crosses 0, meaning this difference is not statistically significant (p=0.18) in this model. This is consistent with the Kaplan-Meier plot: the Her2-G2 survival curve tracked closely with that of Basal-G3, but the wide confidence interval means its hazard cannot be statistically distinguished from that of Her2-G3 in the Cox model. The overall Cox model is highly statistically significant (Likelihood ratio test p < 0.005), although its discriminative ability is only modest (concordance = 0.61, where 0.5 corresponds to chance and 1.0 to perfect discrimination).