
# BioNeuralNet Cancer Example 2

In [None]:
from bioneuralnet.datasets.dataset_loader import DatasetLoader

# Load the "TCGA_BRCA" dataset; the resulting `brca` object is what the
# later cells read their tables from.
brca = DatasetLoader("TCGA_BRCA")

# Print the loader's reported shape first, then its printable summary.
for summary in (brca.shape, brca):
    print(summary)


In [None]:
from bioneuralnet.utils.preprocess import select_top_k_variance, top_anova_f_features

# 1) Pull the methylation, RNA and PAM50 tables out of the loaded dataset
brca_meth_df = brca.data["BRCA_Meth"]
brca_rna_df = brca.data["BRCA_RNA"]
pam50_df = brca.data["BRCA_PAM50"]

# 2) Encode PAM50 → numeric codes
#    Series.map turns every label that is NOT a key of `mapping` into NaN.
mapping = {
    "Normal": 0,
    "Basal": 1,
    "Her2": 2,
    "LumA": 3,
    "LumB": 4,
}
pam50_series = pam50_df["PAM50"].map(mapping)

# Sanity check: dropna=False keeps a NaN row in the counts, so labels that
# were missing from `mapping` are visible instead of being silently dropped.
print(pam50_series.value_counts(dropna=False))

# 3) Select top-k by variance
top_k = 2000
meth_var = select_top_k_variance(brca_meth_df, k=top_k)
rna_var = select_top_k_variance(brca_rna_df, k=top_k)
print("Variance‐based:")
print("  Meth shape:", meth_var.shape)
print("  RNA  shape:", rna_var.shape)

# 4) Select top-k by ANOVA F-test
#    Note: the function signature is (X, y, max_features=…)
meth_anova = top_anova_f_features(
    brca_meth_df,
    pam50_series,
    max_features=top_k,
)
rna_anova = top_anova_f_features(
    brca_rna_df,
    pam50_series,
    max_features=top_k,
)
print("ANOVA‐based:")
print("  Meth shape:", meth_anova.shape)
print("  RNA  shape:", rna_anova.shape)

# 5) (Optional) save to CSV
#    File names are derived from top_k so they stay truthful if top_k is
#    changed; with top_k = 2000 they are the same names as before.
meth_var.to_csv(f"brca_meth_top{top_k}_var.csv", index_label="sample_id")
rna_var.to_csv(f"brca_rna_top{top_k}_var.csv", index_label="sample_id")
meth_anova.to_csv(f"brca_meth_top{top_k}_anova.csv", index_label="sample_id")
rna_anova.to_csv(f"brca_rna_top{top_k}_anova.csv", index_label="sample_id")


In [None]:
# Columns kept by BOTH selections (variance and ANOVA), one Index per table.
meth_variance_cols = meth_var.columns
rna_variance_cols = rna_var.columns
common_meth = meth_variance_cols.intersection(meth_anova.columns)
common_rna = rna_variance_cols.intersection(rna_anova.columns)

# Report each overlap as a raw count and as a share of the requested top_k.
meth_hits = len(common_meth)
rna_hits = len(common_rna)
print(f"Methylation overlap: {meth_hits} / {top_k} features ({meth_hits / top_k:.1%})")
print(f"RNA overlap:         {rna_hits} / {top_k} features ({rna_hits / top_k:.1%})")

# (Optional) show a short preview of the shared feature names.
preview = slice(10)  # same selection as [:10]
print("\nFirst 10 common methylation features:", list(common_meth[preview]))
print("First 10 common RNA features:       ", list(common_rna[preview]))