In [None]:
!pip install scanpy anndata matplotlib scikit-learn



In [None]:
import scanpy as sc

# Read the z-scaled AnnData file (path is relative to the working directory).
adata = sc.read_h5ad("z-scaled.h5ad")

# Collect the annotation column names once so the summary below stays compact.
obs_columns = list(adata.obs.columns)
var_columns = list(adata.var.columns)

# Summary: matrix dimensions followed by the available annotation columns.
print(f"Shape of data: {adata.shape}")  # (n_samples, n_genes)
print("Metadata columns (adata.obs):", obs_columns)
print("Gene info columns (adata.var):", var_columns)

# Preview of the first rows of the per-sample metadata table.
obs_preview = adata.obs.head()
print(obs_preview)

Shape of data: (572, 19740)
Metadata columns (adata.obs): ['nCount_RNA', 'nFeature_RNA', 'cases.submitter_id', 'sample_id', 'total_expr', 'cases.consent_type', 'cases.days_to_consent', 'cases.disease_type', 'cases.lost_to_followup', 'demographic.age_at_index', 'demographic.age_is_obfuscated', 'demographic.country_of_residence_at_enrollment', 'demographic.days_to_birth', 'demographic.days_to_death', 'demographic.demographic_id', 'demographic.ethnicity', 'demographic.gender', 'demographic.race', 'demographic.submitter_id', 'demographic.vital_status', 'diagnoses.age_at_diagnosis', 'diagnoses.ajcc_pathologic_m', 'diagnoses.ajcc_pathologic_n', 'diagnoses.ajcc_pathologic_stage', 'diagnoses.ajcc_pathologic_t', 'diagnoses.ajcc_staging_system_edition', 'diagnoses.classification_of_tumor', 'diagnoses.days_to_diagnosis', 'diagnoses.diagnosis_id.x', 'diagnoses.diagnosis_is_primary_disease', 'diagnoses.figo_stage', 'diagnoses.figo_staging_edition_year', 'diagnoses.icd_10_code', 'diagnoses.lateralit

In [None]:
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency, kruskal

# This cell originally relied on `pd` and `X_mds`, which were only defined by a
# later cell, so it failed on a fresh kernel. Bring both into scope here.
import pandas as pd
from sklearn.manifold import MDS

# Reuse the embedding when it is already stored on `adata`; otherwise compute it
# with the same MDS settings that the rest of the notebook uses.
if "X_mds" in adata.obsm:
    X_mds = adata.obsm["X_mds"]
else:
    X_mds = MDS(n_components=2, random_state=42, n_init=4, max_iter=300).fit_transform(adata.X)
    adata.obsm["X_mds"] = X_mds

# Cluster the MDS coordinates into three groups and store the labels (as strings)
# in the sample metadata.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_mds)
adata.obs["mds_cluster"] = kmeans.labels_.astype(str)

# Chi-squared test: cluster vs gender
contingency_table = pd.crosstab(adata.obs["mds_cluster"], adata.obs["demographic.gender"])
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"Chi-square test p-value for gender: {p:.4f}")

# Chi-squared test: cluster vs smoking status
# A separate statistic name keeps the gender result from being overwritten.
contingency_smoking = pd.crosstab(adata.obs["mds_cluster"], adata.obs["exposures.tobacco_smoking_status"])
chi2_smoking, p_smoking, _, _ = chi2_contingency(contingency_smoking)
print(f"Chi-square p-value for cluster vs smoking status: {p_smoking:.4f}")

# Kruskal-Wallis test: cluster vs age at diagnosis
# One sample of ages per cluster (in sorted cluster order), with missing ages dropped.
age_at_diagnosis = adata.obs["diagnoses.age_at_diagnosis"]
groups = [
    age_at_diagnosis[adata.obs["mds_cluster"] == c].dropna()
    for c in sorted(adata.obs["mds_cluster"].unique())
]

kw_stat, p_age = kruskal(*groups)
print(f"Kruskal-Wallis p-value for cluster vs age at diagnosis: {p_age:.4f}")

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency, kruskal


# Read the z-scaled AnnData file from its absolute path and report its dimensions.
adata = sc.read_h5ad("/content/z-scaled.h5ad")
print(f"Shape of data: {adata.shape}")

# Embed the expression matrix in two dimensions with MDS.
# random_state is pinned so that reruns start from the same seed.
X = adata.X
mds = MDS(
    n_components=2,
    n_init=4,
    max_iter=300,
    random_state=42,
)

# Keep the coordinates both on the AnnData object (for plotting through scanpy)
# and in a plain variable (for the clustering further down).
adata.obsm["X_mds"] = X_mds = mds.fit_transform(X)


def plot_mds_feature(feature, title, save_path):
    """Plot the MDS embedding coloured by one annotation and save it as an image.

    Reads the notebook-level ``adata`` object and its ``"X_mds"`` embedding.

    Parameters
    ----------
    feature
        Value forwarded to ``sc.pl.embedding`` as ``color``.
    title
        Title forwarded to ``sc.pl.embedding``.
    save_path
        Destination handed to ``savefig``.

    Returns
    -------
    None
        The figure is written to ``save_path`` (300 dpi, tight bounding box)
        and then closed.
    """
    appearance = dict(
        size=20,                    # small points
        legend_loc="right margin",  # legend placed outside the axes
        legend_fontsize=8,          # small legend text
        frameon=False,              # no frame around the axes
    )
    # show=False leaves the figure open so that it can be saved below.
    sc.pl.embedding(
        adata,
        basis="X_mds",
        color=feature,
        title=title,
        show=False,
        **appearance,
    )

    # Save and release the figure that scanpy just drew (pyplot's current figure).
    current_fig = plt.gcf()
    current_fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(current_fig)


# Colour the same MDS embedding by each clinical annotation and save one PNG per
# feature. Each entry is (obs column, plot title, output path).
annotation_plots = [
    # 1. Tumor Classification
    (
        "diagnoses.classification_of_tumor",
        "MDS Projection: Tumor Classification",
        "/content/MDS_Tumor_Classification.png",
    ),
    # 2. Gender
    (
        "demographic.gender",
        "MDS Projection: Gender",
        "/content/MDS_Gender.png",
    ),
    # 3. Smoking Status
    (
        "exposures.tobacco_smoking_status",
        "MDS Projection: Smoking Status",
        "/content/MDS_Smoking_Status.png",
    ),
]
for feature, title, save_path in annotation_plots:
    plot_mds_feature(feature, title, save_path)

# 4. KMeans Clusters
# Fit three clusters on the MDS coordinates and store the labels as strings
# (presumably so the plot treats them as discrete groups — confirm).
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_mds)
adata.obs["mds_cluster"] = kmeans.labels_.astype(str)
plot_mds_feature(
    "mds_cluster",
    "MDS Projection: KMeans Clusters",
    "/content/MDS_KMeans_Clusters.png"
)

# Confirmation message; the recorded cell output shows this line, but no
# statement in the cell produced it.
print("✅ All plots generated and saved as PNG!")


Shape of data: (572, 19740)
✅ All plots generated and saved as PNG!


In [None]:
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency, kruskal

# Cluster the MDS coordinates into three groups and store the labels (as strings)
# in the sample metadata.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_mds)
adata.obs["mds_cluster"] = kmeans.labels_.astype(str)

# Chi-squared test: cluster vs gender
contingency_table = pd.crosstab(adata.obs["mds_cluster"], adata.obs["demographic.gender"])
chi2, p, _, _ = chi2_contingency(contingency_table)
print(f"Chi-square test p-value for gender: {p:.4f}")

# Chi-squared test: cluster vs smoking status
# A separate statistic name keeps the gender result from being overwritten.
contingency_smoking = pd.crosstab(adata.obs["mds_cluster"], adata.obs["exposures.tobacco_smoking_status"])
chi2_smoking, p_smoking, _, _ = chi2_contingency(contingency_smoking)
print(f"Chi-square p-value for cluster vs smoking status: {p_smoking:.4f}")

# Kruskal-Wallis test: cluster vs age at diagnosis
# One sample of ages per cluster (in sorted cluster order), with missing ages dropped.
age_at_diagnosis = adata.obs["diagnoses.age_at_diagnosis"]
groups = [
    age_at_diagnosis[adata.obs["mds_cluster"] == c].dropna()
    for c in sorted(adata.obs["mds_cluster"].unique())
]

kw_stat, p_age = kruskal(*groups)
print(f"Kruskal-Wallis p-value for cluster vs age at diagnosis: {p_age:.4f}")

Chi-square test p-value for gender: 0.0505
Chi-square p-value for cluster vs smoking status: 0.1878
Kruskal-Wallis p-value for cluster vs age at diagnosis: 0.3010
