In [None]:
!pip install scanpy anndata matplotlib scikit-learn



In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency, kruskal


# Load the AnnData object from disk.
# NOTE(review): the file name suggests the values were z-scaled upstream — confirm.
adata = sc.read_h5ad("../../data/z-scaled.h5ad")

# Print the dimensions of the loaded object as a quick sanity check.
print(f"Shape of data: {adata.shape}")


# Embed the data matrix into two dimensions with MDS. random_state is fixed so
# the layout is reproducible; n_init and max_iter bound the optimisation effort.
# NOTE(review): adata.X is passed to MDS as-is, so it presumably is a dense
# array — confirm.
X = adata.X
mds = MDS(n_components=2, random_state=42, n_init=4, max_iter=300)
X_mds = mds.fit_transform(X)
# Store the coordinates on the AnnData object under the key the plotting
# helper uses as its `basis`.
adata.obsm["X_mds"] = X_mds


def plot_mds_feature(feature, title, save_path, data=None):
    """Colour the MDS embedding by one annotation and save the figure to disk.

    Parameters
    ----------
    feature : str
        Key handed to scanpy as ``color``.
    title : str
        Title drawn on the panel.
    save_path : str or path-like
        Destination of the image file (written at 300 dpi, tight bounding box).
    data : AnnData, optional
        Object carrying the ``"X_mds"`` embedding. When omitted, the
        notebook-level ``adata`` is used, so existing three-argument calls
        behave as before.

    Returns
    -------
    None
        The figure is saved and then closed; nothing is displayed.
    """
    if data is None:
        data = adata

    axes = sc.pl.embedding(
        data,
        basis="X_mds",
        color=feature,
        title=title,
        size=20,                    # point size
        legend_loc="right margin",  # keep the legend outside the panel
        legend_fontsize=8,
        frameon=False,              # no frame around the panel
        show=False,                 # return the Axes instead of displaying
    )

    # With show=False scanpy returns an Axes, or a list of Axes when several
    # panels are drawn. Save and close exactly that figure instead of relying
    # on pyplot's notion of the "current" figure.
    first_ax = axes[0] if isinstance(axes, list) else axes
    fig = first_ax.figure
    fig.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close(fig)


# Panels 1-3: colour the embedding by clinical annotations.
# Each entry is (obs column, panel title, output file name).
metadata_panels = [
    (
        "diagnoses.classification_of_tumor",
        "MDS Projection: Tumor Classification",
        "MDS_Tumor_Classification.png",
    ),
    (
        "demographic.gender",
        "MDS Projection: Gender",
        "MDS_Gender.png",
    ),
    (
        "exposures.tobacco_smoking_status",
        "MDS Projection: Smoking Status",
        "MDS_Smoking_Status.png",
    ),
]
for obs_column, panel_title, png_name in metadata_panels:
    plot_mds_feature(obs_column, panel_title, png_name)

# Panel 4: partition the 2-D coordinates into three KMeans clusters and colour
# the embedding by cluster label (stored as strings in adata.obs).
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_mds)
adata.obs["mds_cluster"] = kmeans.labels_.astype(str)
plot_mds_feature(
    "mds_cluster",
    "MDS Projection: KMeans Clusters",
    "MDS_KMeans_Clusters.png",
)


Shape of data: (572, 19740)


  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


In [5]:
from sklearn.cluster import KMeans
from scipy.stats import chi2_contingency, kruskal

# Cluster the 2-D MDS coordinates. This repeats the fit from the plotting cell
# (same input, same seed), so the labels used below do not depend on whether
# that part of the earlier cell was executed.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — consider pinning it wherever KMeans is fitted.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X_mds)
adata.obs["mds_cluster"] = kmeans.labels_.astype(str)


def _chi2_across_clusters(obs, column, label):
    """Chi-squared test of independence between ``mds_cluster`` and ``column``.

    Parameters
    ----------
    obs : pandas.DataFrame
        Table holding an ``"mds_cluster"`` column and ``column``.
    column : str
        Name of the categorical column to cross-tabulate against the clusters.
    label : str
        Human-readable name of the variable, used in the printed report.

    Returns
    -------
    tuple
        ``(statistic, p_value, contingency_table)``.

    Prints the statistic and p-value. A caveat is also printed when any
    expected cell count is below 5, because the chi-squared approximation is
    then unreliable.
    """
    table = pd.crosstab(obs["mds_cluster"], obs[column])
    statistic, p_value, _, expected = chi2_contingency(table)
    print(f"Chi-squared test for {label} distribution across clusters:")
    print(f"Chi2 statistic: {statistic:.4f}, p-value: {p_value:.4f}")
    if (expected < 5).any():
        print(
            "Note: some expected counts are below 5; "
            "the chi-squared approximation may be unreliable."
        )
    return statistic, p_value, table


# Cluster vs gender. The contingency table is kept for inspection.
chi2, p, contingency_table = _chi2_across_clusters(
    adata.obs, "demographic.gender", "gender"
)

# Cluster vs smoking status.
chi2, p_smoking, contingency_smoking = _chi2_across_clusters(
    adata.obs, "exposures.tobacco_smoking_status", "smoking status"
)

# Kruskal-Wallis test: cluster vs age at diagnosis.
# One sample of non-missing ages per cluster, in sorted cluster order.
groups = [
    adata.obs.loc[
        adata.obs["mds_cluster"] == cluster_id, "diagnoses.age_at_diagnosis"
    ].dropna()
    for cluster_id in sorted(adata.obs["mds_cluster"].unique())
]

kw_stat, p_age = kruskal(*groups)
print("Kruskal-Wallis test for age distribution across clusters:")
print(f"Kruskal-Wallis statistic: {kw_stat:.4f}, p-value: {p_age:.4f}")

Chi-squared test for gender distribution across clusters:
Chi2 statistic: 5.9728, p-value: 0.0505
Chi-squared test for smoking status distribution across clusters:
Chi2 statistic: 16.0751, p-value: 0.1878
Kruskal-Wallis test for age distribution across clusters:
Kruskal-Wallis statistic: 2.4010, p-value: 0.3010
