In [7]:
import scanpy as sc
from sklearn.metrics import adjusted_rand_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

def preprocess_adata(adata):
    # QC filtering
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata = adata[adata.obs.pct_counts_mt < 5, :]

    # Normalize + log
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # HVGs
    sc.pp.highly_variable_genes(adata, flavor='seurat', n_top_genes=2000)

    # PCA + neighbors + clustering
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adata)
    sc.tl.leiden(adata)

    return adata


# -------------------------
# Load ground truth dataset
# -------------------------
adata_gt = sc.read_h5ad("Data/combined_raw.h5ad")
adata_gt = preprocess_adata(adata_gt)
adata_gt.obs_names_make_unique()

# -------------------------
# Evaluate KNN imputation
# -------------------------
results = []
missing_fractions = [10, 20, 30]
runs = range(1, 11)

for mf in missing_fractions:  
    for run in runs:
        fname = f"Imputed_KNN_h5ad/adata_dropout_mf{mf}_run{run}_knn_imputed.h5ad"
        if not os.path.exists(fname):
            print(f"Skipping {fname} (not found)")
            continue
        
        adata_imp = sc.read_h5ad(fname)
        adata_imp = preprocess_adata(adata_imp)
        adata_imp.obs_names_make_unique()

        # Align cells to ground truth
        common_cells = adata_gt.obs_names.intersection(adata_imp.obs_names)
        if len(common_cells) == 0:
            print(f"No common cells for mf={mf}, run={run}")
            continue

        gt_labels = adata_gt.obs.loc[common_cells, "leiden"]
        imp_labels = adata_imp.obs.loc[common_cells, "leiden"]

        # Compute ARI
        ari = adjusted_rand_score(gt_labels, imp_labels)
        
        results.append({
            "method": "KNN",   # ✅ updated method label
            "missing_fraction": mf/100,  # e.g., 0.1, 0.2, 0.3
            "run": run,
            "ARI": ari  
        })

results_df = pd.DataFrame(results)

# -------------------------
# Visualization
# -------------------------
summary = results_df.groupby(['method','missing_fraction'])['ARI'].agg(['mean','std']).reset_index()
print(summary)

plt.figure(figsize=(8,6))
sns.barplot(data=summary, x='missing_fraction', y='mean', hue='method', capsize=0.1)

# Add SD error bars manually
for i, row in summary.iterrows():
    xpos = row['missing_fraction']*10 - 10 + (0 if row['method']=="KNN" else 0.3)  # spacing
    plt.errorbar(x=xpos, y=row['mean'], yerr=row['std'], fmt='none', c='black', capsize=5)

plt.ylabel("ARI (mean ± SD)")
plt.xlabel("Missing Fraction")
plt.title("Clustering Accuracy after KNN Imputation")
plt.savefig("ARI_Summary_KNN.png", dpi=300)
plt.show()


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  uti

ValueError: `X_pca` does not have enough PCs. Rerun `sc.pp.pca` with adjusted `n_comps`.

In [2]:
import scanpy as sc
from sklearn.metrics import adjusted_rand_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

def preprocess_adata(adata):
    # QC filtering
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata = adata[adata.obs.pct_counts_mt < 5, :]

    # Normalize + log
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # HVGs
    sc.pp.highly_variable_genes(adata, flavor='seurat', n_top_genes=2000)

    # PCA + neighbors + clustering
    sc.tl.pca(adata, svd_solver='arpack', n_comps=min(40, adata.n_vars - 1))
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=min(40, adata.n_vars - 1))
    sc.tl.umap(adata)
    sc.tl.leiden(adata)

    return adata


# -------------------------
# Load ground truth dataset
# -------------------------
adata_gt = sc.read_h5ad("Data/combined_raw.h5ad")
adata_gt = preprocess_adata(adata_gt)
adata_gt.obs_names_make_unique()

# -------------------------
# Evaluate KNN imputation
# -------------------------
results = []
missing_fractions = [10, 20, 30]
runs = range(1, 11)

for mf in missing_fractions:  
    for run in runs:
        fname = f"Imputed_KNN_h5ad/adata_dropout_mf{mf}_run{run}_knn_imputed.h5ad"
        if not os.path.exists(fname):
            print(f"Skipping {fname} (not found)")
            continue
        
        adata_imp = sc.read_h5ad(fname)
        adata_imp = preprocess_adata(adata_imp)
        adata_imp.obs_names_make_unique()

        # Align cells to ground truth
        common_cells = adata_gt.obs_names.intersection(adata_imp.obs_names)
        if len(common_cells) == 0:
            print(f"No common cells for mf={mf}, run={run}")
            continue

        gt_labels = adata_gt.obs.loc[common_cells, "leiden"]
        imp_labels = adata_imp.obs.loc[common_cells, "leiden"]

        # Compute ARI
        ari = adjusted_rand_score(gt_labels, imp_labels)
        
        results.append({
            "method": "KNN",   # ✅ updated method label
            "missing_fraction": mf/100,  # e.g., 0.1, 0.2, 0.3
            "run": run,
            "ARI": ari  
        })

results_df = pd.DataFrame(results)

# -------------------------
# Visualization
# -------------------------
summary = results_df.groupby(['method','missing_fraction'])['ARI'].agg(['mean','std']).reset_index()
print(summary)

plt.figure(figsize=(8,6))
ax = sns.barplot(data=summary, x='missing_fraction', y='mean', hue='method', capsize=0.1)

# Add error bars manually (using bar positions)
for i, row in summary.iterrows():
    bar = ax.patches[i]
    x = bar.get_x() + bar.get_width()/2
    y = row['mean']
    plt.errorbar(x, y, yerr=row['std'], fmt='none', c='black', capsize=5)

plt.ylabel("ARI (mean ± SD)")
plt.xlabel("Missing Fraction")
plt.title("Clustering Accuracy after KNN Imputation")
plt.savefig("ARI_Summary_KNN.png", dpi=300)
plt.show()


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("var")
  uti

ValueError: n_components=40 must be between 1 and min(n_samples, n_features)=30 with svd_solver='arpack'

Processing adata_dropout_mf10_run1.h5ad ...


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


AttributeError: 'csr_matrix' object has no attribute 'A'

In [8]:
import scanpy as sc
from sklearn.metrics import adjusted_rand_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

def preprocess_adata(adata):
    # QC filtering
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata = adata[adata.obs.pct_counts_mt < 5, :]

    # Normalize + log
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # HVGs
    sc.pp.highly_variable_genes(adata, flavor='seurat', n_top_genes=2000)

    # PCA
    n_comps = min(40, adata.n_obs - 1, adata.n_vars - 1)  # handle small datasets
    sc.tl.pca(adata, svd_solver='arpack', n_comps=n_comps)

    # Weighted KNN smoothing (via diffusion maps)
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=n_comps)
    sc.tl.diffmap(adata)
    sc.pp.neighbors(adata, use_rep='X_diffmap')
    sc.tl.umap(adata)
    sc.tl.leiden(adata)

    return adata


# -------------------------
# Load ground truth dataset
# -------------------------
adata_gt = sc.read_h5ad("Data/combined_raw.h5ad")
adata_gt = preprocess_adata(adata_gt)
adata_gt.obs_names_make_unique()

# -------------------------
# Evaluate Weighted KNN imputation
# -------------------------
results = []
missing_fractions = [10, 20, 30]
runs = range(1, 11)

for mf in missing_fractions:  
    for run in runs:
        fname = f"Imputed_WKNN_h5ad/adata_dropout_mf{mf}_run{run}_wknn_imputed.h5ad"
        if not os.path.exists(fname):
            print(f"Skipping {fname} (not found)")
            continue
        
        adata_imp = sc.read_h5ad(fname)
        adata_imp = preprocess_adata(adata_imp)
        adata_imp.obs_names_make_unique()

        # Align cells to ground truth
        common_cells = adata_gt.obs_names.intersection(adata_imp.obs_names)
        if len(common_cells) == 0:
            print(f"No common cells for mf={mf}, run={run}")
            continue

        gt_labels = adata_gt.obs.loc[common_cells, "leiden"]
        imp_labels = adata_imp.obs.loc[common_cells, "leiden"]

        # Compute ARI
        ari = adjusted_rand_score(gt_labels, imp_labels)
        
        results.append({
            "method": "WeightedKNN",
            "missing_fraction": mf/100,  # e.g., 0.1, 0.2, 0.3
            "run": run,
            "ARI": ari  
        })

results_df = pd.DataFrame(results)

# -------------------------
# Visualization
# -------------------------
summary = results_df.groupby(['method','missing_fraction'])['ARI'].agg(['mean','std']).reset_index()
print(summary)

plt.figure(figsize=(8,6))
sns.barplot(data=summary, x='missing_fraction', y='mean', hue='method', capsize=0.1)

# Add SD error bars manually
for i, row in summary.iterrows():
    xpos = row['missing_fraction']*10 - 10 + (0 if row['method']=="WeightedKNN" else 0.3)  # spacing
    plt.errorbar(x=xpos, y=row['mean'], yerr=row['std'], fmt='none', c='black', capsize=5)

plt.ylabel("ARI (mean ± SD)")
plt.xlabel("Missing Fraction")
plt.title("Clustering Accuracy after Weighted KNN Imputation")
plt.savefig("ARI_Summary_WeightedKNN.png", dpi=300)
plt.show()


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")
  view_to_actual(adata)
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("var")


Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run1_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run2_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run3_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run4_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run5_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run6_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run7_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run8_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run9_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf10_run10_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf20_run1_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/adata_dropout_mf20_run2_wknn_imputed.h5ad (not found)
Skipping Imputed_WKNN_h5ad/

KeyError: 'method'