In [2]:
import scanpy as sc
import os
import numpy as np
import pandas as pd
import anndata as ad
import celltypist
from celltypist import models
import scrublet
from scipy.io import mmwrite

In [None]:
# Sample identifiers processed by this notebook; each one is used as a
# per-sample subdirectory name when building input and output paths.
sampleList = [
    "P1_Pre_10708",
    "P1_Post_10853",
    "P4_Pre_10973",
    "P4_Post_11389",
    "P7_Pre_11183",
    "P7_Post_11517_redo",
    "P16_Post_11907",
]

In [None]:

# Base directory where processed data is stored
base_dir = "/home/project/processed_data/"

# Dictionary to store AnnData objects by sample ID
adata_dict = {}

# For every sample: load the CellBender-filtered matrix, run scrublet doublet
# detection on it, keep the annotated AnnData in `adata_dict`, and write the
# per-cell doublet score / call to <base_dir>/scrublet/<sample_id>/scrublet_doublets.csv
for sample_id in sampleList:
    file_path = os.path.join(base_dir, "cellbender/", sample_id, "cellbender_output_FPR_0.1_filtered.h5")

    if not os.path.exists(file_path):
        # Missing inputs are reported and skipped; such samples get NO entry in
        # `adata_dict`, so downstream cells must not assume every sample is present.
        print(f"❗ File not found for sample: {sample_id} at {file_path}")
        continue

    print(f"Loading {sample_id} from {file_path}")
    adata = sc.read_10x_h5(file_path)

    out_dir = os.path.join(base_dir, "scrublet/", sample_id)
    print("creating directory: ", out_dir)
    # makedirs(exist_ok=True) instead of mkdir: creates the parent "scrublet/"
    # directory on first use and does not raise when the cell is re-run.
    os.makedirs(out_dir, exist_ok=True)

    print("running scrublet for: ", sample_id)
    # 0.1 is the expected doublet rate used here and may need tuning.
    # NOTE(review): 0.1 appears to be the default of the standalone Scrublet
    # package, while scanpy's wrapper documents its own default — confirm which
    # one is intended. The earlier note said "Daniel set to 0.6" — TODO confirm
    # that value.
    sc.pp.scrublet(adata, expected_doublet_rate=0.1)
    adata_dict[sample_id] = adata

    path_out = os.path.join(out_dir, "scrublet_doublets.csv")
    adata.obs[['doublet_score', 'predicted_doublet']].to_csv(path_out)

The cell below exports each sample's CellBender `.h5` data as barcodes, features, and matrix files for import into Seurat. It is not necessary to do it this way, so this code can be disregarded.

In [4]:
import os
import pandas as pd
from scipy.io import mmwrite

output_base = "/home/project/processed_data/cellbender"

# Write each loaded sample as matrix.mtx / barcodes.tsv / features.tsv under
# <output_base>/<sample_id>/cellbender_formatted_for_seurat
for sample_id in sampleList:
    # Samples whose input file was missing were never added to adata_dict;
    # skip them instead of failing with a KeyError.
    if sample_id not in adata_dict:
        print(f"Skipping {sample_id}: no AnnData loaded for this sample")
        continue

    print(f"Outputting: {sample_id}")

    adata = adata_dict[sample_id]
    output_dir = os.path.join(output_base, sample_id, "cellbender_formatted_for_seurat")
    os.makedirs(output_dir, exist_ok=True)

    # Export the matrix.
    # AnnData stores X as cells x genes, whereas the 10x MEX layout that pairs
    # with features.tsv / barcodes.tsv is features (rows) x barcodes (columns),
    # so the matrix is transposed before writing.
    mmwrite(os.path.join(output_dir, "matrix.mtx"), adata.X.T)

    # Export barcodes (one per line, no header)
    adata.obs_names.to_series().to_csv(
        os.path.join(output_dir, "barcodes.tsv"),
        sep="\t",
        index=False,
        header=False
    )

    # Export features (gene ID, gene name, feature type).
    # Each column falls back to var_names independently, so a 'gene_ids'
    # column is still used when 'gene_names' is absent (and vice versa).
    # Plain arrays are used so pandas does not try to align on the var index.
    fallback_names = adata.var_names.to_numpy()
    if 'gene_ids' in adata.var.columns:
        gene_ids = adata.var['gene_ids'].to_numpy()
    else:
        gene_ids = fallback_names
    if 'gene_names' in adata.var.columns:
        gene_names = adata.var['gene_names'].to_numpy()
    else:
        gene_names = fallback_names

    features_df = pd.DataFrame({
        "gene_id": gene_ids,
        "gene_name": gene_names,
        "feature_type": "Gene Expression"
    })

    features_df.to_csv(
        os.path.join(output_dir, "features.tsv"),
        sep="\t",
        index=False,
        header=False
    )


Outputting: P7_Post_11517
Outputting: P16_Post_11907
