# Notebook to run basic analysis on Cellbender data - 0.01 Full model

**Created by :** Srivalli Kolla

**Created on :** 05 March, 2025

**Modified on :** 05 March, 2025

**University of Würzburg**

Env : scanpy (Python 3.12.2)

# Importing Packages

In [1]:
import anndata as ad
import scanpy as sc
import os
import datetime
import bbknn
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import h5py
from scipy.stats import median_abs_deviation

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set scanpy's logging verbosity and print the installed package versions.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Use 300 dpi both for figures shown inline and for figures saved to disk.
plt.rcParams.update({'figure.dpi': 300, 'savefig.dpi': 300})

# Day_month_year stamp of the current run (e.g. "05_03_25").
timestamp = datetime.datetime.now().strftime("%d_%m_%y")

-----
anndata     0.11.3
scanpy      1.10.4
-----
Cython              3.0.12
PIL                 11.1.0
annoy               NA
asttokens           NA
bbknn               1.6.0
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython              3.0.12
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.12
decorator           5.2.1
executing           2.1.0
h5py                3.13.0
ipykernel           6.29.5
jedi                0.19.2
joblib              1.4.2
kiwisolver          1.4.7
legacy_api_wrap     NA
llvmlite            0.44.0
matplotlib          3.10.1
mpl_toolkits        NA
natsort             8.4.0
numba               0.61.0
numpy               2.1.3
packaging           24.2
pandas              2.2.3
parso               0.8.4
patsy               1.0.1
platformdirs        4.3.6
prompt_toolkit      3.0.50
psutil              7.0.0
pure_eval           0.2.3
pydev_ipython       NA
pydevconsole        NA
pydevd            

# Data loading

In [3]:
# gex_only=False keeps every feature type in the file, not only gene expression.
after_cb_raw = sc.read_10x_h5(
    filename='../data/cellbender_processed_data/0.01_full/0.01_after_cb_filtered.h5',
    gex_only=False,
)
after_cb_raw

reading ../data/cellbender_processed_data/0.01_full/0.01_after_cb_filtered.h5
 (0:00:01)


  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 11525 × 32293
    var: 'gene_ids', 'feature_types', 'genome'

In [4]:
# Display the per-cell annotation table (obs) of the loaded object.
after_cb_raw.obs

TACCCATTCGAACGCT-1
TACTCATCACCAGGAC-1
GCTGCGATCCGCCAGA-1
TGCCGTTCATGAATGC-1
ATCGTTGAGGATCTAT-1
...
GTGCGGTCAATCCGTC-1
GCCAATATCCCCTGAC-1
GTCCTATTCGCGAATC-1
AGTGATCCAAAGGTTC-1
CCACTATTCGCTCTCA-1


In [5]:
# Display the per-feature annotation table (var) of the loaded object.
after_cb_raw.var

Unnamed: 0,gene_ids,feature_types,genome
Xkr4,ENSMUSG00000051951,Gene Expression,
Gm1992,ENSMUSG00000089699,Gene Expression,
Gm19938,ENSMUSG00000102331,Gene Expression,
Gm37381,ENSMUSG00000102343,Gene Expression,
Rp1,ENSMUSG00000025900,Gene Expression,
...,...,...,...
TotalSeqB5,Hash5,Antibody Capture,
TotalSeqB6,Hash6,Antibody Capture,
TotalSeqB7,Hash7,Antibody Capture,
TotalSeqB8,Hash8,Antibody Capture,


In [6]:
# List the distinct feature types present in the feature table.
after_cb_raw.var['feature_types'].unique()

array(['Gene Expression', 'Antibody Capture'], dtype=object)

In [7]:
def X_is_raw(after_cb_raw):
    """Return True when every value stored in ``after_cb_raw.X`` is a whole number.

    The check is done on the individual matrix entries rather than on column
    sums, because fractional values can add up to an integer (0.5 + 0.5) and
    would otherwise be mistaken for raw counts.

    Parameters
    ----------
    after_cb_raw
        Any object exposing the expression matrix as ``.X`` (scipy sparse
        matrix or array-like).

    Returns
    -------
    bool
        True if all entries are integer-valued, False otherwise (including
        when NaN or infinite values are present).
    """
    from scipy import sparse

    X = after_cb_raw.X
    # For sparse input only the explicitly stored entries need checking;
    # the implicit zeros are integers by definition.
    values = X.data if sparse.issparse(X) else np.asarray(X)
    with np.errstate(invalid="ignore"):
        # NaN/inf yield NaN under mod, which compares unequal to 0 -> False.
        return bool(np.all(np.mod(values, 1) == 0))

# Run the raw-count check on the loaded object and print the result.
is_raw = X_is_raw(after_cb_raw)
print(is_raw)

True


# Hashtag check

1. Extract features
2. Extract Hashtags from features

In [8]:
# Work on a copy of the identifier and type columns of the feature table.
features = after_cb_raw.var[['gene_ids', 'feature_types']].copy()

# Hashtag features are the rows typed as "Antibody Capture"; keep their names.
hashtag_features = features.index[
    features["feature_types"].eq("Antibody Capture").to_numpy()
].tolist()
hashtag_features

['TotalSeqB1',
 'TotalSeqB3',
 'TotalSeqB4',
 'TotalSeqB5',
 'TotalSeqB6',
 'TotalSeqB7',
 'TotalSeqB8',
 'TotalSeqB9']

1. Subset the AnnData object to only hashtag counts
2. Convert to a pandas DataFrame
3. Check the head

In [9]:
# Loading warned about duplicated var names; make them unique (in place)
# before the object is subset by feature name in the next cell.
after_cb_raw.var_names_make_unique()

In [10]:
# Independent AnnData holding only the hashtag feature columns.
after_cb_raw_hto = after_cb_raw[:, hashtag_features].copy()

# Dense table of hashtag counts: one row per cell barcode, one column per hashtag.
hto_counts = pd.DataFrame(
    data=after_cb_raw_hto.X.toarray(),
    index=after_cb_raw_hto.obs_names,
    columns=hashtag_features,
)

hto_counts.head()

Unnamed: 0,TotalSeqB1,TotalSeqB3,TotalSeqB4,TotalSeqB5,TotalSeqB6,TotalSeqB7,TotalSeqB8,TotalSeqB9
TACCCATTCGAACGCT-1,0,0,375,1,0,1546,0,0
TACTCATCACCAGGAC-1,0,0,0,0,2,1333,0,0
GCTGCGATCCGCCAGA-1,1,2366,0,0,1,1,0,0
TGCCGTTCATGAATGC-1,1,1,0,215,1,915,0,0
ATCGTTGAGGATCTAT-1,0,0,2628,0,0,0,0,0


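Assign each cell to the hashtag with the highest count and look at the first rows. Note that this rule gives every cell a label, including cells with substantial counts from several hashtags (possible multiplets) and cells with no hashtag counts at all, so it does not by itself identify doublets or negatives.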

In [11]:
# Take the arg-max over the hashtag count columns only. Once the string column
# "Assigned_Hashtag" exists, an idxmax over *all* columns would mix text with
# numbers, so restricting the columns keeps this cell safe to re-run.
hto_counts["Assigned_Hashtag"] = hto_counts[hashtag_features].idxmax(axis=1)
hto_counts.head(10)

Unnamed: 0,TotalSeqB1,TotalSeqB3,TotalSeqB4,TotalSeqB5,TotalSeqB6,TotalSeqB7,TotalSeqB8,TotalSeqB9,Assigned_Hashtag
TACCCATTCGAACGCT-1,0,0,375,1,0,1546,0,0,TotalSeqB7
TACTCATCACCAGGAC-1,0,0,0,0,2,1333,0,0,TotalSeqB7
GCTGCGATCCGCCAGA-1,1,2366,0,0,1,1,0,0,TotalSeqB3
TGCCGTTCATGAATGC-1,1,1,0,215,1,915,0,0,TotalSeqB7
ATCGTTGAGGATCTAT-1,0,0,2628,0,0,0,0,0,TotalSeqB4
AGCCAGCCACCTTAGC-1,2,0,0,350,1,793,0,396,TotalSeqB7
TCTAGCTTCCTCTAGT-1,0,0,3,0,0,633,0,0,TotalSeqB7
ACATGGCTCACTAGAT-1,1,0,0,972,0,2,0,0,TotalSeqB5
TGAGCCACATTAGGTA-1,0,580,145,0,0,1,0,0,TotalSeqB3
CATAGGTCAATCTGGG-1,1,845,605,47,0,1198,0,0,TotalSeqB7


Save the assigned hashtag of each cell in `obs` and the per-cell hashtag count table in `obsm`

In [12]:
# Keep only the hashtag count columns before coercing to numeric. Coercing the
# whole frame would turn the string column "Assigned_Hashtag" into an all-NaN
# column that then gets stored in obsm alongside the real counts.
hto_counts = hto_counts[hashtag_features].apply(pd.to_numeric, errors="coerce")

# Label per cell = hashtag with the highest count; assignment aligns on the
# cell-barcode index shared with the AnnData object.
after_cb_raw.obs["assigned_hashtag"] = hto_counts.idxmax(axis=1)

# Store the numeric cells x hashtags count table with the object.
after_cb_raw.obsm["hto_counts"] = hto_counts
after_cb_raw

AnnData object with n_obs × n_vars = 11525 × 32293
    obs: 'assigned_hashtag'
    var: 'gene_ids', 'feature_types', 'genome'
    obsm: 'hto_counts'

# Data saving

In [13]:
# Output goes into the same 0.01_full directory the input was read from;
# the file name carries the day_month_year stamp of this run.
output_path = (
    "../data/cellbender_processed_data/0.01_full/"
    f"cb_0.01_full_with_hashtags_{timestamp}.h5ad"
)
after_cb_raw.write_h5ad(output_path)

print(f"Updated AnnData object saved at: {output_path}")

Updated AnnData object saved at: ../data/cellbender_processed_data/0.01_full/cb_0.01_full_with_hashtags_05_03_25.h5ad
