# Notebook to run basic analysis on raw data before CellBender

**Created by :** Srivalli Kolla

**Created on :** 17 February, 2025

**Modified on :** 17 February, 2025

**University of Würzburg**

Env : scanpy (Python 3.12.2)

# Importing Packages

In [35]:
import anndata as ad
import scanpy as sc
import os
import datetime
import bbknn
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import median_abs_deviation

In [36]:
# Set scanpy logging verbosity to 3 and print the installed package versions.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults: 300 dpi on screen and on save, RdPu colour map, SVG output.
sc.settings.set_figure_params(
    dpi=300,
    dpi_save=300,
    color_map='RdPu',
    vector_friendly=True,
    format='svg',
)

# day_month_year stamp, used later to tag the output file name.
timestamp = format(datetime.datetime.now(), "%d_%m_%y")

-----
anndata     0.11.3
scanpy      1.10.4
-----
Cython              3.0.11
PIL                 11.1.0
annoy               NA
asttokens           NA
bbknn               1.6.0
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython              3.0.11
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.12
decorator           5.1.1
executing           2.1.0
h5py                3.12.1
ipykernel           6.29.5
jedi                0.19.2
joblib              1.4.2
kiwisolver          1.4.8
legacy_api_wrap     NA
llvmlite            0.44.0
matplotlib          3.10.0
matplotlib_inline   0.1.7
mpl_toolkits        NA
natsort             8.4.0
numba               0.61.0
numpy               2.1.3
packaging           24.2
pandas              2.2.3
parso               0.8.4
patsy               1.0.1
pickleshare         0.7.5
platformdirs        4.3.6
prompt_toolkit      3.0.50
psutil              6.1.1
pure_eval           0.2.3
pydev_ipytho

# Data loading

In [37]:
# Read the 10x filtered feature-barcode matrix, naming features by gene symbol.
# gex_only=False keeps the non-gene-expression features (the Antibody Capture
# hashtag rows) alongside the genes.
before_cb_raw = sc.read_10x_mtx(
    '../data/filtered_feature_bc_matrix',
    var_names='gene_symbols',
    gex_only=False,
)
before_cb_raw

--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.


AnnData object with n_obs × n_vars = 41918 × 32293
    var: 'gene_ids', 'feature_types'

In [38]:
# Display the per-barcode (obs) table; at this point it holds only the barcode index.
before_cb_raw.obs

AAACCAAAGCCAGTGT-1
AAACCAAAGGCGTCCA-1
AAACCAAAGGTTAGCC-1
AAACCAAAGGTTAGTT-1
AAACCAAAGTACCACA-1
...
TGTGTTGAGCCTATCT-1
TGTGTTGAGCTAACCA-1
TGTGTTGAGGAGGACC-1
TGTGTTGAGTACGCAC-1
TGTGTTGAGTCGCTCC-1


In [39]:
# Display the per-feature (var) table: gene IDs and feature types.
before_cb_raw.var

Unnamed: 0,gene_ids,feature_types
Xkr4,ENSMUSG00000051951,Gene Expression
Gm1992,ENSMUSG00000089699,Gene Expression
Gm19938,ENSMUSG00000102331,Gene Expression
Gm37381,ENSMUSG00000102343,Gene Expression
Rp1,ENSMUSG00000025900,Gene Expression
...,...,...
TotalSeqB5,Hash5,Antibody Capture
TotalSeqB6,Hash6,Antibody Capture
TotalSeqB7,Hash7,Antibody Capture
TotalSeqB8,Hash8,Antibody Capture


In [40]:
def X_is_raw(before_cb_raw):
    """Return True when every entry of ``before_cb_raw.X`` is a whole number.

    Checking the individual entries (rather than the per-column sums) avoids
    false positives: fractional values such as 0.5 + 0.5 add up to an integer
    column sum even though the matrix is clearly not raw counts.

    Parameters
    ----------
    before_cb_raw
        Object exposing a ``.X`` matrix, either a dense array or a
        scipy sparse matrix.

    Returns
    -------
    bool
        True if all entries are integer-valued (an empty matrix counts as
        True); False if any entry is fractional, NaN or infinite.
    """
    from scipy.sparse import issparse

    matrix = before_cb_raw.X
    # For sparse input only the explicitly stored entries need checking;
    # the implicit zeros are integers by definition.
    values = matrix.data if issparse(matrix) else np.asarray(matrix)
    # NaN / inf give a NaN remainder, which never equals 0, so they count as non-raw.
    with np.errstate(invalid="ignore"):
        return bool(np.all(np.mod(values, 1) == 0))

# Sanity check on the loaded matrix: print whether X passes the raw-count test.
is_raw = X_is_raw(before_cb_raw)
print(is_raw)

True


# Hashtag check

1. Extract features
2. Extract Hashtags from features

In [41]:
# Load the feature table shipped with the matrix (tab-separated, no header row).
features = pd.read_csv(
    "../data/filtered_feature_bc_matrix/features.tsv.gz",
    sep="\t",
    header=None,
)
features.columns = ['gene_symbols', "gene_ids", "feature_types"]
# Re-index by the AnnData variable names so selections map straight onto before_cb_raw.
features.index = before_cb_raw.var_names

# Hashtag features are the rows whose feature type is "Antibody Capture".
hashtag_features = features.loc[features["feature_types"].eq("Antibody Capture")].index.tolist()
hashtag_features

['TotalSeqB1',
 'TotalSeqB3',
 'TotalSeqB4',
 'TotalSeqB5',
 'TotalSeqB6',
 'TotalSeqB7',
 'TotalSeqB8',
 'TotalSeqB9']

1. Subset the AnnData object to only hashtag counts
2. Convert to a pandas DataFrame
3. Check the head

In [42]:
# Independent copy of the AnnData restricted to the hashtag features.
before_cb_raw_hto = before_cb_raw[:, hashtag_features].copy()

# Densify the sparse hashtag matrix into a barcodes x hashtags DataFrame.
hto_counts = pd.DataFrame(
    data=before_cb_raw_hto.X.toarray(),
    index=before_cb_raw_hto.obs_names,
    columns=hashtag_features,
)

hto_counts.head()

Unnamed: 0,TotalSeqB1,TotalSeqB3,TotalSeqB4,TotalSeqB5,TotalSeqB6,TotalSeqB7,TotalSeqB8,TotalSeqB9
AAACCAAAGCCAGTGT-1,168.0,422.0,401.0,115.0,181.0,124.0,214.0,335.0
AAACCAAAGGCGTCCA-1,196.0,486.0,238.0,145.0,210.0,155.0,255.0,243.0
AAACCAAAGGTTAGCC-1,185.0,615.0,113.0,158.0,236.0,122.0,243.0,189.0
AAACCAAAGGTTAGTT-1,153.0,415.0,95.0,248.0,221.0,128.0,211.0,187.0
AAACCAAAGTACCACA-1,181.0,433.0,89.0,132.0,207.0,175.0,231.0,207.0


Assign each cell to the hashtag with the highest count and look at the head

In [43]:
# Label each barcode with the hashtag that has the largest count.
# idxmax is restricted to the hashtag columns so the cell can be re-run safely:
# otherwise a second run would feed the existing string "Assigned_Hashtag"
# column into the comparison.
# NOTE(review): a plain argmax also assigns a label to doublets and to barcodes
# carrying only background counts — confirm a dedicated demultiplexing step
# follows downstream.
hto_counts["Assigned_Hashtag"] = hto_counts[hashtag_features].idxmax(axis=1)
hto_counts.head(10)

Unnamed: 0,TotalSeqB1,TotalSeqB3,TotalSeqB4,TotalSeqB5,TotalSeqB6,TotalSeqB7,TotalSeqB8,TotalSeqB9,Assigned_Hashtag
AAACCAAAGCCAGTGT-1,168.0,422.0,401.0,115.0,181.0,124.0,214.0,335.0,TotalSeqB3
AAACCAAAGGCGTCCA-1,196.0,486.0,238.0,145.0,210.0,155.0,255.0,243.0,TotalSeqB3
AAACCAAAGGTTAGCC-1,185.0,615.0,113.0,158.0,236.0,122.0,243.0,189.0,TotalSeqB3
AAACCAAAGGTTAGTT-1,153.0,415.0,95.0,248.0,221.0,128.0,211.0,187.0,TotalSeqB3
AAACCAAAGTACCACA-1,181.0,433.0,89.0,132.0,207.0,175.0,231.0,207.0,TotalSeqB3
AAACCAAAGTAGCCGT-1,227.0,441.0,154.0,120.0,194.0,123.0,228.0,189.0,TotalSeqB3
AAACCAAAGTAGGCAG-1,167.0,377.0,72.0,227.0,192.0,123.0,228.0,190.0,TotalSeqB3
AAACCAAAGTCATGGC-1,142.0,525.0,89.0,121.0,539.0,152.0,272.0,204.0,TotalSeqB6
AAACCAAAGTCGAAGG-1,196.0,722.0,166.0,396.0,232.0,190.0,354.0,268.0,TotalSeqB3
AAACCAAAGTTAGGCC-1,191.0,527.0,362.0,150.0,251.0,161.0,286.0,248.0,TotalSeqB3


Save the assigned hashtag of each cell in `obs` and the per-hashtag count table in `obsm`

In [44]:
# Keep only the numeric count columns. The string label column would be
# coerced to an all-NaN column by pd.to_numeric(errors="coerce") and then be
# stored as junk inside obsm["hto_counts"], so drop it first
# (errors="ignore" makes this a no-op when the column is absent).
hto_counts = hto_counts.drop(columns=["Assigned_Hashtag"], errors="ignore")
hto_counts = hto_counts.apply(pd.to_numeric, errors="coerce")

# Per-cell label: the hashtag with the highest count.
before_cb_raw.obs["assigned_hashtag"] = hto_counts.idxmax(axis=1)
# Barcodes x hashtags count table, aligned on the barcode index.
before_cb_raw.obsm["hto_counts"] = hto_counts
before_cb_raw

AnnData object with n_obs × n_vars = 41918 × 32293
    obs: 'assigned_hashtag'
    var: 'gene_ids', 'feature_types'
    obsm: 'hto_counts'

# Data saving

In [45]:
# Write the annotated object to a date-stamped .h5ad file and report its location.
output_path = f"../data/before_cb_raw_with_hashtags_{timestamp}.h5ad"
before_cb_raw.write_h5ad(output_path)

print(f"Updated AnnData object saved at: {output_path}")

Updated AnnData object saved at: ../data/before_cb_raw_with_hashtags_17_02_25.h5ad
