# Notebook to separate multinucleated data based on hashtags

**Created by :** Srivalli Kolla

**Created on :** 06 March, 2025

**Modified on :** 06 March, 2025

**University of Würzburg**

Env : scanpy (Python 3.12.2)

# Importing Packages

In [8]:
import scanpy as sc
import pandas as pd
import datetime
import os

In [9]:
# Set scanpy's logging verbosity to level 3 and print the package versions
# of the current environment for provenance.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Run date formatted as DD_MM_YY.
timestamp = f"{datetime.datetime.now():%d_%m_%y}"

-----
anndata     0.11.3
scanpy      1.10.4
-----
Cython              3.0.12
PIL                 11.1.0
asttokens           NA
colorama            0.4.6
comm                0.2.2
cycler              0.12.1
cython              3.0.12
cython_runtime      NA
dateutil            2.9.0.post0
debugpy             1.8.12
decorator           5.2.1
executing           2.1.0
h5py                3.13.0
ipykernel           6.29.5
jedi                0.19.2
joblib              1.4.2
kiwisolver          1.4.7
legacy_api_wrap     NA
llvmlite            0.44.0
matplotlib          3.10.1
mpl_toolkits        NA
natsort             8.4.0
numba               0.61.0
numpy               2.1.3
packaging           24.2
pandas              2.2.3
parso               0.8.4
platformdirs        4.3.6
prompt_toolkit      3.0.50
psutil              7.0.0
pure_eval           0.2.3
pydev_ipython       NA
pydevconsole        NA
pydevd              3.2.3
pydevd_file_utils   NA
pydevd_plugins      NA
pydevd_tracing      N

# Importing files

In [10]:
# Read the input .h5ad file with scanpy.
# NOTE(review): the file name suggests CellBender-processed counts that already
# carry a hashtag assignment — confirm against the upstream notebook.
file_path = '../data/cellbender_processed_data/0.01_full/cb_0.01_full_with_hashtags_05_03_25.h5ad'
raw_adata = sc.read_h5ad(file_path)

In [11]:
# Display the per-cell metadata table of the loaded object.
raw_adata.obs

Unnamed: 0,assigned_hashtag
TACCCATTCGAACGCT-1,TotalSeqB7
TACTCATCACCAGGAC-1,TotalSeqB7
GCTGCGATCCGCCAGA-1,TotalSeqB3
TGCCGTTCATGAATGC-1,TotalSeqB7
ATCGTTGAGGATCTAT-1,TotalSeqB4
...,...
GTGCGGTCAATCCGTC-1,TotalSeqB3
GCCAATATCCCCTGAC-1,TotalSeqB4
GTCCTATTCGCGAATC-1,TotalSeqB1
AGTGATCCAAAGGTTC-1,TotalSeqB6


In [12]:
# List the distinct values in the 'assigned_hashtag' column of the loaded object.
raw_adata.obs['assigned_hashtag'].unique()

['TotalSeqB7', 'TotalSeqB3', 'TotalSeqB4', 'TotalSeqB5', 'TotalSeqB8', 'TotalSeqB1', 'TotalSeqB9', 'TotalSeqB6']
Categories (8, object): ['TotalSeqB1', 'TotalSeqB3', 'TotalSeqB4', 'TotalSeqB5', 'TotalSeqB6', 'TotalSeqB7', 'TotalSeqB8', 'TotalSeqB9']

In [13]:
# Display the per-feature metadata table of the loaded object.
raw_adata.var

Unnamed: 0,gene_ids,feature_types,genome
Xkr4,ENSMUSG00000051951,Gene Expression,
Gm1992,ENSMUSG00000089699,Gene Expression,
Gm19938,ENSMUSG00000102331,Gene Expression,
Gm37381,ENSMUSG00000102343,Gene Expression,
Rp1,ENSMUSG00000025900,Gene Expression,
...,...,...,...
TotalSeqB5,Hash5,Antibody Capture,
TotalSeqB6,Hash6,Antibody Capture,
TotalSeqB7,Hash7,Antibody Capture,
TotalSeqB8,Hash8,Antibody Capture,


# Features check

1. Get the raw data as a DataFrame
2. Keep only HTO features

In [14]:
# Select the hashtag (HTO) features: any feature whose name contains "totalseq"
# (compared case-insensitively, so names such as "TotalSeqB1" actually match) or
# whose feature type is "Antibody Capture". Built as one vectorised mask instead
# of a separate .loc lookup for every feature.
hto_mask = (
    raw_adata.var_names.str.lower().str.contains("totalseq", regex=False, na=False)
    | (raw_adata.var["feature_types"] == "Antibody Capture").to_numpy()
)
hto_features = raw_adata.var_names[hto_mask].tolist()

# Densify only the HTO columns; converting the full matrix first would allocate
# a dense cells x all-features array just to discard most of it.
raw_df = pd.DataFrame(
    raw_adata[:, hto_mask].X.toarray(),
    index=raw_adata.obs.index,
    columns=raw_adata.var_names[hto_mask],
)

print("Raw Data Head:")
print(raw_df.head())

# Not used by the cells shown here; kept so any later cell relying on it still runs.
hto_counts_list = []

Raw Data Head:
                    TotalSeqB1  TotalSeqB3  TotalSeqB4  TotalSeqB5  \
TACCCATTCGAACGCT-1           0           0         375           1   
TACTCATCACCAGGAC-1           0           0           0           0   
GCTGCGATCCGCCAGA-1           1        2366           0           0   
TGCCGTTCATGAATGC-1           1           1           0         215   
ATCGTTGAGGATCTAT-1           0           0        2628           0   

                    TotalSeqB6  TotalSeqB7  TotalSeqB8  TotalSeqB9  
TACCCATTCGAACGCT-1           0        1546           0           0  
TACTCATCACCAGGAC-1           2        1333           0           0  
GCTGCGATCCGCCAGA-1           1           1           0           0  
TGCCGTTCATGAATGC-1           1         915           0           0  
ATCGTTGAGGATCTAT-1           0           0           0           0  


# Hashtag Assignment

1. Initialize a list to store the assigned hashtag for each cell
2. Set a threshold for HTO detection
3. Classify cells based on their highest HTO expression
4. Check if the row has any non-zero values before applying idxmax
5. Find the hashtag with the highest count
6. If the max count is above the threshold, classify this cell to that hashtag
7. Add the classification results to `raw_adata.obs`

In [15]:
# Minimum count the top hashtag must exceed for a cell to be assigned to it.
threshold = 10

# Highest HTO count per cell and the hashtag holding it, computed for all cells
# at once instead of looping over rows. On ties idxmax returns the first column.
max_counts = raw_df.max(axis=1)
top_hashtag = raw_df.idxmax(axis=1)

# A cell is assigned only if it has any HTO signal at all and its top count is
# strictly above the threshold; everything else is labelled as undetected.
is_assigned = (max_counts != 0) & (max_counts > threshold)
nuclei_groups = top_hashtag.where(is_assigned, 'No Hashtag Detected').tolist()

# Stored as a plain list so the assignment is positional, one label per row of raw_df.
raw_adata.obs['assigned_hashtag'] = nuclei_groups
assigned_hashtags = raw_adata.obs['assigned_hashtag'].unique()
assigned_hashtags

array(['TotalSeqB7', 'TotalSeqB3', 'TotalSeqB4', 'TotalSeqB5',
       'TotalSeqB8', 'TotalSeqB1', 'TotalSeqB9', 'TotalSeqB6',
       'No Hashtag Detected'], dtype=object)

# Writing files

1. Loop through each assigned hashtag and inspect the data
2. Subset the data for the current hashtag group
3. Print contents
4. Sanitize the hashtag to make it a valid file name
5. Create output folder and file

In [16]:
# Write one .h5ad file per assigned hashtag label.
output_dir = '../data/processed_by_hashtag'
# Create the output folder up front so the writes cannot fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)

for hashtag in assigned_hashtags:

    # .copy() materialises the subset as an independent object instead of a view.
    # NOTE(review): the recorded output shows a warning fragment (`df[key] = c`) on
    # every write, presumably anndata's implicit-modification warning for views.
    subset_adata = raw_adata[raw_adata.obs['assigned_hashtag'] == hashtag].copy()

    print(f"Number of cells in {hashtag}: {subset_adata.shape[0]}")

    # Replace characters that are awkward in file names with underscores.
    hashtag_sanitized = hashtag.replace(" ", "_").replace("/", "_").replace(":", "_").replace(",", "_")
    output_file = os.path.join(output_dir, f'{hashtag_sanitized}_subset_{timestamp}.h5ad')

    subset_adata.write(output_file)
    print(f'Saved {hashtag} subset to {output_file}')

Number of cells in TotalSeqB7: 1257


  df[key] = c


Saved TotalSeqB7 subset to ../data/processed_by_hashtag/TotalSeqB7_subset_06_03_25.h5ad
Number of cells in TotalSeqB3: 1590


  df[key] = c


Saved TotalSeqB3 subset to ../data/processed_by_hashtag/TotalSeqB3_subset_06_03_25.h5ad
Number of cells in TotalSeqB4: 3151


  df[key] = c


Saved TotalSeqB4 subset to ../data/processed_by_hashtag/TotalSeqB4_subset_06_03_25.h5ad
Number of cells in TotalSeqB5: 1950


  df[key] = c


Saved TotalSeqB5 subset to ../data/processed_by_hashtag/TotalSeqB5_subset_06_03_25.h5ad
Number of cells in TotalSeqB8: 409


  df[key] = c


Saved TotalSeqB8 subset to ../data/processed_by_hashtag/TotalSeqB8_subset_06_03_25.h5ad
Number of cells in TotalSeqB1: 417


  df[key] = c


Saved TotalSeqB1 subset to ../data/processed_by_hashtag/TotalSeqB1_subset_06_03_25.h5ad
Number of cells in TotalSeqB9: 473


  df[key] = c


Saved TotalSeqB9 subset to ../data/processed_by_hashtag/TotalSeqB9_subset_06_03_25.h5ad
Number of cells in TotalSeqB6: 367


  df[key] = c


Saved TotalSeqB6 subset to ../data/processed_by_hashtag/TotalSeqB6_subset_06_03_25.h5ad
Number of cells in No Hashtag Detected: 1911


  df[key] = c


Saved No Hashtag Detected subset to ../data/processed_by_hashtag/No_Hashtag_Detected_subset_06_03_25.h5ad


# Checking no hashtag data

In [20]:
# Re-load the 'No Hashtag Detected' subset. The path is built from this run's
# `timestamp` rather than a hardcoded date, so a fresh Restart & Run All reads
# the file written in the same run instead of a stale or missing one.
file_path = os.path.join('../data/processed_by_hashtag', f'No_Hashtag_Detected_subset_{timestamp}.h5ad')
no_hashtag_adata = sc.read_h5ad(file_path)

# Rows of the metadata table labelled as undetected.
no_hashtag_cells = no_hashtag_adata.obs[no_hashtag_adata.obs['assigned_hashtag'] == 'No Hashtag Detected']

# HTO features: name contains "totalseq" (case-insensitive, so "TotalSeqB1" matches)
# or feature type is "Antibody Capture"; one vectorised mask instead of per-feature .loc.
hto_mask = (
    no_hashtag_adata.var_names.str.lower().str.contains("totalseq", regex=False, na=False)
    | (no_hashtag_adata.var["feature_types"] == "Antibody Capture").to_numpy()
)
hto_features = no_hashtag_adata.var_names[hto_mask].tolist()

# Dense HTO count table for the undetected cells only.
hto_counts_no_hashtag = no_hashtag_adata[no_hashtag_cells.index, hto_features].X.toarray()
hto_counts_no_hashtag_df = pd.DataFrame(hto_counts_no_hashtag, index=no_hashtag_cells.index, columns=hto_features)

print("HTO Counts for 'No Hashtag Detected' Cells:")
print(hto_counts_no_hashtag_df.head())

HTO Counts for 'No Hashtag Detected' Cells:
                    TotalSeqB1  TotalSeqB3  TotalSeqB4  TotalSeqB5  \
CCTACCGCAACCCGAA-1           0           2           0           0   
ACTCACCCAATACTGA-1           0           2           0           1   
GTACCAGCATTGCACC-1           0           1           0           0   
CGTCCAAGTGATTGCT-1           0           2           0           4   
AAACTCACATGAGTGT-1           0           0           0           3   

                    TotalSeqB6  TotalSeqB7  TotalSeqB8  TotalSeqB9  
CCTACCGCAACCCGAA-1           0           0           0           0  
ACTCACCCAATACTGA-1           0           0           0           0  
GTACCAGCATTGCACC-1           0           0           0           0  
CGTCCAAGTGATTGCT-1           0           0           0           0  
AAACTCACATGAGTGT-1           0           0           0           0  
