### Notebook for the hashtags demultiplexing of Fetal Gut data from Fawkner-Corbett study
- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute for System Immunology
- **Date:** 12th March 2024

### Import packages

In [55]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import anndata

from scipy import sparse

### Set up the cells

In [56]:
%matplotlib inline

In [57]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 180, color_map = 'magma_r', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.9.2
scanpy      1.9.5
-----
PIL                         10.0.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.1.0
attrs                       23.1.0
babel                       2.13.0
backcall                    0.2.0
certifi                     2023.07.22
cffi                        1.16.0
charset_normalizer          3.3.0
colorama                    0.4.6
comm                        0.1.4
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
debugpy                     1.8.0
decorator                   5.1.1
defusedxml                  0.7.1
executing                   2.0.0
fastjsonschema              NA
fqdn                        NA
h5py                        3.9.0
idna                        3.4
igraph                      0.11.2
ipykernel                   6.25.2
ipywidgets                  8.1.1
isoduration                 NA
jedi   

In [58]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

+ Unzip data

In [None]:
import os
import tarfile

def unzip_tar_gz_files(folder_path):
    # Iterate over all files in the folder
    for file_name in os.listdir(folder_path):
        # Check if the file is a .tar.gz file
        if file_name.endswith('.tar.gz'):
            # Construct the path to the file
            file_path = os.path.join(folder_path, file_name)
            # Remove the .tar.gz extension to get the directory name
            dir_name = file_name[:-7]
            # Construct the path for the new directory
            dir_path = os.path.join(folder_path, dir_name)
            # Create the directory
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            # Open the .tar.gz file
            with tarfile.open(file_path, 'r:gz') as tar:
                # Extract its contents into the directory
                tar.extractall(path=dir_path)
            print(f'Extracted {file_name} into {dir_path}')

In [None]:
folder_path = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2' 
unzip_tar_gz_files(folder_path)

In [None]:
import gzip
import shutil

In [None]:
def unzip_and_delete(file_path):
    """Unzip a .gz file and then delete the original .gz file."""
    with gzip.open(file_path, 'rb') as f_in:
        with open(file_path[:-3], 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(file_path)

In [None]:
def process_folders(list_file_path, base_path):
    """Process each folder listed in the provided file, given a base path."""
    with open(list_file_path, 'r') as file:
        for line in file:
            relative_folder_path = line.strip()
            # Construct the full folder path
            full_folder_path = os.path.join(base_path, relative_folder_path)
            
            # Navigate two levels deep, assuming there's only one folder at each level
            try:
                first_level = os.listdir(full_folder_path)[0]
                second_level_path = os.path.join(full_folder_path, first_level)
                if os.path.isdir(second_level_path):
                    second_level = os.listdir(second_level_path)[0]
                    final_folder_path = os.path.join(second_level_path, second_level)
                    
                    # Check and unzip files if they exist
                    for file_name in ['barcodes.tsv.gz', 'features.tsv.gz', 'matrix.mtx.gz']:
                        file_path = os.path.join(final_folder_path, file_name)
                        if os.path.exists(file_path):
                            unzip_and_delete(file_path)
                            print(f"Unzipped and deleted {file_path}")
                        else:
                            print(f"File {file_name} not found in {final_folder_path}")
            except Exception as e:
                print(f"Error processing {full_folder_path}: {e}")

In [None]:
# Base path where the folders from files_1.txt are located
base_path = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2'

# Path to the file containing the list of folders
list_file_path = '/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/files_1.txt'

# Process the folders
process_folders(list_file_path, base_path)

### Process HTO data

+ Samples GSM4808349_HTO1

In [59]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808349_HTO1/HTO1/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808349_HTO1/HTO1/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808349_HTO1/HTO1/umi_count/matrix.mtx')

Matrix has one extra row filled with zeros, it is easier to delete it from ready anndata object than from matrix. Therefore, we need to modify features firstly, so we could create anndata object. We are adding last row to features with value 'extra'

In [60]:
features.loc[9] = ['extra']

+ create anndata object

In [61]:
adata49 = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)



+ Delete extra row from matrix

In [62]:
# Calculate the total counts for each gene (var)
total_counts = np.array(adata49.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata49.var['total_counts'] = total_counts

In [63]:
# if adata.var['total_counts'] is equal zero then remove the row
adata49 = adata49[:, adata49.var['total_counts'] > 0]

  if not is_categorical_dtype(df_full[k]):


In [64]:
adata49.obs.index = adata49.obs[0]

adata49.obs.index.name = 'barcodes'

+ Calculate HTO expression to find the sample with scanpy

In [71]:
adata49.var[['HTO_tag', 'HTO_sequence']] = adata49.var[0].str.split('-', expand=True)

adata49.var.index = adata49.var['HTO_tag']

adata49.var['Tag_type'] = 'TotalSeq-tag'

In [72]:
adata49.var

Unnamed: 0_level_0,0,total_counts,HTO_tag,HTO_sequence,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HTO1,HTO1-GTCAACTCTTTAGCG,10393083.0,HTO1,GTCAACTCTTTAGCG,TotalSeq-tag
HTO2,HTO2-TGATGGCCTATTGGG,37856464.0,HTO2,TGATGGCCTATTGGG,TotalSeq-tag
HTO3,HTO3-TTCCGCCTCTCTTTG,3438823.0,HTO3,TTCCGCCTCTCTTTG,TotalSeq-tag
HTO4,HTO4-AGTAAGTTCAGCGTA,4704272.0,HTO4,AGTAAGTTCAGCGTA,TotalSeq-tag
HTO5,HTO5-AAGTATCGTTTCGCA,1726293.0,HTO5,AAGTATCGTTTCGCA,TotalSeq-tag
HTO6,HTO6-GGTTGCCAGATGTCA,7342014.0,HTO6,GGTTGCCAGATGTCA,TotalSeq-tag
HTO7,HTO7-TGTCTTTCCTGCCAG,2151070.0,HTO7,TGTCTTTCCTGCCAG,TotalSeq-tag
HTO8,HTO8-CTCCTCTGCAATTAC,4702289.0,HTO8,CTCCTCTGCAATTAC,TotalSeq-tag
HTO9,HTO9-CAGTAGTCACGGTCA,702110.0,HTO9,CAGTAGTCACGGTCA,TotalSeq-tag


In [None]:
adata49.var.index.values

In [73]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata49.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata49.X):
    gene_counts = pd.DataFrame(adata49.X.toarray(), index=adata49.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata49.X, index=adata49.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata49.obs[HTO] = gene_counts[HTO]

In [74]:
cell_hashing_columns=adata49.var.index.values
sc.external.pp.hashsolo(adata49, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [77]:
adata49.obs

Unnamed: 0_level_0,0,HTO1,HTO2,HTO3,HTO4,HTO5,HTO6,HTO7,HTO8,HTO9,most_likely_hypothesis,cluster_feature,negative_hypothesis_probability,singlet_hypothesis_probability,doublet_hypothesis_probability,Classification,Digestion_Condition,GEO_Accession
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ACAACCAGTCTACAGT,ACAACCAGTCTACAGT,149.0,592.0,30.0,65.0,14.0,36.0,717.0,45.0,4.0,2.0,0.0,6.567185e-28,8.331745e-13,1.000000,Doublet,EPCAM+,GSM4808349
GAAGCCCAGTAGATCA,GAAGCCCAGTAGATCA,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.143123e-02,8.266012e-01,0.151968,HTO2,EPCAM+,GSM4808349
AGAACCTAGGTAAAGG,AGAACCTAGGTAAAGG,0.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.124669e-02,8.700887e-01,0.108665,HTO2,EPCAM+,GSM4808349
AAAGGGCGTAGAGCTG,AAAGGGCGTAGAGCTG,0.0,6.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.580356e-02,8.390249e-01,0.145172,HTO2,EPCAM+,GSM4808349
AGCTTCCTCGAGTACT,AGCTTCCTCGAGTACT,145.0,494.0,17.0,35.0,11.0,41.0,19.0,55.0,9.0,1.0,0.0,9.162083e-05,8.721335e-01,0.127775,HTO2,EPCAM+,GSM4808349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGAGGTCAGTGGTGAC,GGAGGTCAGTGGTGAC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,1.0,0.0,1.199879e-03,9.324116e-01,0.066389,HTO8,EPCAM+,GSM4808349
GACAGCCAGCGGATCA,GACAGCCAGCGGATCA,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.949644e-02,8.716446e-01,0.108859,HTO2,EPCAM+,GSM4808349
GTATTGGAGGGTTAAT,GTATTGGAGGGTTAAT,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.104728e-02,9.045481e-01,0.064405,HTO2,EPCAM+,GSM4808349
GTAGGAGTCTTGGTCC,GTAGGAGTCTTGGTCC,136.0,577.0,32.0,53.0,26.0,42.0,16.0,46.0,7.0,1.0,0.0,7.015921e-05,8.649408e-01,0.134989,HTO2,EPCAM+,GSM4808349


In [76]:
adata49.obs['Digestion_Condition'] = 'EPCAM+'
adata49.obs['GEO_Accession'] = 'GSM4808349'

In [79]:
adata49.var.columns = adata49.var.columns.map(str)
adata49.obs.columns = adata49.obs.columns.map(str)

In [80]:
adata49.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808349_HTO1/GSM4808349_HTO_adata.h5ad')

+ Samples GSM4808350_HTO2

In [81]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808350_HTO2/HTO2/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808350_HTO2/HTO2/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808350_HTO2/HTO2/umi_count/matrix.mtx')

In [82]:
features.loc[9] = ['extra']

adata50 = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata50.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata50.var['total_counts'] = total_counts



In [84]:
# if adata.var['total_counts'] is equal zero then remove the row
adata50 = adata50[:, adata50.var['total_counts'] > 0]

In [87]:
adata50.obs.index = adata50.obs[0]

adata50.obs.index.name = 'barcodes'

+ Calculate HTO expression to find the sample with scanpy

In [89]:
adata50.var[['HTO_tag', 'HTO_sequence']] = adata50.var[0].str.split('-', expand=True)

adata50.var.index = adata50.var['HTO_tag']

adata50.var['Tag_type'] = 'TotalSeq-tag'

  adata50.var[['HTO_tag', 'HTO_sequence']] = adata50.var[0].str.split('-', expand=True)


In [90]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata50.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata50.X):
    gene_counts = pd.DataFrame(adata50.X.toarray(), index=adata50.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata50.X, index=adata50.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata50.obs[HTO] = gene_counts[HTO]

In [91]:
cell_hashing_columns=adata50.var.index.values
sc.external.pp.hashsolo(adata50, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [92]:
adata50.obs['Digestion_Condition'] = 'EPCAM+'
adata50.obs['GEO_Accession'] = 'GSM4808350'

In [93]:
adata50.var.columns = adata50.var.columns.map(str)
adata50.obs.columns = adata50.obs.columns.map(str)

In [95]:
adata50.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808350_HTO2/GSM4808350_HTO_adata.h5ad')

+ Samples GSM4808351_HTO3

In [96]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808351_HTO3/HTO3/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808351_HTO3/HTO3/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808351_HTO3/HTO3/umi_count/matrix.mtx')

features.loc[9] = ['extra']

adata51 = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata51.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata51.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata51 = adata51[:, adata51.var['total_counts'] > 0]

adata51.obs.index = adata51.obs[0]

adata51.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


+ Calculate HTO expression to find the sample with scanpy

In [98]:
adata51.var[['HTO_tag', 'HTO_sequence']] = adata51.var[0].str.split('-', expand=True)

adata51.var.index = adata51.var['HTO_tag']

adata51.var['Tag_type'] = 'TotalSeq-tag'

  adata51.var[['HTO_tag', 'HTO_sequence']] = adata51.var[0].str.split('-', expand=True)


In [99]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata51.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata51.X):
    gene_counts = pd.DataFrame(adata51.X.toarray(), index=adata51.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata51.X, index=adata51.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata51.obs[HTO] = gene_counts[HTO]

In [100]:
cell_hashing_columns=adata51.var.index.values
sc.external.pp.hashsolo(adata51, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [102]:
adata51.obs['Digestion_Condition'] = 'EPCAM+'
adata51.obs['GEO_Accession'] = 'GSM4808351'

In [103]:
adata51.var.columns = adata51.var.columns.map(str)
adata51.obs.columns = adata51.obs.columns.map(str)

In [104]:
adata51.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808351_HTO3/GSM4808351_HTO_adata.h5ad')

In [105]:
del adata49, adata50, adata51, adata

+ Samples GSM4808352_HTO4

In [106]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808352_HTO4/HTO4/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808352_HTO4/HTO4/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808352_HTO4/HTO4/umi_count/matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


+ Calculate HTO expression to find the sample with scanpy

In [108]:
adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

adata.var.index = adata.var['HTO_tag']

adata.var['Tag_type'] = 'TotalSeq-tag'

  adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [109]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [110]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [111]:
adata.obs['Digestion_Condition'] = 'EPCAM-'
adata.obs['GEO_Accession'] = 'GSM4808352'

In [112]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [113]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808352_HTO4/GSM4808352_HTO_adata.h5ad')

In [114]:
del adata

+ Samples GSM4808353_HTO5

In [115]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808353_HTO5/HTO5/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808353_HTO5/HTO5/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808353_HTO5/HTO5/umi_count/matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


+ Calculate HTO expression to find the sample with scanpy

In [116]:
adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

adata.var.index = adata.var['HTO_tag']

adata.var['Tag_type'] = 'TotalSeq-tag'

  adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [119]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [120]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [121]:
adata.obs['Digestion_Condition'] = 'EPCAM-'
adata.obs['GEO_Accession'] = 'GSM4808353'

In [122]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [123]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808353_HTO5/GSM4808353_HTO_adata.h5ad')

In [124]:
del adata

+ Samples GSM4808354_HTO6

In [125]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808354_HTO6/HTO6/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808354_HTO6/HTO6/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808354_HTO6/HTO6/umi_count/matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


+ Calculate HTO expression to find the sample with scanpy

In [126]:
adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

adata.var.index = adata.var['HTO_tag']

adata.var['Tag_type'] = 'TotalSeq-tag'

  adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [128]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [129]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [130]:
adata.obs['Digestion_Condition'] = 'EPCAM-'
adata.obs['GEO_Accession'] = 'GSM4808354'

In [131]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [132]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808354_HTO6/GSM4808354_HTO_adata.h5ad')

In [133]:
del adata

+ Samples GSM4808355_HTO_epi_4

In [142]:
tag_file = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/hto_demultiplexing/tag_file.csv', sep=',', header=None, index_col=0)

In [149]:
tag_file.head()

Unnamed: 0_level_0,HTO_tag
HTO_sequence,Unnamed: 1_level_1
CATGATTGGCTC,HTO1-in-house-tag
GAGGCGATTGAT,HTO2-in-house-tag
TGTCCGGCAATA,HTO3-in-house-tag
TGGTGAACCTGG,HTO4-in-house-tag
GATCGTAATACC,HTO5-in-house-tag


In [146]:
tag_file.columns = ['HTO_tag']

tag_file.index.name = 'HTO_sequence'

In [137]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808355_HTO_epi_4/HTO_epi_4/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808355_HTO_epi_4/HTO_epi_4/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808355_HTO_epi_4/HTO_epi_4/umi_count/matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


In [157]:
adata.var

Unnamed: 0,0,total_counts,HTO_info,HTO_sequence,HTO_tag,Tag_type,extra
0,A_Ep_AAU1_10_TI_85-GTCAACTCTTTAGCG,2169415.0,A_Ep_AAU1_10_TI_85,GTCAACTCTTTAGCG,HTO1,TotalSeq,tag
1,B_Ep_AAU2_10_Proxcolon_76-TGATGGCCTATTGGG,1045143.0,B_Ep_AAU2_10_Proxcolon_76,TGATGGCCTATTGGG,HTO2,TotalSeq,tag
2,C_Ep_AAU3_10_Distcolon_72-TTCCGCCTCTCTTTG,1636235.0,C_Ep_AAU3_10_Distcolon_72,TTCCGCCTCTCTTTG,HTO3,TotalSeq,tag
3,D_Ep_AAQ1_15_TI_84-AGTAAGTTCAGCGTA,1125889.0,D_Ep_AAQ1_15_TI_84,AGTAAGTTCAGCGTA,HTO4,TotalSeq,tag
4,E_Ep_AAQ2_15_Proxcolon_81-AAGTATCGTTTCGCA,516705.0,E_Ep_AAQ2_15_Proxcolon_81,AAGTATCGTTTCGCA,HTO5,TotalSeq,tag
5,F_Ep_AAQ3_15_Distcolon_92-GGTTGCCAGATGTCA,1015037.0,F_Ep_AAQ3_15_Distcolon_92,GGTTGCCAGATGTCA,HTO6,TotalSeq,tag
6,G_Ep_AAP1_22_TI_90-TGTCTTTCCTGCCAG,595114.0,G_Ep_AAP1_22_TI_90,TGTCTTTCCTGCCAG,HTO7,TotalSeq,tag
7,H_Ep_AAP2_22_Proxcolon_83-CTCCTCTGCAATTAC,672856.0,H_Ep_AAP2_22_Proxcolon_83,CTCCTCTGCAATTAC,HTO8,TotalSeq,tag
8,I_Ep_AAP3_22_Distcolon_87-CAGTAGTCACGGTCA,599292.0,I_Ep_AAP3_22_Distcolon_87,CAGTAGTCACGGTCA,HTO9,TotalSeq,tag


In [151]:
adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

  adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [153]:
# merge the tag_file with adata.var by HTO_sequence
adata.var = adata.var.merge(tag_file, on='HTO_sequence')

In [156]:
adata.var[['HTO_tag', 'Tag_type', 'extra']] = adata.var['HTO_tag'].str.split('-', expand=True)

In [158]:
del adata.var['extra']

In [159]:
adata.var.index = adata.var['HTO_tag']

+ Calculate HTO expression to find the sample with scanpy

In [160]:
adata.var

Unnamed: 0_level_0,0,total_counts,HTO_info,HTO_sequence,HTO_tag,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HTO1,A_Ep_AAU1_10_TI_85-GTCAACTCTTTAGCG,2169415.0,A_Ep_AAU1_10_TI_85,GTCAACTCTTTAGCG,HTO1,TotalSeq
HTO2,B_Ep_AAU2_10_Proxcolon_76-TGATGGCCTATTGGG,1045143.0,B_Ep_AAU2_10_Proxcolon_76,TGATGGCCTATTGGG,HTO2,TotalSeq
HTO3,C_Ep_AAU3_10_Distcolon_72-TTCCGCCTCTCTTTG,1636235.0,C_Ep_AAU3_10_Distcolon_72,TTCCGCCTCTCTTTG,HTO3,TotalSeq
HTO4,D_Ep_AAQ1_15_TI_84-AGTAAGTTCAGCGTA,1125889.0,D_Ep_AAQ1_15_TI_84,AGTAAGTTCAGCGTA,HTO4,TotalSeq
HTO5,E_Ep_AAQ2_15_Proxcolon_81-AAGTATCGTTTCGCA,516705.0,E_Ep_AAQ2_15_Proxcolon_81,AAGTATCGTTTCGCA,HTO5,TotalSeq
HTO6,F_Ep_AAQ3_15_Distcolon_92-GGTTGCCAGATGTCA,1015037.0,F_Ep_AAQ3_15_Distcolon_92,GGTTGCCAGATGTCA,HTO6,TotalSeq
HTO7,G_Ep_AAP1_22_TI_90-TGTCTTTCCTGCCAG,595114.0,G_Ep_AAP1_22_TI_90,TGTCTTTCCTGCCAG,HTO7,TotalSeq
HTO8,H_Ep_AAP2_22_Proxcolon_83-CTCCTCTGCAATTAC,672856.0,H_Ep_AAP2_22_Proxcolon_83,CTCCTCTGCAATTAC,HTO8,TotalSeq
HTO9,I_Ep_AAP3_22_Distcolon_87-CAGTAGTCACGGTCA,599292.0,I_Ep_AAP3_22_Distcolon_87,CAGTAGTCACGGTCA,HTO9,TotalSeq


In [161]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [162]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [163]:
adata.obs['Digestion_Condition'] = 'EPCAM+'
adata.obs['GEO_Accession'] = 'GSM4808355'

In [164]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [165]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808355_HTO_epi_4/GSM4808355_HTO_adata.h5ad')

In [166]:
del adata

+ Samples GSM4808356_HTO_stromal_4

In [167]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808356_HTO_stromal_4/HTO_stromal_4/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808356_HTO_stromal_4/HTO_stromal_4/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808356_HTO_stromal_4/HTO_stromal_4/umi_count//matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


In [174]:
adata.var

Unnamed: 0_level_0,0,total_counts,HTO_info,HTO_sequence,HTO_tag,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HTO1,A_St_AAU1_10_TI_78-GTCAACTCTTTAGCG,634211.0,A_St_AAU1_10_TI_78,GTCAACTCTTTAGCG,HTO1,TotalSeq
HTO2,B_St_AAU2_10_Proxcolon_90-TGATGGCCTATTGGG,20950064.0,B_St_AAU2_10_Proxcolon_90,TGATGGCCTATTGGG,HTO2,TotalSeq
HTO3,C_St_AAU3_10_Distcolon_79-TTCCGCCTCTCTTTG,1489625.0,C_St_AAU3_10_Distcolon_79,TTCCGCCTCTCTTTG,HTO3,TotalSeq
HTO4,D_St_AAQ1_15_TI_88-AGTAAGTTCAGCGTA,1105556.0,D_St_AAQ1_15_TI_88,AGTAAGTTCAGCGTA,HTO4,TotalSeq
HTO5,E_St_AAQ2_15_Proxcolon_82-AAGTATCGTTTCGCA,225692.0,E_St_AAQ2_15_Proxcolon_82,AAGTATCGTTTCGCA,HTO5,TotalSeq
HTO6,F_St_AAQ3_15_Distcolon_87-GGTTGCCAGATGTCA,328078.0,F_St_AAQ3_15_Distcolon_87,GGTTGCCAGATGTCA,HTO6,TotalSeq
HTO7,G_St_AAP1_22_TI_92-TGTCTTTCCTGCCAG,249863.0,G_St_AAP1_22_TI_92,TGTCTTTCCTGCCAG,HTO7,TotalSeq
HTO8,H_St__AAP2_22_Proxcolon_91-CTCCTCTGCAATTAC,624830.0,H_St__AAP2_22_Proxcolon_91,CTCCTCTGCAATTAC,HTO8,TotalSeq
HTO9,I_St_AAP3_22_Distcolon_94-CAGTAGTCACGGTCA,212123.0,I_St_AAP3_22_Distcolon_94,CAGTAGTCACGGTCA,HTO9,TotalSeq


In [169]:
adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

  adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [170]:
# merge the tag_file with adata.var by HTO_sequence
adata.var = adata.var.merge(tag_file, on='HTO_sequence')

In [171]:
adata.var[['HTO_tag', 'Tag_type', 'extra']] = adata.var['HTO_tag'].str.split('-', expand=True)

In [172]:
del adata.var['extra']

In [173]:
adata.var.index = adata.var['HTO_tag']

+ Calculate HTO expression to find the sample with scanpy

In [175]:
adata.var

Unnamed: 0_level_0,0,total_counts,HTO_info,HTO_sequence,HTO_tag,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HTO1,A_St_AAU1_10_TI_78-GTCAACTCTTTAGCG,634211.0,A_St_AAU1_10_TI_78,GTCAACTCTTTAGCG,HTO1,TotalSeq
HTO2,B_St_AAU2_10_Proxcolon_90-TGATGGCCTATTGGG,20950064.0,B_St_AAU2_10_Proxcolon_90,TGATGGCCTATTGGG,HTO2,TotalSeq
HTO3,C_St_AAU3_10_Distcolon_79-TTCCGCCTCTCTTTG,1489625.0,C_St_AAU3_10_Distcolon_79,TTCCGCCTCTCTTTG,HTO3,TotalSeq
HTO4,D_St_AAQ1_15_TI_88-AGTAAGTTCAGCGTA,1105556.0,D_St_AAQ1_15_TI_88,AGTAAGTTCAGCGTA,HTO4,TotalSeq
HTO5,E_St_AAQ2_15_Proxcolon_82-AAGTATCGTTTCGCA,225692.0,E_St_AAQ2_15_Proxcolon_82,AAGTATCGTTTCGCA,HTO5,TotalSeq
HTO6,F_St_AAQ3_15_Distcolon_87-GGTTGCCAGATGTCA,328078.0,F_St_AAQ3_15_Distcolon_87,GGTTGCCAGATGTCA,HTO6,TotalSeq
HTO7,G_St_AAP1_22_TI_92-TGTCTTTCCTGCCAG,249863.0,G_St_AAP1_22_TI_92,TGTCTTTCCTGCCAG,HTO7,TotalSeq
HTO8,H_St__AAP2_22_Proxcolon_91-CTCCTCTGCAATTAC,624830.0,H_St__AAP2_22_Proxcolon_91,CTCCTCTGCAATTAC,HTO8,TotalSeq
HTO9,I_St_AAP3_22_Distcolon_94-CAGTAGTCACGGTCA,212123.0,I_St_AAP3_22_Distcolon_94,CAGTAGTCACGGTCA,HTO9,TotalSeq


In [176]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [177]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [178]:
adata.obs['Digestion_Condition'] = 'EPCAM-'
adata.obs['GEO_Accession'] = 'GSM4808356'

In [179]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [180]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808356_HTO_stromal_4/HTO_stromal_4/GSM4808356_HTO_adata.h5ad')

In [181]:
del adata

* Sample GSM4808357_Pool5_HTO

In [183]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808357_Pool5_HTO/Pool5_HTO/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808357_Pool5_HTO/Pool5_HTO/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808357_Pool5_HTO/Pool5_HTO/umi_count/matrix.mtx')

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


In [187]:
adata.var

Unnamed: 0,0,total_counts,HTO_info,HTO_sequence
0,T1-GTCAACTCTTTAGCG,723946.0,T1,GTCAACTCTTTAGCG
1,T2-TGATGGCCTATTGGG,1220410.0,T2,TGATGGCCTATTGGG
2,T3-TTCCGCCTCTCTTTG,726804.0,T3,TTCCGCCTCTCTTTG
3,T4-AGTAAGTTCAGCGTA,653428.0,T4,AGTAAGTTCAGCGTA
4,T5-AAGTATCGTTTCGCA,592991.0,T5,AAGTATCGTTTCGCA
5,T6-GGTTGCCAGATGTCA,696523.0,T6,GGTTGCCAGATGTCA
6,unmapped,120716.0,unmapped,


In [186]:
adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

  adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [189]:
# merge the tag_file with adata.var by HTO_sequence
adata.var = adata.var.merge(tag_file, on='HTO_sequence', how='left')

In [193]:
adata.var 

Unnamed: 0_level_0,0,total_counts,HTO_info,HTO_sequence,HTO_tag,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HTO1,T1-GTCAACTCTTTAGCG,723946.0,T1,GTCAACTCTTTAGCG,HTO1,TotalSeq
HTO2,T2-TGATGGCCTATTGGG,1220410.0,T2,TGATGGCCTATTGGG,HTO2,TotalSeq
HTO3,T3-TTCCGCCTCTCTTTG,726804.0,T3,TTCCGCCTCTCTTTG,HTO3,TotalSeq
HTO4,T4-AGTAAGTTCAGCGTA,653428.0,T4,AGTAAGTTCAGCGTA,HTO4,TotalSeq
HTO5,T5-AAGTATCGTTTCGCA,592991.0,T5,AAGTATCGTTTCGCA,HTO5,TotalSeq
HTO6,T6-GGTTGCCAGATGTCA,696523.0,T6,GGTTGCCAGATGTCA,HTO6,TotalSeq
,unmapped,120716.0,unmapped,,,


In [191]:
adata.var[['HTO_tag', 'Tag_type', 'extra']] = adata.var['HTO_tag'].str.split('-', expand=True)
del adata.var['extra']

In [192]:
adata.var.index = adata.var['HTO_tag']

+ Calculate HTO expression to find the sample with scanpy

In [194]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [195]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [197]:
adata.obs['Classification'].value_counts()

Classification
HTO1       13556
HTO2       11386
HTO3        9392
HTO6        4400
HTO5        4183
HTO4        4061
Doublet     2986
Name: count, dtype: int64

In [198]:
adata.obs['Digestion_Condition'] = 'intestinal stromal and epithelial cells'
adata.obs['GEO_Accession'] = 'GSM4808357'

In [201]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [205]:
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)

In [206]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808357_Pool5_HTO/GSM4808357_HTO_adata.h5ad')

In [207]:
del adata

+ Samples GSM4808358_Pool6_HTO

In [209]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808358_Pool6_HTO/Pool6_HTO/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808358_Pool6_HTO/Pool6_HTO/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808358_Pool6_HTO/Pool6_HTO/umi_count//matrix.mtx')

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


In [211]:
adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

  adata.var[['HTO_info', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [212]:
# merge the tag_file with adata.var by HTO_sequence
adata.var = adata.var.merge(tag_file, on='HTO_sequence', how='left')

In [213]:
adata.var 

Unnamed: 0,0,total_counts,HTO_info,HTO_sequence,HTO_tag
0,T1-GTCAACTCTTTAGCG,3277226.0,T1,GTCAACTCTTTAGCG,HTO1-TotalSeq-tag
1,T2-TGATGGCCTATTGGG,3160727.0,T2,TGATGGCCTATTGGG,HTO2-TotalSeq-tag
2,T3-TTCCGCCTCTCTTTG,1728305.0,T3,TTCCGCCTCTCTTTG,HTO3-TotalSeq-tag
3,T4-AGTAAGTTCAGCGTA,975.0,T4,AGTAAGTTCAGCGTA,HTO4-TotalSeq-tag
4,T5-AAGTATCGTTTCGCA,336.0,T5,AAGTATCGTTTCGCA,HTO5-TotalSeq-tag
5,T6-GGTTGCCAGATGTCA,396.0,T6,GGTTGCCAGATGTCA,HTO6-TotalSeq-tag
6,unmapped,252031.0,unmapped,,


In [214]:
adata.var[['HTO_tag', 'Tag_type', 'extra']] = adata.var['HTO_tag'].str.split('-', expand=True)
del adata.var['extra']

In [215]:
adata.var.index = adata.var['HTO_tag']

+ Calculate HTO expression to find the sample with scanpy

In [217]:
adata.var

Unnamed: 0_level_0,0,total_counts,HTO_info,HTO_sequence,HTO_tag,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HTO1,T1-GTCAACTCTTTAGCG,3277226.0,T1,GTCAACTCTTTAGCG,HTO1,TotalSeq
HTO2,T2-TGATGGCCTATTGGG,3160727.0,T2,TGATGGCCTATTGGG,HTO2,TotalSeq
HTO3,T3-TTCCGCCTCTCTTTG,1728305.0,T3,TTCCGCCTCTCTTTG,HTO3,TotalSeq
HTO4,T4-AGTAAGTTCAGCGTA,975.0,T4,AGTAAGTTCAGCGTA,HTO4,TotalSeq
HTO5,T5-AAGTATCGTTTCGCA,336.0,T5,AAGTATCGTTTCGCA,HTO5,TotalSeq
HTO6,T6-GGTTGCCAGATGTCA,396.0,T6,GGTTGCCAGATGTCA,HTO6,TotalSeq
,unmapped,252031.0,unmapped,,,


In [218]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [219]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [220]:
adata.obs['Digestion_Condition'] = 'intestinal stromal and epithelial cells'
adata.obs['GEO_Accession'] = 'GSM4808358'

In [221]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [222]:
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)

In [223]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808358_Pool6_HTO/GSM4808358_HTO_adata.h5ad')

In [224]:
del adata

+ Samples GSM4808359_ADT1

In [231]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808359_ADT1/ADT1/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808359_ADT1/ADT1/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808359_ADT1/ADT1/umi_count//matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


+ Calculate HTO expression to find the sample with scanpy

In [234]:
adata.var

Unnamed: 0_level_0,0,total_counts,HTO_tag,HTO_sequence,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ADT1,ADT1-CATGATTGGCTC,16668808.0,ADT1,CATGATTGGCTC,In-house-tag
ADT2,ADT2-GAGGCGATTGAT,63839284.0,ADT2,GAGGCGATTGAT,In-house-tag
ADT3,ADT3-TGTCCGGCAATA,4504660.0,ADT3,TGTCCGGCAATA,In-house-tag
ADT4,ADT4-TGGTGAACCTGG,8485592.0,ADT4,TGGTGAACCTGG,In-house-tag
ADT5,ADT5-GATCGTAATACC,6766251.0,ADT5,GATCGTAATACC,In-house-tag
ADT6,ADT6-AAGCGCTTGGCA,25693644.0,ADT6,AAGCGCTTGGCA,In-house-tag
ADT7,ADT7-CATCGGTGTACA,4045379.0,ADT7,CATCGGTGTACA,In-house-tag
ADT8,ADT8-GTCTAGACTTCG,13197939.0,ADT8,GTCTAGACTTCG,In-house-tag
ADT9,ADT9-CGAAGAAGGAGT,6214982.0,ADT9,CGAAGAAGGAGT,In-house-tag


In [233]:
adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

adata.var.index = adata.var['HTO_tag']

adata.var['Tag_type'] = 'In-house-tag'

  adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [235]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [236]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [237]:
adata.obs['Digestion_Condition'] = 'EPCAM+'
adata.obs['GEO_Accession'] = 'GSM4808359'

In [238]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [239]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808359_ADT1/GGSM4808359_ADT_adata.h5ad')

In [240]:
del adata

+ Samples GSM4808360_ADT2

In [241]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808360_ADT2/ADT2/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808360_ADT2/ADT2/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808360_ADT2/ADT2/umi_count//matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


+ Calculate HTO expression to find the sample with scanpy

In [242]:
adata.var

Unnamed: 0,0,total_counts
0,ADT1-CATGATTGGCTC,45068020.0
1,ADT2-GAGGCGATTGAT,115158600.0
2,ADT3-TGTCCGGCAATA,22047180.0
3,ADT4-TGGTGAACCTGG,8646181.0
4,ADT5-GATCGTAATACC,10061397.0
5,ADT6-AAGCGCTTGGCA,11943134.0
6,ADT7-CATCGGTGTACA,3131238.0
7,ADT8-GTCTAGACTTCG,9150803.0
8,ADT9-CGAAGAAGGAGT,25910.0


In [243]:
adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

adata.var.index = adata.var['HTO_tag']

adata.var['Tag_type'] = 'In-house-tag'

  adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [244]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [245]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


In [246]:
adata.obs['Digestion_Condition'] = 'EPCAM+'
adata.obs['GEO_Accession'] = 'GSM4808360'

In [247]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [248]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808360_ADT2/GGSM4808360_ADT_adata.h5ad')

In [249]:
del adata

+ Samples GSM4808361_ADT3

In [250]:
barcodes = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808361_ADT3/ADT3/umi_count/barcodes.tsv', sep='\t', header=None)
features = pd.read_csv('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808361_ADT3/ADT3/umi_count/features.tsv', sep='\t', header=None)
matrix = sc.read_mtx('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808361_ADT3/ADT3/umi_count//matrix.mtx')

features.loc[9] = ['extra']

adata = sc.AnnData(X=np.transpose(matrix.X), obs=barcodes, var=features)

# Calculate the total counts for each gene (var)
total_counts = np.array(adata.X.sum(axis=0)).flatten()

# Add these total counts to adata.var
adata.var['total_counts'] = total_counts

# if adata.var['total_counts'] is equal zero then remove the row
adata = adata[:, adata.var['total_counts'] > 0]

adata.obs.index = adata.obs[0]

adata.obs.index.name = 'barcodes'

  if not is_categorical_dtype(df_full[k]):


+ Calculate HTO expression to find the sample with scanpy

In [253]:
adata.var

Unnamed: 0_level_0,0,total_counts,HTO_tag,HTO_sequence,Tag_type
HTO_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ADT1,ADT1-CATGATTGGCTC,8729961.0,ADT1,CATGATTGGCTC,In-house-tag
ADT2,ADT2-GAGGCGATTGAT,48487252.0,ADT2,GAGGCGATTGAT,In-house-tag
ADT3,ADT3-TGTCCGGCAATA,10257105.0,ADT3,TGTCCGGCAATA,In-house-tag
ADT4,ADT4-TGGTGAACCTGG,46446992.0,ADT4,TGGTGAACCTGG,In-house-tag
ADT5,ADT5-GATCGTAATACC,12636813.0,ADT5,GATCGTAATACC,In-house-tag
ADT6,ADT6-AAGCGCTTGGCA,15415419.0,ADT6,AAGCGCTTGGCA,In-house-tag
ADT7,ADT7-CATCGGTGTACA,10695777.0,ADT7,CATCGGTGTACA,In-house-tag
ADT8,ADT8-GTCTAGACTTCG,15613573.0,ADT8,GTCTAGACTTCG,In-house-tag
ADT9,ADT9-CGAAGAAGGAGT,27920.0,ADT9,CGAAGAAGGAGT,In-house-tag


In [252]:
adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)

adata.var.index = adata.var['HTO_tag']

adata.var['Tag_type'] = 'In-house-tag'

  adata.var[['HTO_tag', 'HTO_sequence']] = adata.var[0].str.split('-', expand=True)


In [254]:
# Extracting gene names, assuming they are stored in adata.var_names
HTO_id = adata.var_names

# Calculating counts for each gene in each cell
# The operation depends on whether the data is stored as a dense matrix or a sparse matrix.
if sparse.issparse(adata.X):
    gene_counts = pd.DataFrame(adata.X.toarray(), index=adata.obs_names, columns=HTO_id)
else:
    gene_counts = pd.DataFrame(adata.X, index=adata.obs_names, columns=HTO_id)

# Because you have only 9 genes, you can add each gene's counts as a separate column in adata.obs directly
for HTO in HTO_id:
    adata.obs[HTO] = gene_counts[HTO]

In [255]:
cell_hashing_columns=adata.var.index.values
sc.external.pp.hashsolo(adata, cell_hashing_columns)

Please cite HashSolo paper:
https://www.cell.com/cell-systems/fulltext/S2405-4712(20)30195-2


  lam = 1 / np.var(data) if len(data) > 1 else lam_o
  (np.mean(data) * n * lam + mu_o * lam_o) / lam_n if len(data) > 0 else mu_o


In [256]:
adata.obs['Digestion_Condition'] = 'EPCAM+'
adata.obs['GEO_Accession'] = 'GSM4808361'

In [257]:
adata.var.columns = adata.var.columns.map(str)
adata.obs.columns = adata.obs.columns.map(str)

In [258]:
adata.write_h5ad('/mnt/LaCIE/annaM/gut_project/raw_data/Fawkner-Corbett_2021/scRNA_seq_data/GSE158702_RAW_2/GSM4808361_ADT3/GGSM4808361_ADT_adata.h5ad')

In [259]:
del adata