### Notebook to format the 10X Genomics HTO data from SCC0120_1_S2 into an `anndata` object with raw counts in `adata.X`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology - JMU-Würzburg**
- v230701

### Import required modules

In [1]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

### Set up working environment

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')

-----
anndata     0.8.0
scanpy      1.9.2
-----
PIL                 9.4.0
appnope             0.1.3
asttokens           NA
backcall            0.2.0
cffi                1.15.1
colorama            0.4.6
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
executing           1.2.0
h5py                3.8.0
igraph              0.10.4
importlib_resources NA
ipykernel           6.21.2
jedi                0.18.2
joblib              1.2.0
kiwisolver          1.4.4
leidenalg           0.9.1
llvmlite            0.39.1
louvain             0.8.0
matplotlib          3.7.0
mpl_toolkits        NA
natsort             8.2.0
numba               0.56.4
numexpr             2.8.4
numpy               1.23.5
packaging           23.0
pandas              1.5.3
parso               0.8.3
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
platformdirs        3.0.0
prompt_toolkit      3.0.

### Read in 10X Genomics files for SCC0120_1_Sample_1

In [3]:
adata_raw = sc.read_10x_mtx('../data/SCC0120_1_Sample_2/outs/filtered_feature_bc_matrix/', cache = True, gex_only = False) 
adata_raw

... writing an h5ad cache file to speedup reading next time


AnnData object with n_obs × n_vars = 8142 × 36611
    var: 'gene_ids', 'feature_types'

In [4]:
adata_raw.var['feature_types'].value_counts()

Gene Expression     36601
Antibody Capture       10
Name: feature_types, dtype: int64

In [5]:
adata_raw.obs

AAACCCAAGCAGTCTT-1
AAACCCAAGCGGTATG-1
AAACCCACAATAGGGC-1
AAACCCACAATTGAGA-1
AAACCCACATTCTCTA-1
...
TTTGTTGGTATGTGTC-1
TTTGTTGTCAAGTAAG-1
TTTGTTGTCACCCTTG-1
TTTGTTGTCACTTGTT-1
TTTGTTGTCCCATACC-1


### Read in processed metadata

In [7]:
adata_metadata = pd.read_csv('../data/SCC0120_1_Sample_2_metadata.csv', sep = ',', index_col = 0)
adata_metadata.head()

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_HTO,nFeature_HTO,nCount_CITE,nFeature_CITE,nCount_PROT,nFeature_PROT,percent.mt,sample,HTO_maxID,HTO_secondID,HTO_margin,HTO_classification,HTO_classification.global,hash.ID,hashtag,unique,group
AAACCCAAGCAGTCTT-1,SeuratProject,7223,3068,887,7,375,2,1262,9,7.254603,SCC0120_1_Sample_2,Hashtag7-TotalA,Hashtag5-TotalA,3.144635,Hashtag7-TotalA,Singlet,Hashtag7-TotalA,Hashtag7-TotalA,hs_3,hs
AAACCCAAGCGGTATG-1,SeuratProject,20749,5992,582,6,205,2,787,8,3.026652,SCC0120_1_Sample_2,Hashtag5-TotalA,Hashtag4-TotalA,0.45477,Hashtag4-TotalA_Hashtag5-TotalA,Doublet,Doublet,Doublet,Doublet,Doublet
AAACCCACAATAGGGC-1,SeuratProject,2514,1293,1546,6,20,2,1566,8,3.221957,SCC0120_1_Sample_2,Hashtag5-TotalA,Hashtag7-TotalA,3.528118,Hashtag5-TotalA,Singlet,Hashtag5-TotalA,Hashtag5-TotalA,is_2,is
AAACCCACAATTGAGA-1,SeuratProject,13347,4268,1301,6,828,2,2129,8,2.337604,SCC0120_1_Sample_2,Hashtag4-TotalA,Hashtag7-TotalA,2.287189,Hashtag4-TotalA,Singlet,Hashtag4-TotalA,Hashtag4-TotalA,hs_2,hs
AAACCCACATTCTCTA-1,SeuratProject,9256,3491,2390,6,41,2,2431,8,2.776577,SCC0120_1_Sample_2,Hashtag5-TotalA,Hashtag1-TotalA,3.669243,Hashtag5-TotalA,Singlet,Hashtag5-TotalA,Hashtag5-TotalA,is_2,is


In [8]:
adata_metadata.shape

(4969, 20)

In [9]:
adata_raw.obs.index = adata_raw.obs.index.astype(str)
adata_metadata.index = adata_metadata.index.astype(str)
merged = adata_raw.obs.join(adata_metadata, how = 'left')
adata_raw.obs = merged
adata_raw

AnnData object with n_obs × n_vars = 8142 × 36611
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'hashtag', 'unique', 'group'
    var: 'gene_ids', 'feature_types'

In [10]:
adata_raw.obs['hash.ID'].value_counts()

Doublet            1113
Hashtag4-TotalA    1087
Hashtag5-TotalA     734
Hashtag8-TotalA     652
Hashtag7-TotalA     562
Hashtag2-TotalA     475
Hashtag1-TotalA     203
Negative            143
Name: hash.ID, dtype: int64

### Create a dictionary to map the samples with their identity

In [11]:
mapping_dict = {
    "Hashtag1-TotalA": {"sample": "hs_1", "tissue": "skin", "condition": "healthy"},
    "Hashtag4-TotalA": {"sample": "hs_2", "tissue": "skin", "condition": "healthy"},
    "Hashtag7-TotalA": {"sample": "hs_3", "tissue": "skin", "condition": "healthy"},
    "Hashtag2-TotalA": {"sample": "is_1", "tissue": "skin", "condition": "infected"},
    "Hashtag5-TotalA": {"sample": "is_2", "tissue": "skin", "condition": "infected"},
    "Hashtag8-TotalA": {"sample": "is_3", "tissue": "skin", "condition": "infected"},
    "Hashtag3-TotalA": {"sample": "pbmc_1", "tissue": "pbmc", "condition": "blood"},
    "Hashtag6-TotalA": {"sample": "pbmc_2", "tissue": "pbmc", "condition": "blood"}
}


In [12]:
for hash_ID, mapping in mapping_dict.items():
    if hash_ID in adata_raw.obs['hash.ID'].values:
        for column, value in mapping.items():
            if column not in adata_raw.obs.columns:
                adata_raw.obs[column] = np.nan
            idx = adata_raw.obs[adata_raw.obs['hash.ID'] == hash_ID].index
            adata_raw.obs.loc[idx, column] = value
adata_raw

AnnData object with n_obs × n_vars = 8142 × 36611
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'hashtag', 'unique', 'group', 'tissue', 'condition'
    var: 'gene_ids', 'feature_types'

In [13]:
adata_raw.obs['condition'].value_counts()

infected    1861
healthy     1852
Name: condition, dtype: int64

In [14]:
adata_raw.obs['tissue'].value_counts()

skin    3713
Name: tissue, dtype: int64

In [15]:
adata_raw.obs['sample'].value_counts()

SCC0120_1_Sample_2    1256
hs_2                  1087
is_2                   734
is_3                   652
hs_3                   562
is_1                   475
hs_1                   203
Name: sample, dtype: int64

### Remove cells that have _Doublet_ or _Negative assigned_

In [16]:
adata_raw.obs['HTO_classification.global'].value_counts()

Singlet     3713
Doublet     1113
Negative     143
Name: HTO_classification.global, dtype: int64

In [17]:
adata_raw_sc = adata_raw[adata_raw.obs['HTO_classification.global'].isin(['Singlet'])]
adata_raw_sc

View of AnnData object with n_obs × n_vars = 3713 × 36611
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'nCount_CITE', 'nFeature_CITE', 'nCount_PROT', 'nFeature_PROT', 'percent.mt', 'sample', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'hashtag', 'unique', 'group', 'tissue', 'condition'
    var: 'gene_ids', 'feature_types'

### Save object

In [18]:
adata_raw_sc.write('../data/SCC0120_1_Sample_2/SCC0120_1_Sample_2.raw.ctl230701.h5ad')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
