### Notebook to format 10x Genomics GEX data as AnnData for project `Kdm6aKO`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- **Created on:** 231207
- **Last modified:** 240101

### Import required modules

In [37]:
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

### Set up working environment

In [38]:
# Configure scanpy logging, record the package versions used for this run,
# and set the shared figure defaults for the notebook.
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    format = 'svg',
    vector_friendly = True,
)

-----
anndata     0.10.5.post1
scanpy      1.9.8
-----
PIL                 10.2.0
asttokens           NA
colorama            0.4.6
comm                0.2.1
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0
debugpy             1.8.1
decorator           5.1.1
exceptiongroup      1.2.0
executing           2.0.1
h5py                3.10.0
ipykernel           6.29.3
jedi                0.19.1
joblib              1.3.2
kiwisolver          1.4.5
llvmlite            0.42.0
matplotlib          3.8.3
mpl_toolkits        NA
natsort             8.4.0
numba               0.59.0
numpy               1.26.4
packaging           23.2
pandas              2.2.1
parso               0.8.3
pickleshare         0.7.5
platformdirs        4.2.0
prompt_toolkit      3.0.42
psutil              5.9.8
pure_eval           0.2.2
pydev_ipython       NA
pydevconsole        NA
pydevd              2.9.5
pydevd_file_utils   NA
pydevd_plugins      NA
pydevd_tracing      NA
pygments            2.17.

In [39]:
def X_is_raw(adata):
    """Heuristically check whether ``adata.X`` still holds integer counts.

    The per-column sums of ``adata.X`` are compared with their
    integer-truncated copies; the matrix is reported as raw when every
    column sum is a whole number.

    Parameters
    ----------
    adata
        Object exposing a matrix-like ``X`` attribute that supports
        ``.sum(axis=0)``.

    Returns
    -------
    bool
        ``True`` if all column sums are integral, ``False`` otherwise.

    Notes
    -----
    Only column sums are inspected, so non-integer entries that happen to
    add up to a whole number within a column are not detected.
    """
    # Reduce the matrix once and reuse the result; the reduction walks the
    # whole matrix, so doing it twice doubles the cost on large objects.
    column_sums = adata.X.sum(axis=0)
    return np.array_equal(column_sums.astype(int), column_sums)

### Read in samples

In [40]:
# Root folder containing one sub-folder per sample (presumably the CellBender
# outputs, going by the path name -- confirm). The trailing slash matters:
# file paths are built from this value by plain string concatenation.
directory = "../../../../INBOX/becker_kdm6a/cellbender/"

In [29]:
# Per-sample annotations keyed by sample ID; each row is [genotype, group, file].
# Every row uses the same container type and the same length, with an explicit
# None where no file name is recorded, so missing values are intentional rather
# than a side effect of pandas padding ragged rows.
# NOTE(review): the "genotype" column carries WT/KO for the KDM6A group but
# sham/treated for the GSKJ4 group -- confirm this mixing is intended.
metadata_dict = {
    "KDM6A_wt_11":    ["WT",      "KDM6A", "SCC0203_1_Becker_multiome_D11_A2_outs"],
    "KDM6A_wt_40":    ["WT",      "KDM6A", "SCC0203_2_Becker_multiome_E7_A5_nb40_outs"],
    "KDM6A_KO_34":    ["KO",      "KDM6A", "SCC0203_2_Becker_multiome_E5_A3_nb27_outs"],
    "KDM6A_KO_31":    ["KO",      "KDM6A", "SCC0203_2_Becker_multiome_E6_A4_nb31_outs"],
    "GSKJ4_sham_51":  ["sham",    "GSKJ4", "SCC0203_4_Becker_multiome_2_D6_B6_nb51"],
    "GSKJ4_sham_57":  ["sham",    "GSKJ4", None],
    "GSKJ4_treat_47": ["treated", "GSKJ4", None],
    "GSKJ4_treat_52": ["treated", "GSKJ4", None],
}

# One row per sample, indexed by sample ID.
sample_metadata = pd.DataFrame.from_dict(
    metadata_dict,
    orient="index",
    columns=["genotype", "group", "file"],
)
sample_metadata.index.name = "sample_id"
sample_metadata

Unnamed: 0_level_0,genotype,group,file
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KDM6A_wt_11,WT,KDM6A,SCC0203_1_Becker_multiome_D11_A2_outs
KDM6A_wt_40,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
KDM6A_KO_34,KO,KDM6A,SCC0203_2_Becker_multiome_E5_A3_nb27_outs
KDM6A_KO_31,KO,KDM6A,SCC0203_2_Becker_multiome_E6_A4_nb31_outs
GSKJ4_sham_51,sham,GSKJ4,SCC0203_4_Becker_multiome_2_D6_B6_nb51
GSKJ4_sham_57,sham,GSKJ4,
GSKJ4_treat_47,treated,GSKJ4,
GSKJ4_treat_52,treated,GSKJ4,


In [28]:
# Alternative (currently disabled, kept for reference): load the sample sheet from disk.
#sample_metadata = pd.read_csv('../data/samples.txt', sep = ',', index_col = 0)
#sample_metadata.set_index('sample_id', inplace = True)
#sample_metadata.head()

In [30]:
# Despite the name, these are the sample IDs (the metadata index). Each one is
# used below as both the sub-folder name and the file-name prefix when reading.
filenames = sample_metadata.index
filenames

Index(['KDM6A_wt_11', 'KDM6A_wt_40', 'KDM6A_KO_34', 'KDM6A_KO_31',
       'GSKJ4_sham_51', 'GSKJ4_sham_57', 'GSKJ4_treat_47', 'GSKJ4_treat_52'],
      dtype='object', name='sample_id')

In [32]:
# Suffix shared by every per-sample input file.
cellbender_suffix = '_CB_ctl240101.raw.h5'

# Read each sample, make its gene names unique, and stamp every cell with the
# sample ID plus all per-sample metadata columns.
adatas = []
for filename in filenames:
    sample_adata = sc.read_10x_h5(f"{directory}{filename}/{filename}{cellbender_suffix}")
    sample_adata.var_names_make_unique()
    sample_adata.obs['sample'] = filename
    for col in sample_metadata.columns:
        # Label-based lookup: indexing a string-labelled Series with an integer
        # position (`series[i]`) is deprecated in pandas.
        sample_adata.obs[col] = sample_metadata.loc[filename, col]
    adatas.append(sample_adata)

# `AnnData.concatenate` is deprecated in favour of `anndata.concat`. The
# arguments below mirror the old defaults: genes shared by all samples are kept,
# a categorical 'batch' column records the sample of origin, '-<sample_id>' is
# appended to each barcode, and var columns identical across samples are kept.
adata_1 = anndata.concat(
    adatas,
    join = 'inner',
    label = 'batch',
    keys = sample_metadata.index,
    index_unique = '-',
    merge = 'same',
)
adata_1.shape

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_metadata[col][i]
  adatas[i].obs[col] = sample_me

(5782594, 32285)

In [33]:
# Display a summary of the merged object (dimensions, obs and var columns).
adata_1

AnnData object with n_obs × n_vars = 5782594 × 32285
    obs: 'sample', 'genotype', 'group', 'file', 'batch'
    var: 'gene_ids', 'feature_types', 'genome'

In [34]:
# Label every feature in the merged object with the 'GEX' modality.
adata_1.var['modality'] = 'GEX'

In [35]:
# Store genotype as a categorical column and list its levels.
# NOTE(review): the levels combine genotypes (WT/KO) with treatment labels
# (sham/treated) taken from the sample sheet -- confirm this is intended.
adata_1.obs['genotype'] = adata_1.obs['genotype'].astype('category')
adata_1.obs['genotype'].cat.categories

Index(['KO', 'WT', 'sham', 'treated'], dtype='object')

In [41]:
# Sanity check before saving: the merged matrix should still pass the
# raw-count heuristic (integer-valued column sums).
X_is_raw(adata_1)

True

In [42]:
# Count features per modality label.
adata_1.var['modality'].value_counts()

modality
GEX    32285
Name: count, dtype: int64

### Save merged object

In [43]:
# Write the merged object to disk as .h5ad.
# NOTE(review): the output tag is ctl240116 while the input files are tagged
# ctl240101 -- confirm the date stamp is the intended one.
adata_1.write('../data/Kdm6aKO_ALL_GEX-CB_ctl240116.raw.h5ad')