### Notebook to format 10X Genomics multiome data into AnnData for project `Kdm6aKO`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- **Created on:** 231207
- **Last modified:** 231207

### Import required modules

In [1]:
import scvi
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

  self.seed = seed
  self.dl_pin_memory_gpu_training = (


### Set up working environment

In [2]:
# Set scanpy's logging verbosity and record the package versions in use.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Figure defaults shared by every plot in this notebook.
sc.settings.set_figure_params(
    dpi=180,
    dpi_save=300,
    color_map='magma_r',
    format='svg',
    vector_friendly=True,
)

-----
anndata     0.9.2
scanpy      1.9.4
-----
PIL                 10.0.0
absl                NA
aiohttp             3.8.5
aiosignal           1.3.1
annotated_types     0.5.0
anyio               NA
asttokens           NA
async_timeout       4.0.3
attr                23.1.0
backcall            0.2.0
backoff             2.2.1
bs4                 4.12.2
certifi             2023.07.22
charset_normalizer  3.2.0
chex                0.1.7
click               8.1.7
comm                0.1.4
contextlib2         NA
croniter            NA
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.7.post1
decorator           5.1.1
deepdiff            6.3.1
docrep              0.3.2
etils               1.4.1
executing           1.2.0
fastapi             0.103.0
flax                0.7.2
frozenlist          1.4.0
fsspec              2023.6.0
h5py                3.9.0
idna                3.4
igraph              0.10.8
importlib_resources NA
ipykernel         

In [3]:
def X_is_raw(adata):
    """Return True when every entry of ``adata.X`` is a whole number.

    The entries themselves are inspected rather than the per-column sums,
    because fractional values can add up to an integer total (for example
    0.5 + 0.5) and would then be mistaken for raw counts.

    Parameters
    ----------
    adata
        Object exposing the data matrix as ``adata.X`` (dense array or
        SciPy sparse matrix).

    Returns
    -------
    bool
        True if all entries are integer-valued, False otherwise.
    """
    # Local import: SciPy is not part of this notebook's import cell.
    from scipy.sparse import issparse

    matrix = adata.X
    if issparse(matrix):
        # Implicit zeros are whole numbers, so only the stored entries matter.
        if matrix.format in ('csr', 'csc', 'coo'):
            stored = matrix.data
        else:
            stored = matrix.tocoo().data
    else:
        stored = np.asarray(matrix)
    return bool(np.all(np.mod(stored, 1) == 0))

### Read in samples

In [4]:
# Load the sample sheet. Its first column is read as the index, copied into a
# 'sample' column, and the table is then re-keyed by the 'file' column.
sample_metadata = (
    pd.read_csv('../data/samples.txt', sep=',', index_col=0)
    .assign(sample=lambda sheet: sheet.index)
    .set_index('file')
)
sample_metadata.head()

Unnamed: 0_level_0,sample_id,genotype,group,sample
file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SCC0203_1_Becker_multiome_D11_A2_outs,KDM6A_wt_11,WT,KDM6A,D11
SCC0203_2_Becker_multiome_E7_A5_nb40_outs,KDM6A_wt_40,WT,KDM6A,E7
SCC0203_2_Becker_multiome_E5_A3_nb27_outs,KDM6A_KO_34,KO,KDM6A,E5
SCC0203_2_Becker_multiome_E6_A4_nb31_outs,KDM6A_KO_31,KO,KDM6A,E6
SCC0203_4_Becker_multiome_2_D6_B6_nb51,GSKJ4_sham_51,Sham,GSKJ4,D6


In [6]:
# Index of the sample sheet (the 'file' column); used to build the input paths.
filenames = sample_metadata.index

In [10]:
# Per-sample AnnData objects, one list per modality, in sample-sheet order.
gene_expr_adatas = []
atac_seq_adatas = []

for i, filename in enumerate(filenames):
    adata = scvi.data.read_10x_multiome('../data/' + str(filename) + '/outs/raw_feature_bc_matrix/')
    # Deduplicate feature names before slicing, as the combined loader later
    # in this notebook does; the call is a no-op when names are already unique.
    adata.var_names_make_unique()

    # Separate gene expression and ATAC-seq data. The slices are copied so the
    # metadata below is written to independent objects rather than to views
    # of `adata` (assigning to a view forces an implicit copy with a warning).
    gene_expr_adata = adata[:, adata.var['modality'] == 'Gene Expression'].copy()
    atac_seq_adata = adata[:, adata.var['modality'] == 'Peaks'].copy()

    # Add metadata. `filenames` is the sample sheet's index, so row i belongs
    # to this file; .iloc makes the positional lookup explicit instead of
    # relying on integer keys against a string-labelled Series.
    sample_row = sample_metadata.iloc[i]
    for col in sample_metadata.columns:
        gene_expr_adata.obs[col] = sample_row[col]
        atac_seq_adata.obs[col] = sample_row[col]

    gene_expr_adatas.append(gene_expr_adata)
    atac_seq_adatas.append(atac_seq_adata)

In [None]:
# Stack the per-sample objects along the observation axis, one result per modality.
# RNA uses join='inner' (variables shared by every sample); ATAC uses
# join='outer' (union of variables across samples).
# NOTE(review): AnnData.concatenate is flagged as deprecated in recent anndata
# releases in favour of anndata.concat — confirm before upgrading anndata.
concatenated_gene_expr = gene_expr_adatas[0].concatenate(gene_expr_adatas[1:], join='inner')
concatenated_atac_seq = atac_seq_adatas[0].concatenate(atac_seq_adatas[1:], join='outer')

In [None]:
# Label every observation with the modality of the object it belongs to.
concatenated_gene_expr.obs['modality'] = 'Gene Expression'
concatenated_atac_seq.obs['modality'] = 'Peaks'

In [None]:
# Combine the RNA and ATAC objects into one AnnData; join='outer' keeps the
# union of their variables.
# NOTE(review): the two objects are stacked as observations, so each barcode
# presumably appears once per modality — confirm this layout is intended.
final_adata = concatenated_gene_expr.concatenate(concatenated_atac_seq, join='outer')
print(final_adata.shape)


In [None]:
# Show the 'sample' label recorded for each entry of the sample sheet.
sample_metadata['sample']

In [None]:
# Load a single sample on its own, from its filtered feature-barcode matrix,
# make its feature names unique and display the resulting object.
adata1 = scvi.data.read_10x_multiome(
    '../data/SCC0203_1_Becker_multiome_D11_A2_outs/outs/filtered_feature_bc_matrix/'
)
adata1.var_names_make_unique()
adata1

In [None]:
# Variable annotation table of the single-sample object.
adata1.var

In [None]:
# Number of variables per modality label in the single-sample object.
adata1.var['modality'].value_counts()

In [None]:
# Load the raw multiome matrix of every sample, in sample-sheet order.
filenames = sample_metadata.index
adatas = [scvi.data.read_10x_multiome('../data/' + str(filename) + '/outs/raw_feature_bc_matrix/') for filename in filenames]

for i, sample_adata in enumerate(adatas):
    # Row i of the sheet describes adatas[i]; .iloc makes the positional
    # lookup explicit instead of relying on integer keys against a
    # string-labelled Series.
    sample_row = sample_metadata.iloc[i]
    # 'sample' is written first so it precedes the other sheet columns in .obs.
    sample_adata.obs['sample'] = sample_row['sample']
    sample_adata.var_names_make_unique()
    for col in sample_metadata.columns:
        sample_adata.obs[col] = sample_row[col]

# Merge all samples; batch labels are the sample-sheet index values and
# join='outer' keeps the union of variables across samples.
adata = adatas[0].concatenate(adatas[1:], batch_categories = sample_metadata.index, join = 'outer')
adata.shape

In [None]:
# Variable annotation table of the merged object.
adata.var

In [None]:
# Number of variables per modality label in the merged object.
adata.var['modality'].value_counts()

In [None]:
# Store the sample labels as a categorical column and list its categories.
adata.obs['sample'] = adata.obs['sample'].astype('category')
adata.obs['sample'].cat.categories

In [None]:
# Store the genotype labels as a categorical column and list its categories.
adata.obs['genotype'] = adata.obs['genotype'].astype('category')
adata.obs['genotype'].cat.categories

In [None]:
# Sanity check via X_is_raw that the merged matrix looks like integer-valued counts.
X_is_raw(adata)

In [None]:
# Tally of variables per modality label in the merged object (attribute-style column access).
adata.var.modality.value_counts()

In [None]:
# Cut the merged object into three consecutive runs of observations:
#   first n      -> gene-expression variables only
#   next n       -> all variables
#   remainder    -> peak variables only
# NOTE(review): n is hard-coded and not derived from adata — confirm that
# 4004 is the intended block size for this dataset.
n = 4004
adata_rna = adata[:n, adata.var['modality'].eq("Gene Expression")].copy()
adata_paired = adata[n:n * 2, :].copy()
adata_atac = adata[n * 2:, adata.var['modality'].eq("Peaks")].copy()

### Save merged object

In [None]:
# Write the merged object (`adata`) to disk.
# NOTE(review): the file name spells "multitome" — anything reading this file must use the same spelling.
adata.write('../data/Kdm6aKO_multitome_ctl231207.raw.h5ad')