### Notebook to format 10X Genomics multiome to anndata for project `Kdm6aKO`

- **Developed by:** Carlos Talavera-López Ph.D
- **Würzburg Institute for Systems Immunology & Julius-Maximilian-Universität Würzburg**
- **Created on**: 231207
- **Last modified**: 231207

### Import required modules

In [28]:
import scvi
import anndata
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil

### Set up working environment

In [3]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 180, color_map = 'magma_r', dpi_save = 300, vector_friendly = True, format = 'svg')

-----
anndata     0.10.5.post1
scanpy      1.9.8
-----
PIL                 10.2.0
absl                NA
annotated_types     0.6.0
anyio               NA
asttokens           NA
backoff             2.2.1
brotli              1.1.0
bs4                 4.12.3
certifi             2024.02.02
cffi                1.16.0
charset_normalizer  3.3.2
chex                0.1.85
click               8.1.7
colorama            0.4.6
comm                0.2.1
contextlib2         NA
croniter            NA
cycler              0.12.1
cython_runtime      NA
dateutil            2.9.0
debugpy             1.8.1
decorator           5.1.1
deepdiff            6.7.1
docrep              0.3.2
etils               1.7.0
exceptiongroup      1.2.0
executing           2.0.1
fastapi             0.110.0
flax                0.8.1
fsspec              2024.2.0
gmpy2               2.1.2
h5py                3.10.0
idna                3.6
importlib_resources NA
ipykernel           6.29.3
jax                 0.4.25
jaxlib        

In [4]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis = 0))

### Read in samples

In [6]:
metadata_dict = {
    "KDM6A_wt_11":	    ("WT",      "KDM6A",    "SCC0203_1_Becker_multiome_D11_A2_outs"),
    "KDM6A_wt_40":	    ["WT",	    "KDM6A",	"SCC0203_2_Becker_multiome_E7_A5_nb40_outs"],
    "KDM6A_KO_34":	    ["KO",	    "KDM6A",    "SCC0203_2_Becker_multiome_E5_A3_nb27_outs"],
    "KDM6A_KO_31":	    ["KO",	    "KDM6A",    "SCC0203_2_Becker_multiome_E6_A4_nb31_outs"],
    "GSKJ4_sham_51":	["sham",    "GSKJ4",    "SCC0203_4_Becker_multiome_2_D6_B6_nb51"],
    "GSKJ4_sham_57":    ["sham",    "GSKJ4"],
    "GSKJ4_treat_47":   ["treated", "GSKJ4"],
    "GSKJ4_treat_52":   ["treated", "GSKJ4"],
}
sample_metadata = pd.DataFrame.from_dict(metadata_dict, orient="index", columns=["genotype", "group", "file"])
sample_metadata.index.name = "sample_id"
sample_metadata

Unnamed: 0_level_0,genotype,group,file
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KDM6A_wt_11,WT,KDM6A,SCC0203_1_Becker_multiome_D11_A2_outs
KDM6A_wt_40,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
KDM6A_KO_34,KO,KDM6A,SCC0203_2_Becker_multiome_E5_A3_nb27_outs
KDM6A_KO_31,KO,KDM6A,SCC0203_2_Becker_multiome_E6_A4_nb31_outs
GSKJ4_sham_51,sham,GSKJ4,SCC0203_4_Becker_multiome_2_D6_B6_nb51
GSKJ4_sham_57,sham,GSKJ4,
GSKJ4_treat_47,treated,GSKJ4,
GSKJ4_treat_52,treated,GSKJ4,


In [7]:
#sample_metadata = pd.read_csv('../data/samples.txt', sep = ',', index_col = 0)
#sample_metadata['sample'] = sample_metadata.index 
#sample_metadata.set_index('file', inplace = True)
#sample_metadata.head()

In [9]:
filenames = sample_metadata.index
filenames

Index(['KDM6A_wt_11', 'KDM6A_wt_40', 'KDM6A_KO_34', 'KDM6A_KO_31',
       'GSKJ4_sham_51', 'GSKJ4_sham_57', 'GSKJ4_treat_47', 'GSKJ4_treat_52'],
      dtype='object', name='sample_id')

In [65]:
source_directories = {sample: "../../../../INBOX/becker_kdm6a/cellranger/" + sample + "/raw_feature_bc_matrix/" for sample in filenames}
target_directory = "../data/raw_data/cellranger/"
for sample, directory in source_directories.items():
    if os.path.exists(directory):
        if not os.path.exists(target_directory + sample + "/"):
            os.makedirs(target_directory + sample + "/")
        else:
            pass
        files = os.listdir(directory)
        shutil.copy(directory + 'barcodes.tsv.gz', target_directory + sample + '/')
        shutil.copy(directory + 'matrix.mtx.gz', target_directory + sample + '/')
        shutil.copy(directory + 'features.tsv.gz', target_directory + sample + '/')

AttributeError: 'enumerate' object has no attribute 'items'

In [87]:
gene_expr_adatas = []
atac_seq_adatas = []

for i, filename in enumerate(filenames):
    adata = scvi.data.read_10x_multiome('../data/raw_data/cellranger/' + str(filename))
     # Separate gene expression and ATAC-seq data
    gene_expr_adata = adata[:, adata.var['modality'] == 'Gene Expression']
    atac_seq_adata = adata[:, adata.var['modality'] == 'Peaks']

    # Add metadata
    for col in sample_metadata.columns:
        print(col)
        gene_expr_adata.obs[col] = sample_metadata[col][i]
        atac_seq_adata.obs[col] = sample_metadata[col][i]

    gene_expr_adatas.append(gene_expr_adata)
    atac_seq_adatas.append(atac_seq_adata)

  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]


genotype


  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


group
file


  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


genotype
group
file


  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]


genotype


  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


group
file


  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]


genotype


  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


group
file


  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]


genotype


  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


group
file


  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]


genotype


  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


group
file


  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]


genotype


  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


group
file


  utils.warn_names_duplicates("var")
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]


genotype
group
file


  utils.warn_names_duplicates("var")
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]
  gene_expr_adata.obs[col] = sample_metadata[col][i]
  atac_seq_adata.obs[col] = sample_metadata[col][i]


In [122]:
gene_expr_adatas[2].X

<722928x32285 sparse matrix of type '<class 'numpy.int64'>'
	with 17520926 stored elements in Compressed Sparse Row format>

In [108]:
gene_expr_adatas[1].obs

Unnamed: 0_level_0,batch_id,genotype,group,file
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACAGCCAAACAACA,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
AAACAGCCAAACCTAT,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
AAACAGCCAAACGCGA,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
AAACAGCCAAACGGGC,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
AAACAGCCAAACTAAG,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
...,...,...,...,...
TTTGTTGGTTTGGGCG,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
TTTGTTGGTTTGGGTA,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
TTTGTTGGTTTGGTTC,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs
TTTGTTGGTTTGTCTA,1,WT,KDM6A,SCC0203_2_Becker_multiome_E7_A5_nb40_outs


In [103]:
anndata.concat(gene_expr_adatas, join='inner')

ValueError: cannot reindex on an axis with duplicate labels

In [74]:
concatenated_gene_expr = gene_expr_adatas[0].concatenate(gene_expr_adatas[1:], join='inner')
concatenated_atac_seq = atac_seq_adatas[0].concatenate(atac_seq_adatas[1:], join='outer')

  concatenated_gene_expr = gene_expr_adatas[0].concatenate(gene_expr_adatas[1:], join='inner')


ValueError: cannot reindex on an axis with duplicate labels

In [None]:
# Step 4: Add 'modality' Column
concatenated_gene_expr.obs['modality'] = 'Gene Expression'
concatenated_atac_seq.obs['modality'] = 'Peaks'

In [None]:
final_adata = concatenated_gene_expr.concatenate(concatenated_atac_seq, join='outer')
print(final_adata.shape)


In [None]:
sample_metadata['sample']

In [None]:
adata1 = scvi.data.read_10x_multiome('../data/' + 'SCC0203_1_Becker_multiome_D11_A2_outs' + '/outs/filtered_feature_bc_matrix/')
adata1.var_names_make_unique()
adata1

In [None]:
adata1.var

In [None]:
adata1.var['modality'].value_counts()

In [None]:
filenames = sample_metadata.index
adatas = [scvi.data.read_10x_multiome('../data/' + str(filename) + '/outs/raw_feature_bc_matrix/') for filename in filenames]
for i in range(len(adatas)):
    adatas[i].obs['sample'] = sample_metadata['sample'][i]
    adatas[i].var_names_make_unique()
    for col in sample_metadata.columns:
        adatas[i].obs[col] = sample_metadata[col][i]
adata = adatas[0].concatenate(adatas[1:], batch_categories = sample_metadata.index, join = 'outer')
adata.shape

In [None]:
adata.var

In [None]:
adata.var['modality'].value_counts()

In [None]:
adata.obs['sample'] = adata.obs['sample'].astype('category')
adata.obs['sample'].cat.categories

In [None]:
adata.obs['genotype'] = adata.obs['genotype'].astype('category')
adata.obs['genotype'].cat.categories

In [None]:
X_is_raw(adata)

In [None]:
adata.var.modality.value_counts()

In [None]:
n = 4004
adata_rna = adata[:n, adata.var.modality == "Gene Expression"].copy()
adata_paired = adata[n : 2 * n].copy()
adata_atac = adata[2 * n :, adata.var.modality == "Peaks"].copy()

### Save merged object

In [None]:
adata.write('../data/Kdm6aKO_multitome_ctl231207.raw.h5ad')