## References

Tutorial: https://docs.scvi-tools.org/en/stable/user_guide/notebooks/MultiVI_tutorial.html <br>
Paper: https://www.biorxiv.org/content/10.1101/2021.08.20.457057v2

## Dataset to prepare

### 1) RNA (scnRNA + Multiome-RNA)
* Read in data: post-CellBender, filtered as the previous HCA object, cell-type annotated
* Subset scnRNA: barcode x gene -> **`adata_rna.h5ad`**
* Subset MultiomeRNA: barcode x gene

### 2) ATAC (snATAC + Multiome-ATAC)
* Read in data: post-cellatac and filtered peaks and nuclei, `6reg-v2_ATACs_filtered.h5ad`
* Subset snATAC: barcode x peak -> **`adata_atac.h5ad`**
* Subset MultiomeATAC: barcode x peak

### 3) Concatenate Multiome RNA+ATAC
barcode x (gene+peak) -> **`adata_paired.h5ad`**

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata
import scipy
import os

In [2]:
import session_info
session_info.show()

## Read in data

**RNA**

In [3]:
path_adata = '/nfs/team205/heart/anndata_objects/'
scrna = sc.read_h5ad(path_adata + 'scRNA_adult.h5ad')
snrna = sc.read_h5ad(path_adata + 'snRNA_adult.h5ad')
multirna = sc.read_h5ad(path_adata + 'multiome_RNA_adult.h5ad')

In [4]:
meta_multiome = pd.read_csv('/nfs/team205/heart/cellatac/tic-1050/6reg-v2_Multiome_metadata.csv', sep = ',', index_col = 0)
meta_scnRNA = pd.read_csv('/nfs/team205/heart/soupremoved/cellbender020/6reg-v1_scnRNA_metadata.csv', sep = ',', index_col = 0)

In [5]:
meta_scnRNA.Donor.astype('category').cat.categories

Index(['1', '11', '2', '3', '4', '5', '6', '7', 'H2', 'H3', 'H4', 'H5', 'H6',
       'H7'],
      dtype='object')

In [6]:
meta_scnRNA.Donor = ['D' + did if 'H' not in did else did for did in meta_scnRNA.Donor]

In [7]:
meta_scnRNA.Donor.astype('category').cat.categories

Index(['D1', 'D11', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'H2', 'H3', 'H4', 'H5',
       'H6', 'H7'],
      dtype='object')

In [8]:
meta_multiome['sample_id'] = meta_multiome['SangerID']
meta_multiome.rename(columns={"Combined_ID": "combined_id"}, inplace=True)
meta_multiome.rename(columns={"Protocol": "protocol"}, inplace=True)
meta_multiome['cell_or_nuclei'] = 'Nuclei'
meta_multiome.rename(columns={"Donor": "donor"}, inplace=True)
meta_multiome.rename(columns={"Region": "region"}, inplace=True)
meta_multiome['donor'] =  'D' + meta_multiome['donor'].astype('str')
meta_multiome['donor_cellnuc'] = meta_multiome['donor'].astype('str') + '_' + meta_multiome['cell_or_nuclei'].astype('str')
meta_multiome['modality'] = 'paired'
meta_multiome[['modality', 'donor_cellnuc', 'donor', 'region', 'cell_or_nuclei']]
meta_multiome = meta_multiome[['sample_id', 'combined_id', "protocol", 'modality', 'donor_cellnuc', 'donor', 'region', 'cell_or_nuclei']]
# meta_multiome.index = meta_multiome.sample_id

In [9]:
meta_scnRNA.rename(columns={"Cell_or_Nuclei": "cell_or_nuclei"}, inplace=True)
meta_scnRNA.rename(columns={"Donor": "donor"}, inplace=True)
meta_scnRNA.rename(columns={"Region": "region"}, inplace=True)
meta_scnRNA['sample_id'] = meta_scnRNA.index
meta_scnRNA['protocol'] = 'RNA'
meta_scnRNA['modality'] = 'expression'
meta_scnRNA['combined_id'] = np.nan
meta_scnRNA['donor_cellnuc'] = meta_scnRNA['donor'].astype('str') + '_' + meta_scnRNA['cell_or_nuclei'].astype('str')
meta_scnRNA = meta_scnRNA[['sample_id', 'combined_id', "protocol", 'modality', 'donor_cellnuc', 'donor', 'region', 'cell_or_nuclei']]
# meta_scnRNA.index = meta_scnRNA.sample_id

In [10]:
metadata = pd.concat([meta_multiome, meta_scnRNA], ignore_index=True)

In [11]:
metadata.donor.astype('category').cat.categories

Index(['D1', 'D11', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'H2', 'H3', 'H4',
       'H5', 'H6', 'H7'],
      dtype='object')

In [12]:
barcodes = multirna.obs.index
multirna.obs = multirna.obs.merge(metadata, on='sample_id', how='left')
multirna.obs.index = barcodes

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [13]:
barcodes = scrna.obs.index
scrna.obs = scrna.obs.merge(metadata, on='sample_id', how='left')
scrna.obs.index = barcodes

In [14]:
barcodes = snrna.obs.index
snrna.obs = snrna.obs.merge(metadata, on='sample_id', how='left')
snrna.obs.index = barcodes

In [15]:
rna = scrna.concatenate([snrna, multirna], index_unique = None)

In [16]:
# add modality to .var
rna.var['modality']='Gene Expression'

In [17]:
rna.obs['modality'].value_counts()

expression    618913
paired         69620
Name: modality, dtype: int64

**ATAC**

In [18]:
atac = sc.read_h5ad('/nfs/team205/heart/anndata_objects/6region_v2/6reg-v2_ATACs_filtered.h5ad')

# add intra-batchkey: 'donor_cellnuc'
atac.obs['cell_or_nuclei']='nuclei'
atac.obs.rename(columns={'sangerID':'sample_id'},inplace=True)
atac.obs.rename(columns={'Region':'region'},inplace=True)
atac.obs.rename(columns={'Donor':'donor'},inplace=True)
atac.obs['donor_cellnuc']=atac.obs['donor'].astype('str')+'_'+atac.obs['cell_or_nuclei'].astype('str')
atac.obs['protocol'] = 'ATAC'
# atac.obs['combined_id'] = np.nan

# add modality to .var
atac.var['modality']='Peaks'


atac

AnnData object with n_obs × n_vars = 79208 × 102627
    obs: 'cellatac_clusters', 'cellatac_code', 'sample_id', 'dataset', 'donor', 'region', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'barcode', 'oribarcode', 'cell_or_nuclei', 'donor_cellnuc', 'protocol'
    var: 'peak_width', 'exon', 'gene', 'promoter', 'annotation', 'gene_name', 'gene_id', 'tss_distance', 'ENCODE_blacklist', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'modality'
    layers: 'binary_raw'

In [19]:
atac.obs.dataset

fullbarcode
HCAHeart8374324_AAACGAATCAAACCCA-1            LV
HCAHeart8374324_AAAGGATAGGCACTAG-1            LV
HCAHeart8374324_AAAGGGCAGCGAGCTA-1            LV
HCAHeart8374324_AAAGGGCAGTGATATG-1            LV
HCAHeart8374324_AAATGAGTCCGGGCAT-1            LV
                                         ...    
HCAHeart9917178_TTTGGTAAGTATTGTG-1    Multiome_2
HCAHeart9917178_TTTGTCCCACAAAGCG-1    Multiome_2
HCAHeart9917178_TTTGTCCCATCGCTTT-1    Multiome_2
HCAHeart9917178_TTTGTGTTCAGGATGA-1    Multiome_2
HCAHeart9917178_TTTGTTGGTGTTGTAG-1    Multiome_2
Name: dataset, Length: 79208, dtype: category
Categories (8, object): ['RA', 'LA', 'RV', 'LV', 'SP', 'AX', 'Multiome_1', 'Multiome_2']

In [20]:
atac.obs=atac.obs[['sample_id', 'protocol', 'donor_cellnuc', 
                   'donor', 'region', 'cell_or_nuclei', 
                   'cellatac_clusters','cellatac_code', 'dataset', 'barcode', 'oribarcode']]
atac.var=atac.var[['modality','peak_width', 'exon', 'gene', 'promoter', 'annotation', 'gene_name','gene_id', 'tss_distance', 'ENCODE_blacklist',]]

# set 'modality' in .obs as 'snATAC' and 'Multiome'
atac.obs['modality']=atac.obs['dataset'].copy()

# atac.obs.replace({'modality':{
#     'Multiome_1':'Multiome',
#     'Multiome_2':'Multiome',
#     'LV':'snATAC',
#     'AX':'snATAC',
#     'SP':'snATAC',
#     'RA':'snATAC',
#     'LA':'snATAC',
#     'RV':'snATAC',
# }},inplace=True)


atac.obs.replace({'modality':{
    'Multiome_1':'paired',
    'Multiome_2':'paired',
    'LV':'accessibility',
    'AX':'accessibility',
    'SP':'accessibility',
    'RA':'accessibility',
    'LA':'accessibility',
    'RV':'accessibility',
}},inplace=True)


atac.obs['modality'].value_counts()

accessibility    48098
paired           31110
Name: modality, dtype: int64

In [21]:
atac.X.data[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

## Generate `adata_paired.h5ad`

In [22]:
# rna.obs.loc[rna.obs.modality == 'Multiome-RNA','sample_id'].astype('category')
rna.obs.loc[rna.obs.modality == 'paired','sample_id'].astype('category')

barcode
HCAHeart9508627_CTTGAACAGTAATCCA-1    HCAHeart9508627
HCAHeart9508627_GGGCATGAGCAAGACA-1    HCAHeart9508627
HCAHeart9508627_CGGCTCACATCTTGAG-1    HCAHeart9508627
HCAHeart9508627_AGGTGAGGTTTGCGCC-1    HCAHeart9508627
HCAHeart9508627_GTGCCTTTCACGAATC-1    HCAHeart9508627
                                           ...       
HCAHeart9845436_GTAAGCTTCAAGGACA-1    HCAHeart9845436
HCAHeart9845436_TAAGCTATCGCACAAT-1    HCAHeart9845436
HCAHeart9845436_GATAACGAGGTTAGCT-1    HCAHeart9845436
HCAHeart9845436_AGGATGTCAATTTGGT-1    HCAHeart9845436
HCAHeart9845436_ACTTATGAGAATCGCT-1    HCAHeart9845436
Name: sample_id, Length: 69620, dtype: category
Categories (9, object): ['HCAHeart9508627', 'HCAHeart9508628', 'HCAHeart9508629', 'HCAHeart9845431', ..., 'HCAHeart9845433', 'HCAHeart9845434', 'HCAHeart9845435', 'HCAHeart9845436']

**Multiome, RNA**

In [23]:
# subset Multiome-RNA
# multiome_rna=rna[rna.obs['modality']=='Multiome-RNA']
multiome_rna=rna[rna.obs['modality']=='paired']

# prepare GEM barcodes
multiome_rna.obs['barcode']=multiome_rna.obs.index.copy()
multiome_rna.obs['barcode']=[x.split('_')[1] for x in multiome_rna.obs.index]

# add Combined_ID
# multiome_rna.obs.rename(columns={'sample_id':'rna_sangerID'},inplace=True)
# multiome_rna.obs=multiome_rna.obs.merge(meta[['SangerID','Combined_ID']],how='left',left_on='rna_sangerID',right_on='SangerID')

# multiome_rna.obs=multiome_rna.obs.merge(meta_multiome, how='left',left_on='sample_id', right_on='sample_id')
# multiome_rna.obs.drop(['SangerID'],axis=1,inplace=True)

# set index with fullbarcode: Combined_ID + barcodes
multiome_rna.obs['combined_barcode']=multiome_rna.obs['combined_id'].astype('str')+ \
                                     '_'+ multiome_rna.obs['barcode'].astype('str')
multiome_rna.obs.set_index('combined_barcode',inplace=True)

multiome_rna

Trying to set attribute `.obs` of view, copying.


AnnData object with n_obs × n_vars = 69620 × 31915
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sample_id', 'Foetal_or_Adult', 'Provider', 'Modality', 'Mapping_ver', 'Reference_genome', 'CellBender_out', 'n_cells', 'multiplet_rate', 'batch', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'combined_id', 'protocol', 'modality', 'donor_cellnuc', 'donor', 'region', 'cell_or_nuclei', 'Chemistry', 'n_nuclei', 'barcode'
    var: 'ambient_expression-0-0', 'genes-0-0', 'ambient_expression-1-0', 'feature_type-1-0', 'id-1-0', 'ambient_expression-10-0', 'feature_type-10-0', 'id-10-0', 'ambient_expression-11-0', 'feature_type-11-0', 'id-11-0', 'ambient_expression-12-0', 'feature_type-12-0', 'id-12-0', 'ambient_expression-13-0', 'feature_type-13-0', 'id-13-0', 'ambient_expression-14-0', 'feature_type-14-0', 'id-14-0', 'ambient_expression-15-0', 'feature_type-15-0'

In [24]:
multiome_rna.obs.head()

Unnamed: 0_level_0,latent_RT_efficiency,latent_cell_probability,latent_scale,sample_id,Foetal_or_Adult,Provider,Modality,Mapping_ver,Reference_genome,CellBender_out,...,combined_id,protocol,modality,donor_cellnuc,donor,region,cell_or_nuclei,Chemistry,n_nuclei,barcode
combined_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeart9508627_HCAHeart9508819_CTTGAACAGTAATCCA-1,9.639693,0.69764,917.024841,HCAHeart9508627,Adult,Sanger Heart Mona-Carlos,Multiome-RNA,cellranger-arc-1.0.1,GRCh38-2020-A,,...,HCAHeart9508627_HCAHeart9508819,RNA,paired,D3_Nuclei,D3,LV,Nuclei,Single Cell Multiome ATAC + Gene Expression v1,3686.0,CTTGAACAGTAATCCA-1
HCAHeart9508627_HCAHeart9508819_GGGCATGAGCAAGACA-1,8.256632,0.999412,3547.850098,HCAHeart9508627,Adult,Sanger Heart Mona-Carlos,Multiome-RNA,cellranger-arc-1.0.1,GRCh38-2020-A,,...,HCAHeart9508627_HCAHeart9508819,RNA,paired,D3_Nuclei,D3,LV,Nuclei,Single Cell Multiome ATAC + Gene Expression v1,3686.0,GGGCATGAGCAAGACA-1
HCAHeart9508627_HCAHeart9508819_CGGCTCACATCTTGAG-1,8.386842,0.999705,3879.808838,HCAHeart9508627,Adult,Sanger Heart Mona-Carlos,Multiome-RNA,cellranger-arc-1.0.1,GRCh38-2020-A,,...,HCAHeart9508627_HCAHeart9508819,RNA,paired,D3_Nuclei,D3,LV,Nuclei,Single Cell Multiome ATAC + Gene Expression v1,3686.0,CGGCTCACATCTTGAG-1
HCAHeart9508627_HCAHeart9508819_AGGTGAGGTTTGCGCC-1,9.085649,0.987425,1773.257812,HCAHeart9508627,Adult,Sanger Heart Mona-Carlos,Multiome-RNA,cellranger-arc-1.0.1,GRCh38-2020-A,,...,HCAHeart9508627_HCAHeart9508819,RNA,paired,D3_Nuclei,D3,LV,Nuclei,Single Cell Multiome ATAC + Gene Expression v1,3686.0,AGGTGAGGTTTGCGCC-1
HCAHeart9508627_HCAHeart9508819_GTGCCTTTCACGAATC-1,9.497276,0.820139,971.152283,HCAHeart9508627,Adult,Sanger Heart Mona-Carlos,Multiome-RNA,cellranger-arc-1.0.1,GRCh38-2020-A,,...,HCAHeart9508627_HCAHeart9508819,RNA,paired,D3_Nuclei,D3,LV,Nuclei,Single Cell Multiome ATAC + Gene Expression v1,3686.0,GTGCCTTTCACGAATC-1


**Multiome, ATAC**

In [25]:
# subset Multiome-ATAC
multiome_atac=atac[atac.obs['modality']=='paired']

# add Combined_ID
# multiome_atac.obs.rename(columns={'sangerID':'atac_sangerID'},inplace=True)
# multiome_atac.obs=multiome_atac.obs.merge(meta[['SangerID','Combined_ID']],how='left',left_on='atac_sangerID',right_on='SangerID')

# multiome_atac.obs.drop(['SangerID'],axis=1,inplace=True)

multiome_atac.obs=multiome_atac.obs.merge(meta_multiome[['sample_id','combined_id']],how='left',
                                          left_on='sample_id',right_on='sample_id')

# set index with fullbarcode: Combined_ID + barcodes
multiome_atac.obs['combined_barcode']=multiome_atac.obs['combined_id'].astype('str')+ \
                                     '_'+ multiome_atac.obs['barcode'].astype('str')
multiome_atac.obs.set_index('combined_barcode',inplace=True)

multiome_atac

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


AnnData object with n_obs × n_vars = 31110 × 102627
    obs: 'sample_id', 'protocol', 'donor_cellnuc', 'donor', 'region', 'cell_or_nuclei', 'cellatac_clusters', 'cellatac_code', 'dataset', 'barcode', 'oribarcode', 'modality', 'combined_id'
    var: 'modality', 'peak_width', 'exon', 'gene', 'promoter', 'annotation', 'gene_name', 'gene_id', 'tss_distance', 'ENCODE_blacklist'
    layers: 'binary_raw'

In [26]:
multiome_atac.obs.head()

Unnamed: 0_level_0,sample_id,protocol,donor_cellnuc,donor,region,cell_or_nuclei,cellatac_clusters,cellatac_code,dataset,barcode,oribarcode,modality,combined_id
combined_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
HCAHeart9508627_HCAHeart9508819_AAACATGCAGCAGGTA-1,HCAHeart9508819,ATAC,D3_nuclei,D3,LV,nuclei,21,22,Multiome_1,AAACATGCAGCAGGTA-1,22-AAACATGCAGCAGGTA-1,paired,HCAHeart9508627_HCAHeart9508819
HCAHeart9508627_HCAHeart9508819_AAACATGCATAGCTGC-1,HCAHeart9508819,ATAC,D3_nuclei,D3,LV,nuclei,13,22,Multiome_1,AAACATGCATAGCTGC-1,22-AAACATGCATAGCTGC-1,paired,HCAHeart9508627_HCAHeart9508819
HCAHeart9508627_HCAHeart9508819_AAACCAACAAGGTGCA-1,HCAHeart9508819,ATAC,D3_nuclei,D3,LV,nuclei,6,22,Multiome_1,AAACCAACAAGGTGCA-1,22-AAACCAACAAGGTGCA-1,paired,HCAHeart9508627_HCAHeart9508819
HCAHeart9508627_HCAHeart9508819_AAACCAACACTTAACG-1,HCAHeart9508819,ATAC,D3_nuclei,D3,LV,nuclei,13,22,Multiome_1,AAACCAACACTTAACG-1,22-AAACCAACACTTAACG-1,paired,HCAHeart9508627_HCAHeart9508819
HCAHeart9508627_HCAHeart9508819_AAACCAACAGCCGCTA-1,HCAHeart9508819,ATAC,D3_nuclei,D3,LV,nuclei,13,22,Multiome_1,AAACCAACAGCCGCTA-1,22-AAACCAACAGCCGCTA-1,paired,HCAHeart9508627_HCAHeart9508819


**Concatenate Multiome-RNA and Multiome-ATAC**

In [27]:
# take intersecting barcodes between 
barcodes_inter=list(set(multiome_rna.obs_names).intersection(multiome_atac.obs_names))
len(barcodes_inter)

30638

In [28]:
multiome_rna=multiome_rna[barcodes_inter,:]
multiome_atac=multiome_atac[barcodes_inter,:]
all(multiome_rna.obs_names==multiome_atac.obs_names)

True

In [29]:
# multiome_rna.obs[['Combined_ID','rna_sangerID','barcode','donor', 
#                   'age_group','region','cell_or_nuclei', 'gender', 
#                   'type', 'cell_states','modality','donor_cellnuc']]

# adata_paired=anndata.AnnData(
#     X=scipy.sparse.hstack([multiome_rna.X,multiome_atac.X]).tocsr(), # concatenate sparse matrix and convert to Compressed Sparse Row format
#     obs=pd.concat([multiome_rna.obs, \
#                    multiome_atac.obs[['atac_sangerID','cellatac_clusters','cellatac_code',]]
#                   ],axis=1),
#     var=pd.concat([multiome_rna.var[['modality','genes-0-0']], \
#                    multiome_atac.var[['modality','peak_width', 'exon', 'gene', 'promoter', 'annotation', 'gene_name','gene_id', 'tss_distance', 'ENCODE_blacklist']]
#                   ],axis=0)
# )


adata_paired=anndata.AnnData(
    X=scipy.sparse.hstack([multiome_rna.X,multiome_atac.X]).tocsr(), # concatenate sparse matrix and convert to Compressed Sparse Row format
    obs=pd.concat([multiome_rna.obs, \
                   multiome_atac.obs[['cellatac_clusters','cellatac_code',]]
                  ],axis=1),
    var=pd.concat([multiome_rna.var[['modality','genes-0-0']], \
                   multiome_atac.var[['modality','peak_width', 'exon', 'gene', 'promoter', 'annotation', 'gene_name','gene_id', 'tss_distance', 'ENCODE_blacklist']]
                  ],axis=0)
)


adata_paired

AnnData object with n_obs × n_vars = 30638 × 134542
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sample_id', 'Foetal_or_Adult', 'Provider', 'Modality', 'Mapping_ver', 'Reference_genome', 'CellBender_out', 'n_cells', 'multiplet_rate', 'batch', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'combined_id', 'protocol', 'modality', 'donor_cellnuc', 'donor', 'region', 'cell_or_nuclei', 'Chemistry', 'n_nuclei', 'barcode', 'cellatac_clusters', 'cellatac_code'
    var: 'modality', 'genes-0-0', 'peak_width', 'exon', 'gene', 'promoter', 'annotation', 'gene_name', 'gene_id', 'tss_distance', 'ENCODE_blacklist'

In [30]:
adata_paired.obs.sample_id.isna().sum()

0

In [31]:
directory = path_adata + 'MultiVI'
if not os.path.exists(directory):
    os.makedirs(directory)

In [32]:
adata_paired.obs.drop(['CellBender_out'], axis=1, inplace=True)

In [33]:
# save
adata_paired.write(directory + '/adata_paired.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_id' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Provider' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Modality' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Mapping_ver' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Reference_genome' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'scrublet_leiden' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'combined_id' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'protocol' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'modality' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'don

In [34]:
rna.obs['modality'].value_counts()

expression    618913
paired         69620
Name: modality, dtype: int64

In [35]:
adata_rna=rna[rna.obs['modality']=='expression',:]

In [36]:
adata_rna.obs['modality'].value_counts()

expression    618913
Name: modality, dtype: int64

In [37]:
adata_rna

View of AnnData object with n_obs × n_vars = 618913 × 31915
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sample_id', 'Foetal_or_Adult', 'Provider', 'Modality', 'Mapping_ver', 'Reference_genome', 'CellBender_out', 'n_cells', 'multiplet_rate', 'batch', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'combined_id', 'protocol', 'modality', 'donor_cellnuc', 'donor', 'region', 'cell_or_nuclei', 'Chemistry', 'n_nuclei'
    var: 'ambient_expression-0-0', 'genes-0-0', 'ambient_expression-1-0', 'feature_type-1-0', 'id-1-0', 'ambient_expression-10-0', 'feature_type-10-0', 'id-10-0', 'ambient_expression-11-0', 'feature_type-11-0', 'id-11-0', 'ambient_expression-12-0', 'feature_type-12-0', 'id-12-0', 'ambient_expression-13-0', 'feature_type-13-0', 'id-13-0', 'ambient_expression-14-0', 'feature_type-14-0', 'id-14-0', 'ambient_expression-15-0', 'feature_type-15-0', 

In [38]:
adata_rna.obs.sample_id.isna().sum()

0

In [39]:
adata_rna.obs.combined_id.isna().sum()

618913

In [40]:
adata_rna.obs.drop(columns=['combined_id'], inplace=True)

In [41]:
adata_rna.write(directory + '/adata_scnrna.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_id' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Provider' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Modality' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Mapping_ver' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Reference_genome' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'CellBender_out' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'scrublet_leiden' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'protocol' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'modality' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing '

In [42]:
atac.obs['modality'].value_counts()

accessibility    48098
paired           31110
Name: modality, dtype: int64

In [43]:
adata_atac = atac[atac.obs['modality']=='accessibility']

In [44]:
adata_atac.write(directory + '/adata_atac.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
Trying to set attribute `.obs` of view, copying.
... storing 'protocol' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
Trying to set attribute `.obs` of view, copying.
... storing 'donor_cellnuc' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
Trying to set attribute `.obs` of view, copying.
... storing 'cell_or_nuclei' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
Trying to set attribute `.obs` of view, copying.
... storing 'modality' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
Trying to set attribute `.var` of view, copying.
... storing 'modality' as categorical
