In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import scipy

from scipy.io import mmwrite, mmread
from scipy.sparse import csr_matrix

import sctk

In [2]:
# Show session information for this kernel via the session_info package
# (presumably the versions of the imported packages; the output is not captured in this export).
import session_info
session_info.show()

In [13]:
# Configure scanpy's figure defaults with a resolution of 120 dpi.
sc.settings.set_figure_params(dpi=120)

# Variables

In [7]:
# .obs column used to select and tabulate cell types
celltype_key = 'fine_grain'

# Labels in .obs[celltype_key] that are kept for this analysis
cells_to_analyse = [
    'MonocytesMPOpos',
    'Monocytes',
    'MonocyteDerivedCells',
    'MacrophagesCX3CR1pos',
    'MacrophagesTIMD4pos',
    'MacrophagesLYVE1pos',
    'MacrophagesATF3pos',
]

# Values of .obs['region'] that are kept for this analysis
region_to_analyse = ['whole sample']

# Read in

In [10]:
# List the contents of the foetal AnnData directory that the inputs below are read from.
!ls /nfs/team205/heart/anndata_objects/Foetal

14Feb_fromSemih		      metadata
archive			      multiome_ATAC
Feb28ObjectRaw.h5ad	      QC
Feb28ObjectRaw_Multiome.h5ad  RNA_foetal_post-scVI_raw.h5ad
Feb28ObjectRaw_Multiome.rds   RNA_foetal_raw.h5ad
Feb28ObjectRaw.obs.csv	      scVI
Feb28ObjectRaw_original.h5ad  visium_foetal_raw.h5ad
Immune


In [22]:
# rna
# Load the RNA object and report the cell-type abundances before any subsetting.
adata_rna = sc.read_h5ad('/nfs/team205/heart/anndata_objects/Foetal/Feb28ObjectRaw.h5ad')
print(adata_rna.obs[celltype_key].value_counts())

# subset cells and regions with one combined mask.
# .copy() turns the subset into a real AnnData: without it `adata_rna` is a view,
# and the .var assignment below would write to that view (implicit copy + warning).
keep_cells = (
    adata_rna.obs[celltype_key].isin(cells_to_analyse)
    & adata_rna.obs['region'].isin(region_to_analyse)
)
adata_rna = adata_rna[keep_cells].copy()

# add modality to .var
adata_rna.var['modality'] = 'Gene Expression'

adata_rna

GreatVesselAdventitialFibroblasts        61593
GreatVesselSmoothMuscleCells             22992
VentricularCardiomyocytesRightCompact    16215
VentricularCardiomyocytesLeftCompact     15591
MyocardialInterstitialFibroblasts1       13880
                                         ...  
MacrophagesTIMD4pos                        470
MonocytesMPOpos                            414
Megakaryocytes                             360
ChromaffinCells                            228
DendriticCellsMature                       184
Name: fine_grain, Length: 63, dtype: int64


AnnData object with n_obs × n_vars = 7905 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score'
    var: 'gene_name_scRNA-0', 'gene_id', 'modality'
    uns: 'FACSgate_colors', '_scvi_manager_uuid', '_scvi_uuid', 'age_colors', 'ce

In [23]:
# RNA cells per cell type (rows) and region (columns) after subsetting
pd.crosstab(
    index=adata_rna.obs[celltype_key],
    columns=adata_rna.obs['region'],
)

region,whole sample
fine_grain,Unnamed: 1_level_1
MonocytesMPOpos,204
Monocytes,498
MonocyteDerivedCells,207
MacrophagesCX3CR1pos,835
MacrophagesTIMD4pos,116
MacrophagesLYVE1pos,5117
MacrophagesATF3pos,928


In [24]:
# RNA cells per cell type (rows) and 10x kit (columns)
pd.crosstab(
    index=adata_rna.obs[celltype_key],
    columns=adata_rna.obs['kit_10x'],
)

kit_10x,5prime-v2,Multiome-v1
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1
MonocytesMPOpos,201,3
Monocytes,468,30
MonocyteDerivedCells,137,70
MacrophagesCX3CR1pos,695,140
MacrophagesTIMD4pos,79,37
MacrophagesLYVE1pos,3556,1561
MacrophagesATF3pos,639,289


In [25]:
# RNA cells per cell type (rows) and cycling status (columns)
pd.crosstab(
    index=adata_rna.obs[celltype_key],
    columns=adata_rna.obs['cycling'],
)

cycling,no,yes
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1
MonocytesMPOpos,21,183
Monocytes,431,67
MonocyteDerivedCells,179,28
MacrophagesCX3CR1pos,644,191
MacrophagesTIMD4pos,108,8
MacrophagesLYVE1pos,4145,972
MacrophagesATF3pos,857,71


In [26]:
# Number of retained RNA cells per 10x kit.
adata_rna.obs['kit_10x'].value_counts()

5prime-v2      5775
Multiome-v1    2130
Name: kit_10x, dtype: int64

In [16]:
# atac
# ie only Multiome data
adata_atac = sc.read_h5ad('/nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output/PeakMatrix/Foetal_Peaks.h5ad')

# subset cells (still a view here; the counts are reported across all regions)
adata_atac = adata_atac[adata_atac.obs[celltype_key].isin(cells_to_analyse)]
print(adata_atac.obs[celltype_key].value_counts())

# subset regions.
# .copy() turns the subset into a real AnnData: without it `adata_atac` is a view,
# and the .var assignment below would write to that view (implicit copy + warning).
adata_atac = adata_atac[adata_atac.obs['region'].isin(region_to_analyse)].copy()

# add modality to .var
adata_atac.var['modality'] = 'Peaks'

adata_atac

MacrophagesLYVE1pos     4157
MacrophagesATF3pos       611
MacrophagesCX3CR1pos     551
MonocyteDerivedCells     386
MacrophagesTIMD4pos      303
Monocytes                117
MonocytesMPOpos           34
Name: fine_grain, dtype: int64


AnnData object with n_obs × n_vars = 1899 × 508040
    obs: 'sangerID', 'combinedID', 'donor', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'multiplexed', 'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester', 'fine_grain', 'mid_grain', 'coarse_grain', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score'
    var: 'modality'

In [17]:
# ATAC cells per cell type (rows) and region (columns) after subsetting
pd.crosstab(
    index=adata_atac.obs[celltype_key],
    columns=adata_atac.obs['region'],
)

region,whole sample
fine_grain,Unnamed: 1_level_1
MacrophagesATF3pos,259
MacrophagesCX3CR1pos,131
MacrophagesLYVE1pos,1383
MacrophagesTIMD4pos,33
MonocyteDerivedCells,62
Monocytes,28
MonocytesMPOpos,3


# Prepare paired anndata

In [28]:
# shared barcodes, ie Multiome data (not all, since MultiomeRNA has a bit more data than MultiomeATAC)
# Select them in RNA order rather than through a Python set: the iteration order of a
# set of strings changes between interpreter sessions, so the row order of the saved
# object would otherwise differ from run to run.
barcodes_inter = adata_rna.obs_names[adata_rna.obs_names.isin(adata_atac.obs_names)]
print(f'{len(barcodes_inter)} shared barcodes')

# take shared cells
multiome_rna = adata_rna[barcodes_inter, :]
multiome_atac = adata_atac[barcodes_inter, :]

# Both matrices are stacked column-wise, so row i must be the same cell in each.
assert (multiome_rna.obs_names == multiome_atac.obs_names).all(), \
    'RNA and ATAC rows are not aligned'

# .obs columns carried over from the RNA object into the paired object
paired_obs_columns = [
    'sangerID', 'combinedID', 'donor',
    'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
    'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts',
    'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
    'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt',
    'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo',
    'pct_counts_ribo', 'HB_score', 'multiplexed',
    'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester',
    'fine_grain', 'mid_grain', 'coarse_grain',
    'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase',
    'stress_score', 'hb1_score',
]

# concatenate: genes first, then peaks, in both X (columns) and .var (rows)
adata_paired = anndata.AnnData(
    # concatenate sparse matrix and convert to Compressed Sparse Row format
    X=scipy.sparse.hstack([multiome_rna.X, multiome_atac.X]).tocsr(),
    obs=multiome_rna.obs[paired_obs_columns],
    var=pd.concat(
        [
            multiome_rna.var[['modality', 'gene_name_scRNA-0', 'gene_id']],
            multiome_atac.var[['modality']],
        ],
        axis=0,
    ),
)
adata_paired

AnnData object with n_obs × n_vars = 1899 × 544641
    obs: 'sangerID', 'combinedID', 'donor', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'multiplexed', 'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester', 'fine_grain', 'mid_grain', 'coarse_grain', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score'
    var: 'modality', 'gene_name_scRNA-0', 'gene_id'

In [29]:
# modify modality columns in .obs
# Every cell of the paired object gets the single label 'Multiome';
# the value counts below confirm the assignment.
adata_paired.obs['modality'] = 'Multiome'
adata_paired.obs['modality'].value_counts()

Multiome    1899
Name: modality, dtype: int64

In [31]:
# Write the paired (RNA + peaks) object to disk as .h5ad
adata_paired.write_h5ad(
    '/nfs/team205/heart/anndata_objects/Foetal/Immune/MultiVI-pre_paired_foetal_MoMP_raw.h5ad'
)

In [None]:
# Preview the first rows of the paired object's .obs table.
adata_paired.obs.head()

# Prepare RNA data

In [32]:
# Keep only cells from the single-cell RNA kits (3p and 5p)
scrna_kits = ['3prime-v3', '5prime-v2']
is_scrna = adata_rna.obs['kit_10x'].isin(scrna_kits)
adata_scrna = adata_rna[is_scrna]

# Cells retained per kit
adata_scrna.obs['kit_10x'].value_counts()

5prime-v2    5775
Name: kit_10x, dtype: int64

In [33]:
# Peek at the first 10 stored values of X (presumably to confirm it holds raw counts).
adata_scrna.X.data[:10]

array([1., 5., 1., 2., 1., 3., 1., 1., 2., 1.], dtype=float32)

In [34]:
# recreate anndata
# Build a fresh AnnData from the scRNA subset, keeping only the listed .obs and .var columns.
scrna_obs_columns = [
    'sangerID', 'combinedID', 'donor',
    'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
    'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts',
    'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
    'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt',
    'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo',
    'pct_counts_ribo', 'HB_score', 'multiplexed',
    'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester',
    'fine_grain', 'mid_grain', 'coarse_grain',
    'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase',
    'stress_score', 'hb1_score',
]
scrna_var_columns = ['modality', 'gene_name_scRNA-0', 'gene_id']

adata_scrna = anndata.AnnData(
    X=adata_scrna.X,  # expression matrix of the selected cells, passed through unchanged
    obs=adata_scrna.obs[scrna_obs_columns],
    var=adata_scrna.var[scrna_var_columns],
)
adata_scrna

AnnData object with n_obs × n_vars = 5775 × 36601
    obs: 'sangerID', 'combinedID', 'donor', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'multiplexed', 'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester', 'fine_grain', 'mid_grain', 'coarse_grain', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score'
    var: 'modality', 'gene_name_scRNA-0', 'gene_id'

In [35]:
# modify modality columns in .obs
# Every cell of the scRNA object gets the single label 'scRNA';
# the value counts below confirm the assignment.
adata_scrna.obs['modality'] = 'scRNA'
adata_scrna.obs['modality'].value_counts()

scRNA    5775
Name: modality, dtype: int64

In [36]:
# Display the .var table (modality, gene name, gene id) of the rebuilt scRNA object.
adata_scrna.var

Unnamed: 0_level_0,modality,gene_name_scRNA-0,gene_id
gene_name_multiome-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MIR1302-2HG,Gene Expression,MIR1302-2HG,ENSG00000243485
FAM138A,Gene Expression,FAM138A,ENSG00000237613
OR4F5,Gene Expression,OR4F5,ENSG00000186092
AL627309.1,Gene Expression,AL627309.1,ENSG00000238009
AL627309.3,Gene Expression,AL627309.3,ENSG00000239945
...,...,...,...
AC141272.1,Gene Expression,AC141272.1,ENSG00000277836
AC023491.2,Gene Expression,AC023491.2,ENSG00000278633
AC007325.1,Gene Expression,AC007325.1,ENSG00000276017
AC007325.4,Gene Expression,AC007325.4,ENSG00000278817


In [37]:
# Write the scRNA-only object to disk as .h5ad
adata_scrna.write_h5ad(
    '/nfs/team205/heart/anndata_objects/Foetal/Immune/MultiVI-pre_scRNA_foetal_MoMP_raw.h5ad'
)