Ref: is the peak matrix extracted with `getMatrixFromProject` normalized? No.<br>
https://github.com/GreenleafLab/ArchR/discussions/943

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata

from scipy.io import mmwrite, mmread
from scipy.sparse import csr_matrix

In [2]:
# List the PeakMatrix export directory to check which files are available to read
!ls -l /nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output/PeakMatrix

total 3673116
-rw-r--r-- 1 20383 1000624    1492397 Mar 12 20:40 barcodes.tsv.gz
-rw-r--r-- 1 20383 1000624    4082604 Mar 12 20:40 features.tsv.gz
-rw-r--r-- 1 20383 1000624 3755691976 Mar 12 20:40 matrix.mtx.gz


## Read in

In [3]:
# Directory holding matrix.mtx.gz / features.tsv.gz / barcodes.tsv.gz;
# the resulting .h5ad is also written here.
peakmatrix_dir = '/nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output/PeakMatrix'

In [4]:
# Load the peak matrix (not binarised) from the Matrix Market export.
# mmread returns it as a sparse COO matrix.
matrix_path = f'{peakmatrix_dir}/matrix.mtx.gz'
mat = mmread(matrix_path)
mat

<508040x167022 sparse matrix of type '<class 'numpy.int64'>'
	with 1173033588 stored elements in COOrdinate format>

In [5]:
# Peak names: keep only the first column of features.tsv.gz and use it as the
# index, leaving a column-less frame that serves as the initial `var` table.
features_path = f'{peakmatrix_dir}/features.tsv.gz'
features = pd.read_csv(features_path, sep='\t', header=None)[[0]].set_index(0)
features

chr1:817093_817593
chr1:819818_820318
chr1:820542_821042
chr1:825246_825746
chr1:826566_827066
...
chrX:155820072_155820572
chrX:155874490_155874990
chrX:155880548_155881048
chrX:155881064_155881564
chrX:155881618_155882118


In [6]:
# Cell barcodes: keep only the second column of barcodes.tsv.gz and use it as the
# index, leaving a column-less frame that serves as the initial `obs` table.
barcodes_path = f'{peakmatrix_dir}/barcodes.tsv.gz'
cells = pd.read_csv(barcodes_path, sep='\t', header=None)[[1]].set_index(1)
cells

7089STDY13216921_BHF_F_Hea13242528#GCAATAGAGTTATGTG-1
7089STDY13216921_BHF_F_Hea13242528#TGGGCCTAGATGGACA-1
7089STDY13216921_BHF_F_Hea13242528#TCCAGGTCACAAACTT-1
7089STDY13216921_BHF_F_Hea13242528#GGTCAATTCATTAGCA-1
7089STDY13216921_BHF_F_Hea13242528#TACCCGGCATAATGTC-1
...
BHF_F_Hea11933675_BHF_F_Hea11596628#GTGATCAGTACGGGTT-1
BHF_F_Hea11933675_BHF_F_Hea11596628#TCTACCTCATCGTTCT-1
BHF_F_Hea11933675_BHF_F_Hea11596628#CCGTTAACAGCCTTGG-1
BHF_F_Hea11933675_BHF_F_Hea11596628#GTACTAATCGTTTCCA-1
7089STDY13216923_BHF_F_Hea13242530#GCTGGTTCAAATTCGT-1


## Make anndata

In [7]:
# Build the AnnData object with cells as observations and peaks as variables.
#
# `mat` is read as peaks x cells in COO format, so it is transposed and converted
# to CSR *before* being handed to AnnData: COO input is not a supported storage
# format for `X` in recent anndata releases. The dtype is fixed to float32
# explicitly so the stored type does not depend on an implicit cast inside anndata.
counts = csr_matrix(mat.T, dtype=np.float32)

adata = anndata.AnnData(
    X=counts,
    obs=cells,
    var=features
)

# rename obs_names ('#' -> '_').
# Assigning a plain list (rather than an Index) also clears the integer index
# name inherited from `cells`.
adata.obs_names = [x.replace('#','_') for x in adata.obs_names]

# sparse count matrix (CSR)
adata.X

<167022x508040 sparse matrix of type '<class 'numpy.float32'>'
	with 1173033588 stored elements in Compressed Sparse Row format>

In [8]:
# Display the AnnData summary (n_obs x n_vars)
adata

AnnData object with n_obs × n_vars = 167022 × 508040

In [9]:
# Peek at the first 10 stored values of the sparse matrix
# (sanity check for the normalisation question noted at the top of the notebook)
adata.X.data[:10]

array([2., 2., 2., 2., 1., 1., 2., 1., 2., 2.], dtype=float32)

In [10]:
# Largest stored value in the sparse matrix
adata.X.data.max()

4.0

In [11]:
# Show obs to check the renamed barcodes ('#' replaced by '_')
adata.obs

7089STDY13216921_BHF_F_Hea13242528_GCAATAGAGTTATGTG-1
7089STDY13216921_BHF_F_Hea13242528_TGGGCCTAGATGGACA-1
7089STDY13216921_BHF_F_Hea13242528_TCCAGGTCACAAACTT-1
7089STDY13216921_BHF_F_Hea13242528_GGTCAATTCATTAGCA-1
7089STDY13216921_BHF_F_Hea13242528_TACCCGGCATAATGTC-1
...
BHF_F_Hea11933675_BHF_F_Hea11596628_GTGATCAGTACGGGTT-1
BHF_F_Hea11933675_BHF_F_Hea11596628_TCTACCTCATCGTTCT-1
BHF_F_Hea11933675_BHF_F_Hea11596628_CCGTTAACAGCCTTGG-1
BHF_F_Hea11933675_BHF_F_Hea11596628_GTACTAATCGTTTCCA-1
7089STDY13216923_BHF_F_Hea13242530_GCTGGTTCAAATTCGT-1


## Add cell type annotation

In [13]:
# Per-cell annotation table; its first column is used as the index.
obs = pd.read_csv(
    '/nfs/team205/heart/anndata_objects/Foetal/Feb28ObjectRaw.obs.csv',
    index_col=0,
)

# Cell type labels and other metadata columns to copy over, in output order.
metadata_to_add = [
    'sangerID', 'combinedID', 'donor',
    'facility', 'cell_or_nuclei', 'modality', 'kit_10x',
    'scrublet_score', 'doublet_pval', 'doublet_bh_pval',
    'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
    'total_counts', 'log1p_total_counts',
    'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt',
    'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo',
    'HB_score', 'multiplexed',
    'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester',
    'fine_grain', 'mid_grain', 'coarse_grain',
    'heart_or_greatvessels', 'cycling',
    'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score',
]

# Align the annotation rows to the AnnData barcodes. Barcodes that are absent
# from the annotation table get NaN in every added column.
annotation = obs[metadata_to_add].reindex(adata.obs_names)
adata.obs[metadata_to_add] = annotation
adata

AnnData object with n_obs × n_vars = 167022 × 508040
    obs: 'sangerID', 'combinedID', 'donor', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'multiplexed', 'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester', 'fine_grain', 'mid_grain', 'coarse_grain', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score'

In [14]:
# Label the var index 'Peaks' (it was named 0 by the earlier set_index(0));
# presumably needed so the index has a string name before writing - TODO confirm.
adata.var.index.name = 'Peaks'

In [15]:
# Write the annotated object to disk next to the input files
h5ad_path = f'{peakmatrix_dir}/Foetal_Peaks.h5ad'
adata.write_h5ad(h5ad_path)

... storing 'sangerID' as categorical
... storing 'combinedID' as categorical
... storing 'donor' as categorical
... storing 'facility' as categorical
... storing 'cell_or_nuclei' as categorical
... storing 'modality' as categorical
... storing 'kit_10x' as categorical
... storing 'multiplexed' as categorical
... storing 'batch_key' as categorical
... storing 'region' as categorical
... storing 'FACSgate' as categorical
... storing 'sex' as categorical
... storing 'week' as categorical
... storing 'trimester' as categorical
... storing 'fine_grain' as categorical
... storing 'mid_grain' as categorical
... storing 'coarse_grain' as categorical
... storing 'heart_or_greatvessels' as categorical
... storing 'cycling' as categorical
... storing 'phase' as categorical


# Refine metadata

In [4]:
# Reload the object written in the previous section.
# NOTE(review): execution counts restart here, which suggests a new kernel session;
# `peakmatrix_dir` and the imports must have been re-run first.
adata = sc.read_h5ad(f'{peakmatrix_dir}/Foetal_Peaks.h5ad')
adata

AnnData object with n_obs × n_vars = 167022 × 508040
    obs: 'sangerID', 'combinedID', 'donor', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'multiplexed', 'batch_key', 'region', 'FACSgate', 'sex', 'week', 'trimester', 'fine_grain', 'mid_grain', 'coarse_grain', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score'

In [5]:
# Remove the numeric QC / score columns from obs, keeping the remaining annotation columns.
metadata_to_drop = [
    'scrublet_score', 'doublet_pval', 'doublet_bh_pval',
    'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
    'total_counts', 'log1p_total_counts',
    'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt',
    'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo',
    'HB_score', 'cycling',
    'S_score', 'G2M_score', 'phase', 'stress_score', 'hb1_score',
]

adata.obs = adata.obs.drop(columns=metadata_to_drop)

In [6]:
# add peak metadata
# The table sits in the same directory as the matrix files, so the path is built
# from `peakmatrix_dir` instead of repeating the absolute directory.
# Its 'range' column becomes the index so rows can be matched to var_names.
meta = pd.read_csv(f'{peakmatrix_dir}/peak_metadata.tsv', sep='\t').set_index('range')

# add to anndata
# reindex aligns rows to the peak order of the matrix; peaks missing from the
# table would get NaN in every metadata column.
adata.var[meta.columns] = meta.reindex(adata.var_names)
adata.var.head()

Unnamed: 0_level_0,seqnames,start,end,width,strand,score,replicateScoreQuantile,groupScoreQuantile,Reproducibility,GroupReplicate,distToGeneStart,nearestGene,peakType,distToTSS,nearestTSS,GC,idx,N
Peaks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
chr1:817093_817593,chr1,817093,817593,501,*,17.2761,0.931,0.795,2.0,MacrophagesATF3pos._.Rep2,27,FAM87B,Promoter,27,uc031tlt.2,0.479,1,0
chr1:819818_820318,chr1,819818,820318,501,*,2.56766,0.325,0.045,2.0,MacrophagesTIMD4pos._.Rep2,2696,FAM87B,Distal,2696,uc031tlt.2,0.519,2,0
chr1:820542_821042,chr1,820542,821042,501,*,2.56766,0.325,0.045,2.0,MacrophagesTIMD4pos._.Rep2,3420,FAM87B,Distal,3420,uc031tlt.2,0.5509,3,0
chr1:825246_825746,chr1,825246,825746,501,*,2.56766,0.325,0.045,2.0,MacrophagesTIMD4pos._.Rep2,357,LINC01128,Exonic,357,uc057axa.1,0.4291,4,0
chr1:826566_827066,chr1,826566,827066,501,*,6.68679,0.521,0.183,2.0,AtrialCardiomyocytesCycling._.Rep1,705,LINC00115,Exonic,15,uc057axb.1,0.5489,5,0


In [7]:
# Write the refined object back to disk.
# This overwrites the file that was loaded at the start of this section.
h5ad_path = f'{peakmatrix_dir}/Foetal_Peaks.h5ad'
adata.write_h5ad(h5ad_path)

... storing 'seqnames' as categorical
... storing 'strand' as categorical
... storing 'GroupReplicate' as categorical
... storing 'nearestGene' as categorical
... storing 'peakType' as categorical
... storing 'nearestTSS' as categorical


In [9]:
# Peek at the first 30 stored values of the sparse matrix after the reload
adata.X.data[:30]

array([2., 2., 2., 2., 1., 1., 2., 1., 2., 2., 2., 2., 3., 1., 2., 2., 3.,
       1., 2., 2., 2., 2., 2., 2., 1., 1., 2., 4., 2., 2.], dtype=float32)

In [8]:
# Display the full path of the saved .h5ad file
f'{peakmatrix_dir}/Foetal_Peaks.h5ad'

'/nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output/PeakMatrix/Foetal_Peaks.h5ad'

In [None]:
# rclone to gdrive
rclone copy /nfs/team205/heart/anndata_objects/Foetal/multiome_ATAC/ArchR/project_output/PeakMatrix/Foetal_Peaks.h5ad \
gdrive:JamesC_Kazumasa/Foetal_Heart/SanjayShinha/toSinhalab/anndata/Peak_foetal_global_raw.h5ad

In [None]:
# Number of cells per cell type label.
# 'cell_type3' is not among the obs columns of this object, so indexing it raises
# KeyError. 'fine_grain' is used instead as the most detailed of the label columns
# that are present ('fine_grain', 'mid_grain', 'coarse_grain').
# NOTE(review): confirm this is the intended annotation level.
adata.obs['fine_grain'].value_counts()

In [None]:
# Display the per-peak metadata table
adata.var