In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import sctk as sk
import seaborn


# Prepare raw gene count data for fibroblasts

In [2]:
# Import raw scRNA-seq data
adata_raw = sc.read('../data/rna/scRNA_batch1_2.raw.h5ad')

# Import processed data, which contains annotation data
adata = sc.read('../data/rna/scRNA_batch1_2.processed.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 38116 × 17590
    obs: 'doublet_bh_pval', 'n_counts', 'sample_id', 'scrublet_score', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'GEX_SID', 'ATAC_SID', 'GEX', 'ATAC', 'Sample', 'DOB', 'Gender', 'date', 'age_years', 'state', 'disease', 'site_taken_fine', 'site_taken_broad', 'sorted', 'Nancy_histologic_score', 'slide_look_other', 'whole_tissue_signature', 'scSeq', 'bulkSeq', 'annot_batch1_lr', 'annot_batch1_lr_prob', 'annot_batch2_lr', 'annot_batch2_lr_prob', 'leiden_bk_r0_1', 'leiden_bk_r0_3', 'leiden_bk_r0_5', 'is_doublet', 'leiden_bk_split1', 'leiden_bk_split2', 'leiden_bk_split3', 'annot1', 'leiden_bk_split4', 'annot_gut_lr', 'annot_gut_lr_prob', 'broad_annot1'
    var: 'gene_ids', 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'cc', 'highly_variable', 'highly_variable_nbatches', 'hvg_full'
    uns: 'leiden', 'neighbors', 'neighbors_bk', 'pca'
    obsm: 'X_pca', 'X_umap_bk'
    varm: '

In [4]:
adata_raw

AnnData object with n_obs × n_vars = 45683 × 17707
    obs: 'doublet_bh_pval', 'n_counts', 'sample_id', 'scrublet_score', 'GEX_SID', 'ATAC_SID', 'GEX', 'ATAC', 'Sample', 'DOB', 'Gender', 'date', 'age_years', 'state', 'disease', 'site_taken_fine', 'site_taken_broad', 'sorted', 'Nancy_histologic_score', 'slide_look_other', 'whole_tissue_signature', 'scSeq', 'bulkSeq', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50'
    var: 'gene_ids', 'mito', 'ribo', 'hb', 'n_counts', 'n_cells'

In [5]:
adata_raw.obs

Unnamed: 0_level_0,doublet_bh_pval,n_counts,sample_id,scrublet_score,GEX_SID,ATAC_SID,GEX,ATAC,Sample,DOB,...,whole_tissue_signature,scSeq,bulkSeq,log1p_n_counts,n_genes,log1p_n_genes,percent_mito,percent_ribo,percent_hb,percent_top50
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCATCGTGCTCT-R01,0.737696,1890.0,R01,0.286344,R01,A01,GX46,NX02,110544,24/10/1951,...,high,RNA+ATAC,ATAC + CD45+RNA,7.544861,888,6.790097,11.322752,16.613756,0.052910,37.989418
AAACGAACAACTTCTT-R01,0.750645,634.0,R01,0.075342,R01,A01,GX46,NX02,110544,24/10/1951,...,high,RNA+ATAC,ATAC + CD45+RNA,6.453625,468,6.150603,1.419558,1.577287,0.000000,32.492114
AAACGAACATCTGCGG-R01,0.737696,804.0,R01,0.042802,R01,A01,GX46,NX02,110544,24/10/1951,...,high,RNA+ATAC,ATAC + CD45+RNA,6.690842,544,6.300786,6.218905,4.104477,0.000000,33.208955
AAACGAACATGTTACG-R01,0.737696,2811.0,R01,0.192308,R01,A01,GX46,NX02,110544,24/10/1951,...,high,RNA+ATAC,ATAC + CD45+RNA,7.941651,1157,7.054450,11.988616,11.028104,0.000000,39.131982
AAACGAAGTTCCGTTC-R01,0.737696,1707.0,R01,0.167630,R01,A01,GX46,NX02,110544,24/10/1951,...,high,RNA+ATAC,ATAC + CD45+RNA,7.443079,920,6.825460,10.369068,6.268307,0.000000,35.032220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTGTACTAGCT-R13,0.863149,5568.0,R13,0.142857,R13,A13,GX43,NX82,113395,8/12/1976,...,,RNA+ATAC,,8.624970,2121,7.660114,2.209052,18.750000,0.035920,31.016523
TTTGGTTTCCATCTGC-R13,0.863149,1527.0,R13,0.064639,R13,A13,GX43,NX82,113395,8/12/1976,...,,RNA+ATAC,,7.331715,477,6.169611,52.652256,2.946955,0.065488,70.203012
TTTGGTTTCGTGGACC-R13,0.863149,4897.0,R13,0.073171,R13,A13,GX43,NX82,113395,8/12/1976,...,,RNA+ATAC,,8.496582,2021,7.611842,2.185011,17.868082,0.000000,25.607515
TTTGTTGGTCACTACA-R13,0.863149,4030.0,R13,0.117967,R13,A13,GX43,NX82,113395,8/12/1976,...,,RNA+ATAC,,8.301770,1666,7.418781,5.930521,16.947891,0.000000,32.655087


In [6]:
#adata_raw.obs.index

In [11]:
#pd.set_option('display.max_columns', None)

# Merge observational table by cell index,
adata_raw.obs = pd.merge(adata_raw.obs, adata.obs, how='left', left_index=True, right_index=True)

# This fix is required for downstream use of AnnData object
# Solution adopted from: https://github.com/scverse/scanpy/issues/747
adata_raw.obs.index = adata_raw.obs.index.astype(str)

In [12]:
adata_raw.obs

Unnamed: 0_level_0,doublet_bh_pval_x,n_counts_x,sample_id_x,scrublet_score_x,GEX_SID_x,ATAC_SID_x,GEX_x,ATAC_x,Sample_x,DOB_x,...,leiden_bk_r0_5,is_doublet,leiden_bk_split1,leiden_bk_split2,leiden_bk_split3,annot1,leiden_bk_split4,annot_gut_lr,annot_gut_lr_prob,broad_annot1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCATCGTGCTCT-R01,0.737696,1890.0,R01,0.286344,R01,A01,GX46,NX02,110544,24/10/1951,...,6,False,6,6,6,S2,6,WNT5B+ 1,0.253333,S2
AAACGAACAACTTCTT-R01,0.750645,634.0,R01,0.075342,R01,A01,GX46,NX02,110544,24/10/1951,...,,,,,,,,,,
AAACGAACATCTGCGG-R01,0.737696,804.0,R01,0.042802,R01,A01,GX46,NX02,110544,24/10/1951,...,,,,,,,,,,
AAACGAACATGTTACG-R01,0.737696,2811.0,R01,0.192308,R01,A01,GX46,NX02,110544,24/10/1951,...,1,False,1,1,1,S1,1,WNT2B+ Fos-hi,0.514190,S1
AAACGAAGTTCCGTTC-R01,0.737696,1707.0,R01,0.167630,R01,A01,GX46,NX02,110544,24/10/1951,...,4,False,4,4,4,S3x,4,WNT2B+ Fos-hi,0.294068,S3x
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTGTACTAGCT-R13,0.863149,5568.0,R13,0.142857,R13,A13,GX43,NX82,113395,8/12/1976,...,0,False,00,00,00,S3,00,RSPO3+,0.231953,S3
TTTGGTTTCCATCTGC-R13,0.863149,1527.0,R13,0.064639,R13,A13,GX43,NX82,113395,8/12/1976,...,,,,,,,,,,
TTTGGTTTCGTGGACC-R13,0.863149,4897.0,R13,0.073171,R13,A13,GX43,NX82,113395,8/12/1976,...,0,False,00,00,00,S3,00,RSPO3+,0.272154,S3
TTTGTTGGTCACTACA-R13,0.863149,4030.0,R13,0.117967,R13,A13,GX43,NX82,113395,8/12/1976,...,0,False,00,00,00,S3,00,RSPO3+,0.148830,S3


In [13]:
# Subset data
select_cells = ['S1', 'S2', 'S3', 'S3x', 'S4', 'S5', 'MF1', 'MF2', 'PC']
adata_raw = adata_raw[adata_raw.obs.annot1.isin(select_cells), :]

In [14]:
adata_raw.obs.annot1.value_counts()

annot1
S3     8295
S1     5644
S3x    3175
S5     3103
S2     3029
MF1    1401
PC     1200
S4      699
MF2     181
Name: count, dtype: int64

In [15]:
adata_raw.obs

Unnamed: 0_level_0,doublet_bh_pval_x,n_counts_x,sample_id_x,scrublet_score_x,GEX_SID_x,ATAC_SID_x,GEX_x,ATAC_x,Sample_x,DOB_x,...,leiden_bk_r0_5,is_doublet,leiden_bk_split1,leiden_bk_split2,leiden_bk_split3,annot1,leiden_bk_split4,annot_gut_lr,annot_gut_lr_prob,broad_annot1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCATCGTGCTCT-R01,0.737696,1890.0,R01,0.286344,R01,A01,GX46,NX02,110544,24/10/1951,...,6,False,6,6,6,S2,6,WNT5B+ 1,0.253333,S2
AAACGAACATGTTACG-R01,0.737696,2811.0,R01,0.192308,R01,A01,GX46,NX02,110544,24/10/1951,...,1,False,1,1,1,S1,1,WNT2B+ Fos-hi,0.514190,S1
AAACGAAGTTCCGTTC-R01,0.737696,1707.0,R01,0.167630,R01,A01,GX46,NX02,110544,24/10/1951,...,4,False,4,4,4,S3x,4,WNT2B+ Fos-hi,0.294068,S3x
AAACGCTGTCGTTGGC-R01,0.737696,1959.0,R01,0.029213,R01,A01,GX46,NX02,110544,24/10/1951,...,4,False,4,4,4,S3x,4,RSPO3+,0.274607,S3x
AAAGAACCACAAGGTG-R01,0.737696,2126.0,R01,0.122970,R01,A01,GX46,NX02,110544,24/10/1951,...,1,False,1,1,1,S1,1,WNT2B+ Fos-hi,0.303378,S1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCATTCGATG-R13,0.863149,2774.0,R13,0.067358,R13,A13,GX43,NX82,113395,8/12/1976,...,4,False,4,4,4,S3x,4,RSPO3+,0.200662,S3x
TTTGGTTGTACTAGCT-R13,0.863149,5568.0,R13,0.142857,R13,A13,GX43,NX82,113395,8/12/1976,...,0,False,00,00,00,S3,00,RSPO3+,0.231953,S3
TTTGGTTTCGTGGACC-R13,0.863149,4897.0,R13,0.073171,R13,A13,GX43,NX82,113395,8/12/1976,...,0,False,00,00,00,S3,00,RSPO3+,0.272154,S3
TTTGTTGGTCACTACA-R13,0.863149,4030.0,R13,0.117967,R13,A13,GX43,NX82,113395,8/12/1976,...,0,False,00,00,00,S3,00,RSPO3+,0.148830,S3


In [16]:
adata_raw.write_h5ad('../data/rna/scRNA_batch1_2.raw.fibroblasts.h5ad')