## Import modules

In [1]:
import numpy as np
import pandas as pd
import anndata
import scipy
import scanpy as sc
import os

import matplotlib.pyplot as plt
import seaborn as sns

import celltypist
from celltypist import models

import warnings
warnings.simplefilter("ignore", UserWarning)

In [2]:
# NOTE(review): `distributed` is not referenced anywhere else in the visible
# notebook; presumably it is imported only so it appears in the session
# report below — confirm before removing.
import distributed
import session_info
# Display the session/package information for this kernel.
session_info.show()

In [3]:
# import own function
# Load the helper file `utils.py` directly from its path and bind it to the
# name `utils`; later cells call `utils.sctk_subsample` from it.
import importlib.util
import sys
# Build an import spec for the file. The module is registered under the
# literal name "module.name" (not "utils").
spec = importlib.util.spec_from_file_location("module.name", "/nfs/team205/kk18/function/python/utils.py")
# Create an empty module object from the spec ...
utils = importlib.util.module_from_spec(spec)
# ... register it in sys.modules before running its code ...
sys.modules["module.name"] = utils
# ... and finally execute the file's code inside the module object.
spec.loader.exec_module(utils)

In [4]:
# Configure scanpy's figure settings with a resolution of 120 dpi.
sc.settings.set_figure_params(dpi=120)

## Read in reference anndata

In [5]:
# Load the reference AnnData object from disk; its obs table carries the
# 'coarse_grain', 'mid_grain' and 'fine_grain' labels used for training below.
adata = sc.read_h5ad('/nfs/team205/heart/anndata_objects/Foetal/Feb28ObjectRaw_finegrain_updated.h5ad')
# Bare expression: show the object summary as the cell output.
adata

AnnData object with n_obs × n_vars = 297473 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score'
    var: 'gene_name_scRNA-0', 'gene_id'
    uns: 'FACSgate_colors', '_scvi_manager_uuid', '_scvi_uuid', 'age_colors', 'cell_or_nucl

In [6]:
# Peek at the first five stored values of X. The output shows whole numbers,
# so X presumably holds raw counts at this point — confirm.
adata.X.data[:5]

array([1., 1., 5., 5., 8.], dtype=float32)

## Subset Xenium genes

In [7]:
# read in genes
# Only the gene names of the Xenium object are needed, so the file is opened
# in backed read-only mode instead of being loaded in full.
adata_xenium = sc.read_h5ad('/lustre/scratch126/cellgen/team205/kk18/xenium/objects/foetal/C194-HEA-0-FFPE-1_5K_filtered_raw.h5ad',
                            backed='r'
                           )
# Copy the gene names into a plain list so they no longer depend on the file.
xenium_genes = list(adata_xenium.var_names)
# Release the HDF5 handle now that the names are copied; otherwise the file
# stays open for the rest of the (long-running) session.
adata_xenium.file.close()
print(len(xenium_genes))
xenium_genes[:5]

5001


['A2ML1', 'AAMP', 'AAR2', 'AARSD1', 'ABAT']

In [8]:
# subset
# Fail early, with a readable message, if the Xenium panel contains genes
# that are absent from the reference.
reference_genes = set(adata.var_names)
missing_genes = [gene for gene in xenium_genes if gene not in reference_genes]
if missing_genes:
    raise KeyError(f'{len(missing_genes)} Xenium genes are missing from the reference, e.g. {missing_genes[:5]}')
# Keep only the Xenium genes, in panel order. Take an explicit copy so that
# the in-place normalisation in the next step operates on a real AnnData
# object rather than on a view of the full reference.
adata = adata[:,xenium_genes].copy()
print(adata.shape)
print(adata.X.data[:5])

(297473, 5001)


[1. 1. 3. 4. 1.]


## Log-normalise

* It is important to subset to the Xenium gene panel first and only then normalise,
* because the query Xenium data will be normalised over that same gene panel

In [9]:
# Normalise total counts per cell and log1p-transform, both in place on the
# gene-subsetted reference.
# NOTE(review): no `target_sum` is passed, so scanpy's default scaling is used.
# Confirm that the query Xenium data are normalised with exactly the same
# setting, otherwise reference and query will sit on different scales.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Peek at the first stored values after the transformation.
adata.X.data[:5]

array([0.04801895, 0.04801895, 0.13764806, 0.1796197 , 0.04801895],
      dtype=float32)

## Coarse-grained model

In [10]:
# Number of cells per coarse-grained label (used to choose the subsampling cap below).
adata.obs['coarse_grain'].value_counts()

coarse_grain
Mesenchymal       148099
Cardiomyocytes     74884
Leukocytes         41046
Endothelium        23109
Neural              6149
Epicardium          4186
Name: count, dtype: int64

In [11]:
# Subsample per label: `utils.sctk_subsample` is called grouped by the coarse
# label with a cap of `n_cell_subsample` cells per group and a fixed random
# state, so the selection is repeatable.
label_target = 'coarse_grain'
n_cell_subsample = 20000
adata_sub = utils.sctk_subsample(
    adata,
    fraction=1,
    groupby=label_target,
    max_n=n_cell_subsample,
    random_state=0,
)
# Resulting number of cells per label after subsampling.
adata_sub.obs[label_target].value_counts()

coarse_grain
Cardiomyocytes    20000
Mesenchymal       20000
Endothelium       20000
Leukocytes        20000
Neural             6149
Epicardium         4186
Name: count, dtype: int64

In [12]:
# training
# Make sure the output directory exists BEFORE the training starts, so a
# missing folder cannot make the save step fail after the model is fitted.
model_dir = '/nfs/team205/kk18/notebooks/Foetal/Xenium/5K/celltypist_models'
os.makedirs(model_dir, exist_ok=True)
# Fit a CellTypist model that predicts the coarse-grained label from the
# subsampled reference. Feature selection is switched off because the genes
# were already restricted to the Xenium panel.
# NOTE(review): `check_expression = False` presumably skips CellTypist's check
# of the input normalisation — confirm this matches how the query is prepared.
model = celltypist.train(adata_sub,
                     labels = label_target,
                     n_jobs = 10,
                     feature_selection = False, # since manually selected
                     check_expression = False
                    )
# save model
model.write(f'{model_dir}/global2coarse.pkl')

🍳 Preparing data before training


✂️ 26 non-expressed genes are filtered out


🔬 Input data has 90335 cells and 4975 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!


## Create one model per coarse-grained compartment, to predict the modified mid-grained labels

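* Modified mid-grained: cardiac conduction system (CCS) cells are split between atrial and ventricular cardiomyocytes (CMs) — nodal pacemaker cells are relabelled as atrial CMs, ventricular conduction system cells as ventricular CMs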

In [13]:
# List the distinct fine_grain labels found among cells whose mid_grain label
# is 'CardiacConductionSystem'.
set(adata[adata.obs['mid_grain']=='CardiacConductionSystem'].obs['fine_grain'])

{'AtrioventricularNodePacemakerCells',
 'SinoatrialNodePacemakerCells',
 'VentricularConductionSystemDistal',
 'VentricularConductionSystemProximal'}

In [14]:
# modify mid-grain cell types
# Fine-grained conduction-system labels and the cardiomyocyte class each
# group is folded into.
ccs_atrial_fine = ['AtrioventricularNodePacemakerCells','SinoatrialNodePacemakerCells']
ccs_vent_fine = ['VentricularConductionSystemDistal','VentricularConductionSystemProximal']

# Positional boolean masks: unlike a round trip through obs_names and
# label-based .loc, these stay correct even if obs_names are not unique.
is_ccs_atrial = adata.obs['fine_grain'].isin(ccs_atrial_fine).to_numpy()
is_ccs_vent = adata.obs['fine_grain'].isin(ccs_vent_fine).to_numpy()

# Kept under their original names in case other cells refer to them.
CCS_atrial_obsnames = adata.obs_names[is_ccs_atrial]
CCS_vent_obsnames = adata.obs_names[is_ccs_vent]

# Start from the original mid-grained labels as plain strings (astype already
# returns a new Series, so the source column is left untouched) and overwrite
# the conduction-system cells.
mid_grain_mod = adata.obs['mid_grain'].astype('str')
mid_grain_mod.loc[is_ccs_atrial] = 'AtrialCardiomyocytes'
mid_grain_mod.loc[is_ccs_vent] = 'VentricularCardiomyocytes'

# Convert back to a categorical with a fixed display order.
# reorder_categories raises if the observed labels differ from this list,
# which doubles as a check that no unexpected label is present.
mid_grain_mod = mid_grain_mod.astype('category').cat.reorder_categories(['AtrialCardiomyocytes', 'VentricularCardiomyocytes',
       'Fibroblasts', 'MuralCells',
       'PericardialCells', 'BloodVesselEndothelialCells', 'EndocardialCells',
       'LymphaticEndothelialCells', 'EpicardialCells', 'Neurons', 'Glia',
       'MyeloidCells', 'LymphoidCells'])
adata.obs['mid_grain_mod'] = mid_grain_mod

In [15]:
# Cross-tabulate the modified mid-grained labels against the coarse-grained
# labels to see which compartment each mid-grained label falls into.
pd.crosstab(adata.obs['mid_grain_mod'],adata.obs['coarse_grain'])

coarse_grain,Cardiomyocytes,Mesenchymal,Endothelium,Epicardium,Neural,Leukocytes
mid_grain_mod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AtrialCardiomyocytes,21616,0,0,0,0,0
VentricularCardiomyocytes,53268,0,0,0,0,0
Fibroblasts,0,102791,0,0,0,0
MuralCells,0,28497,0,0,0,0
PericardialCells,0,16811,0,0,0,0
BloodVesselEndothelialCells,0,0,11352,0,0,0
EndocardialCells,0,0,9346,0,0,0
LymphaticEndothelialCells,0,0,2411,0,0,0
EpicardialCells,0,0,0,4186,0,0
Neurons,0,0,0,0,2608,0


In [16]:
%%time
n_cell_subsample_dict = {
    'Cardiomyocytes':20000,
    'Mesenchymal':20000,
    'Endothelium':None,
    # 'Epicardium':None, # only one cell type
    'Neural':None,
    'Leukocytes':None
}
label_compartment = 'coarse_grain'
label_target = 'mid_grain_mod'

for celltype,n_cell_subsample in n_cell_subsample_dict.items():
    print(celltype)
    # subset
    adata_sub = adata[adata.obs[label_compartment]==celltype]
    # subsample
    if n_cell_subsample!=None:
        print('subsampling')
        adata_sub = utils.sctk_subsample(adata_sub,fraction=1,groupby=label_target,max_n=n_cell_subsample,random_state=0)
    else:
        print('no subsampling')
    print(adata_sub.obs[label_target].value_counts())
    # training
    model = celltypist.train(adata_sub,
                         labels = label_target,
                         n_jobs = 10,
                         feature_selection = False, # since manually selected
                         check_expression = False
                        )
    # save model
    model.write(f'/nfs/team205/kk18/notebooks/Foetal/Xenium/5K/celltypist_models/coarse2midmod_{celltype}.pkl')
    print('')

Cardiomyocytes
subsampling


🍳 Preparing data before training


mid_grain_mod
AtrialCardiomyocytes         20000
VentricularCardiomyocytes    20000
Name: count, dtype: int64


✂️ 134 non-expressed genes are filtered out


🔬 Input data has 40000 cells and 4867 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



Mesenchymal
subsampling


🍳 Preparing data before training


mid_grain_mod
Fibroblasts         20000
MuralCells          20000
PericardialCells    16811
Name: count, dtype: int64


✂️ 43 non-expressed genes are filtered out


🔬 Input data has 56811 cells and 4958 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!


🍳 Preparing data before training



Endothelium
no subsampling
mid_grain_mod
BloodVesselEndothelialCells    11352
EndocardialCells                9346
LymphaticEndothelialCells       2411
Name: count, dtype: int64


✂️ 85 non-expressed genes are filtered out


🔬 Input data has 23109 cells and 4916 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!


🍳 Preparing data before training



Neural
no subsampling
mid_grain_mod
Glia       3541
Neurons    2608
Name: count, dtype: int64


✂️ 199 non-expressed genes are filtered out


🔬 Input data has 6149 cells and 4802 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!


🍳 Preparing data before training



Leukocytes
no subsampling
mid_grain_mod
LymphoidCells    21084
MyeloidCells     19962
Name: count, dtype: int64


✂️ 54 non-expressed genes are filtered out


🔬 Input data has 41046 cells and 4947 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



CPU times: user 57min 8s, sys: 24.3 s, total: 57min 32s
Wall time: 58min 19s


In [17]:
# Show the kernel's current working directory.
os.getcwd()

'/nfs/team205/kk18/notebooks/Foetal/Xenium/5K/celltypist_models/bsub'

## Create one model per (modified) mid-grained cell type, to predict the fine-grained labels

In [18]:
# Tabulate fine-grained labels against the modified mid-grained labels and
# display the table with the row limit lifted so that no row is truncated.
fine_vs_midmod = pd.crosstab(adata.obs['fine_grain'], adata.obs['mid_grain_mod'])
with pd.option_context('display.max_rows', None):
    display(fine_vs_midmod)

mid_grain_mod,AtrialCardiomyocytes,VentricularCardiomyocytes,Fibroblasts,MuralCells,PericardialCells,BloodVesselEndothelialCells,EndocardialCells,LymphaticEndothelialCells,EpicardialCells,Neurons,Glia,MyeloidCells,LymphoidCells
fine_grain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AtrialCardiomyocytesLeft,6606,0,0,0,0,0,0,0,0,0,0,0,0
AtrialCardiomyocytesRight,10696,0,0,0,0,0,0,0,0,0,0,0,0
AtrialCardiomyocytesCycling,2396,0,0,0,0,0,0,0,0,0,0,0,0
VentricularCardiomyocytesLeftCompact,0,15591,0,0,0,0,0,0,0,0,0,0,0
VentricularCardiomyocytesRightCompact,0,16215,0,0,0,0,0,0,0,0,0,0,0
VentricularCardiomyocytesLeftTrabeculated,0,6997,0,0,0,0,0,0,0,0,0,0,0
VentricularCardiomyocytesRightTrabeculated,0,3101,0,0,0,0,0,0,0,0,0,0,0
VentricularCardiomyocytesCycling,0,8195,0,0,0,0,0,0,0,0,0,0,0
SinoatrialNodePacemakerCells,981,0,0,0,0,0,0,0,0,0,0,0,0
AtrioventricularNodePacemakerCells,937,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
# List the modified mid-grained categories; these are the names used as keys
# of the subsampling dictionary in the next cell.
adata.obs['mid_grain_mod'].cat.categories

Index(['AtrialCardiomyocytes', 'VentricularCardiomyocytes', 'Fibroblasts',
       'MuralCells', 'PericardialCells', 'BloodVesselEndothelialCells',
       'EndocardialCells', 'LymphaticEndothelialCells', 'EpicardialCells',
       'Neurons', 'Glia', 'MyeloidCells', 'LymphoidCells'],
      dtype='object')

In [20]:
%%time
n_cell_subsample_dict = {
    'AtrialCardiomyocytes':2000,
    'VentricularCardiomyocytes':5000,
    'Fibroblasts':5000,
    'MuralCells':5000,
    'PericardialCells':5000,
    'BloodVesselEndothelialCells':2000,
    'EndocardialCells':2000,
    # 'LymphaticEndothelialCells':None, # only one cell type
    'EpicardialCells':None,
    'Neurons':None,
    'Glia':None,
    'MyeloidCells':2000,
    'LymphoidCells':None
}
label_compartment = 'mid_grain_mod'
label_target = 'fine_grain'

for celltype,n_cell_subsample in n_cell_subsample_dict.items():
    print(celltype)
    # subset
    adata_sub = adata[adata.obs[label_compartment]==celltype]
    # subsample
    if n_cell_subsample!=None:
        print('subsampling')
        adata_sub = utils.sctk_subsample(adata_sub,fraction=1,groupby=label_target,max_n=n_cell_subsample,random_state=0)
    else:
        print('no subsampling')
    print(adata_sub.obs[label_target].value_counts())
    # training
    model = celltypist.train(adata_sub,
                         labels = label_target,
                         n_jobs = 10,
                         feature_selection = False, # since manually selected
                         check_expression = False
                        )
    # save model
    model.write(f'/nfs/team205/kk18/notebooks/Foetal/Xenium/5K/celltypist_models/midmod2fine_{celltype}.pkl')
    print('')

AtrialCardiomyocytes
subsampling


🍳 Preparing data before training


✂️ 292 non-expressed genes are filtered out


🔬 Input data has 7918 cells and 4709 genes


⚖️ Scaling input data


fine_grain
AtrialCardiomyocytesLeft              2000
AtrialCardiomyocytesRight             2000
AtrialCardiomyocytesCycling           2000
SinoatrialNodePacemakerCells           981
AtrioventricularNodePacemakerCells     937
Name: count, dtype: int64


🏋️ Training data using logistic regression


✅ Model training done!



VentricularCardiomyocytes
subsampling


🍳 Preparing data before training


fine_grain
VentricularCardiomyocytesLeftCompact          5000
VentricularCardiomyocytesRightCompact         5000
VentricularCardiomyocytesLeftTrabeculated     5000
VentricularCardiomyocytesCycling              5000
VentricularCardiomyocytesRightTrabeculated    3101
VentricularConductionSystemProximal           2209
VentricularConductionSystemDistal              960
Name: count, dtype: int64


✂️ 179 non-expressed genes are filtered out


🔬 Input data has 26270 cells and 4822 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



Fibroblasts
subsampling


🍳 Preparing data before training


fine_grain
GreatVesselAdventitialFibroblasts       5000
MyocardialInterstitialFibroblasts       5000
SubEpicardialFibroblasts                5000
Myofibroblasts                          5000
ValveInterstitialCells                  5000
CoronaryVesselAdventitialFibroblasts    3651
LymphNodeFibroblasticReticularCells     1155
Name: count, dtype: int64


✂️ 78 non-expressed genes are filtered out


🔬 Input data has 29806 cells and 4923 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



MuralCells
subsampling


🍳 Preparing data before training


fine_grain
GreatVesselSmoothMuscleCells         5000
CoronaryPericytes                    2131
CoronarySmoothMuscleCells            1895
DuctusArteriosusSmoothMuscleCells    1479
Name: count, dtype: int64


✂️ 155 non-expressed genes are filtered out


🔬 Input data has 10505 cells and 4846 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



PericardialCells
subsampling


🍳 Preparing data before training


✂️ 190 non-expressed genes are filtered out


🔬 Input data has 8771 cells and 4811 genes


fine_grain
PericardialCellsIntermediate    5000
PericardialCellsFibrous         2018
PericardialCellsParietal        1753
Name: count, dtype: int64


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



BloodVesselEndothelialCells
subsampling


🍳 Preparing data before training


✂️ 168 non-expressed genes are filtered out


🔬 Input data has 6686 cells and 4833 genes


⚖️ Scaling input data


fine_grain
GreatVesselArterialEndothelialCells    2000
CoronaryCapillaryEndothelialCells      2000
CoronaryVenousEndothelialCells         1137
CoronaryArterialEndothelialCells        846
GreatVesselVenousEndothelialCells       703
Name: count, dtype: int64


🏋️ Training data using logistic regression


✅ Model training done!



EndocardialCells
subsampling


🍳 Preparing data before training


✂️ 326 non-expressed genes are filtered out


🔬 Input data has 3915 cells and 4675 genes


⚖️ Scaling input data


fine_grain
EndocardialCells           2000
ValveEndothelialCells      1296
EndocardialCushionCells     619
Name: count, dtype: int64


🏋️ Training data using logistic regression


✅ Model training done!


🍳 Preparing data before training



EpicardialCells
no subsampling
fine_grain
MesothelialEpicardialCells    2839
EpicardiumDerivedCells        1347
Name: count, dtype: int64


✂️ 172 non-expressed genes are filtered out


🔬 Input data has 4186 cells and 4829 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!


🍳 Preparing data before training



Neurons
no subsampling
fine_grain
SympatheticNeurons        1055
NeuronPrecursors           776
ParasympatheticNeurons     549
ChromaffinCells            228
Name: count, dtype: int64


✂️ 327 non-expressed genes are filtered out


🔬 Input data has 2608 cells and 4674 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!


🍳 Preparing data before training



Glia
no subsampling
fine_grain
SchwannCells             2263
SchwannCellPrecursors    1278
Name: count, dtype: int64


✂️ 295 non-expressed genes are filtered out


🔬 Input data has 3541 cells and 4706 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



MyeloidCells
subsampling


🍳 Preparing data before training


fine_grain
MacrophagesCX3CR1pos          2000
MacrophagesLYVE1pos           2000
MacrophagesATF3pos            1844
Monocytes                      900
MastCells                      866
MonocyteDerivedCells           661
DendriticCellsType1            609
PlasmacytoidDendriticCells     608
MacrophagesTIMD4pos            470
MonocytesMPOpos                414
Megakaryocytes                 360
DendriticCellsMature           184
Name: count, dtype: int64


✂️ 126 non-expressed genes are filtered out


🔬 Input data has 10916 cells and 4875 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!


🍳 Preparing data before training



LymphoidCells
no subsampling
fine_grain
TCellsCD4pos           5660
BCellsMS4A1pos         3470
NaturalKillerCells     2483
TCellsCD8pos           2297
TregsCD4pos            2096
BCells                 2058
InnateLymphoidCells    1719
ProBCells              1301
Name: count, dtype: int64


✂️ 112 non-expressed genes are filtered out


🔬 Input data has 21084 cells and 4889 genes


⚖️ Scaling input data


🏋️ Training data using logistic regression


✅ Model training done!



CPU times: user 20.3 s, sys: 9.29 s, total: 29.6 s
Wall time: 7min 19s
