## Import modules

In [1]:
import numpy as np
import pandas as pd
import anndata
import scipy
import scanpy as sc
import os

import matplotlib.pyplot as plt
import seaborn as sns

import celltypist
from celltypist import models

import warnings
warnings.simplefilter("ignore", UserWarning)

In [3]:
import distributed
import session_info
session_info.show()

In [4]:
# Import own helper functions: load utils.py directly from its file path
# (importlib recipe for importing a source file that is not on sys.path).
import importlib.util
import sys
# build an import spec for the file; "module.name" is the name the module is registered under
spec = importlib.util.spec_from_file_location("module.name", "/nfs/team205/kk18/function/python/utils.py")
utils = importlib.util.module_from_spec(spec)
# register the module in sys.modules before its code is executed
sys.modules["module.name"] = utils
# run the file's top-level code, populating `utils` (used below as utils.sctk_subsample)
spec.loader.exec_module(utils)

In [5]:
sc.settings.set_figure_params(dpi=120)

## Read in reference anndata

In [6]:
# Load the foetal heart reference object; its .obs carries the
# 'fine_grain', 'mid_grain' and 'coarse_grain' labels used for training below.
adata = sc.read_h5ad('/nfs/team205/heart/anndata_objects/Foetal/Feb28ObjectRaw_finegrain_updated.h5ad')
# display the object summary
adata

AnnData object with n_obs × n_vars = 297473 × 36601
    obs: 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'sangerID', 'combinedID', 'donor', 'region', 'age', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'n_genes', 'n_counts', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'HB_score', 'donor_by_library-prep', 'multiplexed', 'SOC | status', 'SOC | log_prob_singleton', 'SOC | log_prob_doublet', 'batch_key', '_scvi_batch', 'FACSgate', 'fine_grain', 'mid_grain', 'coarse_grain', 'sex', 'week', 'trimester', 'heart_or_greatvessels', 'cycling', 'S_score', 'G2M_score', 'phase', '_scvi_labels', 'stress_score', 'hb1_score'
    var: 'gene_name_scRNA-0', 'gene_id'
    uns: 'FACSgate_colors', '_scvi_manager_uuid', '_scvi_uuid', 'age_colors', 'cell_or_nucl

In [7]:
adata.X.data[:5]

array([1., 1., 5., 5., 8.], dtype=float32)

## Subset Xenium genes

In [8]:
# Xenium 5K panel genes: only the gene names are needed from the query object,
# so it is opened in backed, read-only mode
adata_xenium = sc.read_h5ad(
    '/lustre/scratch126/cellgen/team205/kk18/xenium/objects/foetal/C194-HEA-0-FFPE-1_5K_filtered_raw.h5ad',
    backed='r',
)
xenium_genes = adata_xenium.var_names.tolist()
# panel size, then a peek at the first few gene names
print(len(xenium_genes))
xenium_genes[:5]

5001


['A2ML1', 'AAMP', 'AAR2', 'AARSD1', 'ABAT']

In [9]:
# subset the reference to the Xenium panel genes
# fail early, with a readable message, if the reference lacks any panel gene
missing_genes = [gene for gene in xenium_genes if gene not in adata.var_names]
if missing_genes:
    raise KeyError(
        f'{len(missing_genes)} Xenium genes are missing from the reference, e.g. {missing_genes[:5]}'
    )
# .copy() makes a standalone AnnData instead of a view: the following cells
# normalise X in place and add .obs columns, which on a view would trigger an
# implicit copy whose warning is hidden by the UserWarning filter set at import time
adata = adata[:,xenium_genes].copy()
print(adata.shape)
print(adata.X.data[:5])

(297473, 5001)
[1. 1. 3. 4. 1.]


## Log-normalise

* It's important to subset to the Xenium panel genes first, and only then normalise
* The query Xenium data only contains the panel genes, so its normalisation is computed over the same gene set

In [None]:
# Library-size normalise, then log1p-transform, the panel-restricted counts.
# NOTE(review): no target_sum is passed; per the scanpy docs each cell is then
# scaled to the median total count of this dataset, so the scale is
# dataset-dependent — confirm the query Xenium data is normalised comparably.
# NOTE(review): not idempotent — re-running this cell would transform the
# already-normalised data a second time.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# peek at the first stored values to confirm they are no longer integer counts
adata.X.data[:5]

## Coarse-grained model

In [None]:
adata.obs['coarse_grain'].value_counts()

In [None]:
# subsample the reference before training the global (coarse-grained) model
# presumably max_n caps the number of cells kept per `groupby` label, with
# fraction=1 meaning "keep everything below the cap" — TODO confirm against utils.sctk_subsample
n_cell_subsample = 20000
label_target = 'coarse_grain'
adata_sub = utils.sctk_subsample(adata,fraction=1,groupby=label_target,max_n=n_cell_subsample,random_state=0)
# label composition of the training set after subsampling
adata_sub.obs[label_target].value_counts()

In [None]:
# training: global model predicting the coarse-grained label
# create the output folder before fitting, so a long training run is not
# lost to a missing directory when the model is written
model_path = '/nfs/team205/kk18/notebooks/Foetal/Xenium/5K/celltypist_models/global2coarse.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model = celltypist.train(adata_sub,
                     labels = label_target,
                     n_jobs = 10,
                     feature_selection = False, # since manually selected
                     check_expression = False # skip CellTypist's check of the input expression format
                    )
# save model
model.write(model_path)

## Create one model per coarse-grained compartment, to predict modified mid-grained labels

* Modified mid-grained: split CCS (cardiac conduction system) cells into atrial and ventricular CMs

In [None]:
set(adata[adata.obs['mid_grain']=='CardiacConductionSystem'].obs['fine_grain'])

In [None]:
# modify mid-grain cell types: reassign conduction-system cells to the
# atrial or ventricular cardiomyocyte label according to their fine-grained type
# nodal pacemaker cells -> atrial; ventricular conduction system cells -> ventricular
CCS_atrial_obsnames = adata.obs_names[adata.obs['fine_grain'].isin(['AtrioventricularNodePacemakerCells','SinoatrialNodePacemakerCells'])]
CCS_vent_obsnames = adata.obs_names[adata.obs['fine_grain'].isin(['VentricularConductionSystemDistal','VentricularConductionSystemProximal'])]
# work on a plain-string copy so new label values can be assigned freely
adata.obs['mid_grain_mod'] = adata.obs['mid_grain'].astype('str').copy()
adata.obs.loc[CCS_atrial_obsnames,'mid_grain_mod'] = 'AtrialCardiomyocytes'
adata.obs.loc[CCS_vent_obsnames,'mid_grain_mod'] = 'VentricularCardiomyocytes'
# back to categorical, with an explicit category order
# NOTE: per the pandas docs reorder_categories requires the listed categories to match the
# existing ones exactly, so it raises if any other label (e.g. 'CardiacConductionSystem') remains
adata.obs['mid_grain_mod'] = adata.obs['mid_grain_mod'].astype('category')
adata.obs['mid_grain_mod'] = adata.obs['mid_grain_mod'].cat.reorder_categories(['AtrialCardiomyocytes', 'VentricularCardiomyocytes',
       'Fibroblasts', 'MuralCells',
       'PericardialCells', 'BloodVesselEndothelialCells', 'EndocardialCells',
       'LymphaticEndothelialCells', 'EpicardialCells', 'Neurons', 'Glia',
       'MyeloidCells', 'LymphoidCells'])

In [None]:
pd.crosstab(adata.obs['mid_grain_mod'],adata.obs['coarse_grain'])

In [None]:
%%time
n_cell_subsample_dict = {
    'Cardiomyocytes':20000,
    'Mesenchymal':20000,
    'Endothelium':None,
    # 'Epicardium':None, # only one cell type
    'Neural':None,
    'Leukocytes':None
}
label_compartment = 'coarse_grain'
label_target = 'mid_grain_mod'

for celltype,n_cell_subsample in n_cell_subsample_dict.items():
    print(celltype)
    # subset
    adata_sub = adata[adata.obs[label_compartment]==celltype]
    # subsample
    if n_cell_subsample!=None:
        print('subsampling')
        adata_sub = utils.sctk_subsample(adata_sub,fraction=1,groupby=label_target,max_n=n_cell_subsample,random_state=0)
    else:
        print('no subsampling')
    print(adata_sub.obs[label_target].value_counts())
    # training
    model = celltypist.train(adata_sub,
                         labels = label_target,
                         n_jobs = 10,
                         feature_selection = False, # since manually selected
                         check_expression = False
                        )
    # save model
    model.write(f'/nfs/team205/kk18/notebooks/Foetal/Xenium/5K/celltypist_models/coarse2midmod_{celltype}.pkl')
    print('')

In [None]:
os.getcwd()

## Create one model per (modified) mid-grained cell type, to predict fine-grained labels

In [None]:
with pd.option_context('display.max_rows', None,):
    display(pd.crosstab(adata.obs['fine_grain'],adata.obs['mid_grain_mod']))

In [None]:
adata.obs['mid_grain_mod'].cat.categories

In [None]:
%%time
n_cell_subsample_dict = {
    'AtrialCardiomyocytes':2000,
    'VentricularCardiomyocytes':5000,
    'Fibroblasts':5000,
    'MuralCells':5000,
    'PericardialCells':5000,
    'BloodVesselEndothelialCells':2000,
    'EndocardialCells':2000,
    # 'LymphaticEndothelialCells':None, # only one cell type
    'EpicardialCells':None,
    'Neurons':None,
    'Glia':None,
    'MyeloidCells':2000,
    'LymphoidCells':None
}
label_compartment = 'mid_grain_mod'
label_target = 'fine_grain'

for celltype,n_cell_subsample in n_cell_subsample_dict.items():
    print(celltype)
    # subset
    adata_sub = adata[adata.obs[label_compartment]==celltype]
    # subsample
    if n_cell_subsample!=None:
        print('subsampling')
        adata_sub = utils.sctk_subsample(adata_sub,fraction=1,groupby=label_target,max_n=n_cell_subsample,random_state=0)
    else:
        print('no subsampling')
    print(adata_sub.obs[label_target].value_counts())
    # training
    model = celltypist.train(adata_sub,
                         labels = label_target,
                         n_jobs = 10,
                         feature_selection = False, # since manually selected
                         check_expression = False
                        )
    # save model
    model.write(f'/nfs/team205/kk18/notebooks/Foetal/Xenium/5K/celltypist_models/midmod2fine_{celltype}.pkl')
    print('')