In [1]:
import scanpy as sc
import pandas as pd
import celltypist

from celltypist import models

# Run notebook as LSF job
bsub -q basement \
    -n 10 \
    -M 380GB -R "select[mem>380GB] rusage[mem=380GB]" \
    -o logs/%J_output.log -e logs/%J_error.log \
    jupyter nbconvert \
        --to notebook \
        --ExecutePreprocessor.kernel_name=scanpy \
        --clear-output \
        --execute 3_traincelltypist_on_megagut_level3.ipynb

# Train megagut CellTypist models

In [2]:
# Load megagut object
adata = sc.read_h5ad("/nfs/team205/ao15/Megagut/Annotations_v3/h5ad/pooled_healthy_disease.remapped.allgenes.fine_annot.no_doublets.20230322.h5ad")

In [3]:
# parse metadata
adata.obs = pd.read_csv('/nfs/team205/ao15/Megagut/Annotations_v3/metadata/pooled_healthy_disease.remapped.allgenes.fine_annot.no_doublets.20230322.csv',index_col=0)

  adata.obs = pd.read_csv('/nfs/team205/ao15/Megagut/Annotations_v3/metadata/pooled_healthy_disease.remapped.allgenes.fine_annot.no_doublets.20230322.csv',index_col=0)


In [4]:
#adata

In [5]:
#adata.obs.organ_broad.value_counts()

In [6]:
# Subset data to small intestine
adata = adata[adata.obs.organ_broad == 'lowerGI_small']

In [7]:
adata

View of AnnData object with n_obs × n_vars = 480472 × 36601
    obs: 'latent_cell_probability', 'latent_RT_efficiency', 'cecilia22_predH', 'cecilia22_predH_prob', 'cecilia22_predH_uncertain', 'cecilia22_predL', 'cecilia22_predL_prob', 'cecilia22_predL_uncertain', 'elmentaite21_pred', 'elmentaite21_pred_prob', 'elmentaite21_pred_uncertain', 'suo22_pred', 'suo22_pred_prob', 'suo22_pred_uncertain', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'n_counts_mito', 'percent_ribo', 'n_counts_ribo', 'percent_hb', 'n_counts_hb', 'percent_top50', 'n_counts_raw', 'log1p_n_counts_raw', 'n_genes_raw', 'log1p_n_genes_raw', 'percent_mito_raw', 'n_counts_mito_raw', 'percent_ribo_raw', 'n_counts_ribo_raw', 'percent_hb_raw', 'n_counts_hb_raw', 'percent_top50_raw', 'n_counts_spliced', 'log1p_n_counts_spliced', 'n_genes_spliced', 'log1p_n_genes_spliced', 'percent_mito_spliced', 'n_counts_mito_spliced', 'percent_ribo_spliced', 'n_counts_ribo_spliced', 'percent_hb_spliced', 'n_coun

In [8]:
# Normalize megagut
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)

# Pseudo log transform
sc.pp.log1p(adata)

In [9]:
adata

AnnData object with n_obs × n_vars = 480472 × 36601
    obs: 'latent_cell_probability', 'latent_RT_efficiency', 'cecilia22_predH', 'cecilia22_predH_prob', 'cecilia22_predH_uncertain', 'cecilia22_predL', 'cecilia22_predL_prob', 'cecilia22_predL_uncertain', 'elmentaite21_pred', 'elmentaite21_pred_prob', 'elmentaite21_pred_uncertain', 'suo22_pred', 'suo22_pred_prob', 'suo22_pred_uncertain', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'n_counts_mito', 'percent_ribo', 'n_counts_ribo', 'percent_hb', 'n_counts_hb', 'percent_top50', 'n_counts_raw', 'log1p_n_counts_raw', 'n_genes_raw', 'log1p_n_genes_raw', 'percent_mito_raw', 'n_counts_mito_raw', 'percent_ribo_raw', 'n_counts_ribo_raw', 'percent_hb_raw', 'n_counts_hb_raw', 'percent_top50_raw', 'n_counts_spliced', 'log1p_n_counts_spliced', 'n_genes_spliced', 'log1p_n_genes_spliced', 'percent_mito_spliced', 'n_counts_mito_spliced', 'percent_ribo_spliced', 'n_counts_ribo_spliced', 'percent_hb_spliced', 'n_counts_hb_sp

In [10]:
adata.obs.level_1_annot.value_counts(dropna=False)

level_1_annot
Mesenchymal       161386
Epithelial        125209
T and NK cells     94151
B and B plasma     50928
Myeloid            18373
Neural             15906
Endothelial        14519
Name: count, dtype: int64

In [11]:
adata.obs.level_2_annot.value_counts(dropna=False)

level_2_annot
NaN                       77668
Absorptive                64938
Fibroblast                63700
Mesoderm                  53759
Conventional_CD4          23579
Mature_B                  18591
Smooth_muscle             16925
Conventional_CD8          16558
Cycling_epithelia         13374
Myofibroblast             12081
B_plasma                  11230
Vascular_endothelia       10862
Secretory                 10205
Transit_amplifying         9758
Epithelial_progenitor      8705
Macrophage                 8562
Unconventional_T/ILC       7990
Neuron_progenitor          7785
Stem                       7292
Pericyte                   5847
Neuron                     4837
Treg                       3856
DC                         3595
NK                         3207
Glia                       3025
Lymphoid_stromal_cell      2436
Enteroendocrine            1664
Mesothelium                1591
Monocyte                   1464
Lymphatic_endothelia       1399
Intestinal_Cajal_cell     

In [12]:
adata.obs.level_3_annot.value_counts(dropna=False)

level_3_annot
Enterocyte                            67021
Mesoderm_2                            40543
Lamina_propria_fibroblast_ADAMDEC1    22545
Trm_CD4                               16262
B_naive                               15483
                                      ...  
Oesophagus_fibroblast                    42
Erythrocytes                             32
Immune_recruiting_pericyte               25
Rectum_fibroblast                        22
Megakaryocyte/platelet                    4
Name: count, Length: 105, dtype: int64

In [13]:
# Check if labels contains missing values, which 

In [14]:
any(adata.obs.level_1_annot.isna())

False

In [15]:
any(adata.obs.level_2_annot.isna())

True

In [16]:
any(adata.obs.level_3_annot.isna())

False

In [17]:
#adata

In [18]:
adata[~adata.obs.level_2_annot.isna()]

View of AnnData object with n_obs × n_vars = 402804 × 36601
    obs: 'latent_cell_probability', 'latent_RT_efficiency', 'cecilia22_predH', 'cecilia22_predH_prob', 'cecilia22_predH_uncertain', 'cecilia22_predL', 'cecilia22_predL_prob', 'cecilia22_predL_uncertain', 'elmentaite21_pred', 'elmentaite21_pred_prob', 'elmentaite21_pred_uncertain', 'suo22_pred', 'suo22_pred_prob', 'suo22_pred_uncertain', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'n_counts_mito', 'percent_ribo', 'n_counts_ribo', 'percent_hb', 'n_counts_hb', 'percent_top50', 'n_counts_raw', 'log1p_n_counts_raw', 'n_genes_raw', 'log1p_n_genes_raw', 'percent_mito_raw', 'n_counts_mito_raw', 'percent_ribo_raw', 'n_counts_ribo_raw', 'percent_hb_raw', 'n_counts_hb_raw', 'percent_top50_raw', 'n_counts_spliced', 'log1p_n_counts_spliced', 'n_genes_spliced', 'log1p_n_genes_spliced', 'percent_mito_spliced', 'n_counts_mito_spliced', 'percent_ribo_spliced', 'n_counts_ribo_spliced', 'percent_hb_spliced', 'n_coun

In [19]:
# model_lvl1 = celltypist.train(adata, labels='level_1_annot', n_jobs=10, feature_selection=False, )

In [20]:
# # Save model
# model_lvl1.write('celltypist/megagutSmallIntestine_lvl1.pkl')

In [21]:
# # Exclude missing labels from training data
# model_lvl2 = celltypist.train(adata[~adata.obs.level_2_annot.isna()], labels='level_2_annot', n_jobs=10, feature_selection=False)

In [22]:
# model_lvl2.write('celltypist/megagutSmallIntestine_lvl2.pkl')

In [23]:
model_lvl3 = celltypist.train(adata, labels='level_3_annot', n_jobs=10, feature_selection=False)

🍳 Preparing data before training
✂️ 1956 non-expressed genes are filtered out
🔬 Input data has 480472 cells and 34645 genes
⚖️ Scaling input data
🏋️ Training data using logistic regression
✅ Model training done!


In [24]:
model_lvl3.write('celltypist/megagutSmallIntestine_lvl3.pkl')

In [25]:
#model = models.Model.load("/nfs/team205/ao15/Megagut/Annotations_v3/make_celltypist_models/models/1_full_healthy_reference_all_ages_organs_finalmodel.pkl")

In [26]:
#model.cell_types

In [27]:
print("finished")

finished
