## Checking for neutrophils in disease and healthy data, based on cells that fail QC but are annotated as neutrophils by CellTypist

In [1]:
#%load_ext autoreload
#%autoreload 2
#%matplotlib inline

import os#, re, gc, joblib

import numpy as np
import numpy_groupies as npg
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import minmax_scale

import matplotlib.pyplot as plt
import seaborn as sn

import anndata

import scanpy as sc
import sctk as sk

In [2]:
# from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import rcParams
from matplotlib.colors import ListedColormap

# Font type 42 selects TrueType embedding for PDF/PS output (see matplotlib's
# rcParams documentation for `pdf.fonttype` / `ps.fonttype`).
rcParams.update({"pdf.fonttype": 42, "ps.fonttype": 42})
# Colormap from sctk, kept for later expression plots.
expr_cmap = sk.expression_colormap()
# Allow wider lines when numpy arrays are printed.
np.set_printoptions(linewidth=150)
# Lowest scanpy verbosity level.
sc.settings.verbosity = 0


In [3]:
# Load the pooled, post-QC disease AnnData object.
# NOTE(review): the cells visible in this notebook only display `ad`; the
# per-cell annotations are re-read separately into `obs`. Confirm that the full
# object is really needed, since this loads the whole file.
ad = sc.read(
    filename="/lustre/scratch126/cellgen/team205/nh3/20220125_digestive_tract_integration/v2/h5ad/pooled/pooled_disease.gene_cellbender.post_qc.20220930.h5ad"
)

In [4]:
# Show the AnnData summary (dimensions and available annotation fields).
ad

AnnData object with n_obs × n_vars = 436488 × 36601
    obs: 'latent_cell_probability', 'latent_RT_efficiency', 'cecilia22_predH', 'cecilia22_predH_prob', 'cecilia22_predH_uncertain', 'cecilia22_predL', 'cecilia22_predL_prob', 'cecilia22_predL_uncertain', 'elmentaite21_pred', 'elmentaite21_pred_prob', 'elmentaite21_pred_uncertain', 'suo22_pred', 'suo22_pred_prob', 'suo22_pred_uncertain', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'n_counts_mito', 'percent_ribo', 'n_counts_ribo', 'percent_hb', 'n_counts_hb', 'percent_top50', 'n_counts_raw', 'log1p_n_counts_raw', 'n_genes_raw', 'log1p_n_genes_raw', 'percent_mito_raw', 'n_counts_mito_raw', 'percent_ribo_raw', 'n_counts_ribo_raw', 'percent_hb_raw', 'n_counts_hb_raw', 'percent_top50_raw', 'n_counts_spliced', 'log1p_n_counts_spliced', 'n_genes_spliced', 'log1p_n_genes_spliced', 'percent_mito_spliced', 'n_counts_mito_spliced', 'percent_ribo_spliced', 'n_counts_ribo_spliced', 'percent_hb_spliced', 'n_counts_hb_sp

In [10]:
# Read only the "obs" component of the disease h5ad via sctk.
# The result is used as a pandas DataFrame in the cells below
# (`.columns`, column selection, `.loc`).
# NOTE(review): this is the same file that is loaded in full into `ad`.
obs = sk.read_h5ad(
    "/lustre/scratch126/cellgen/team205/nh3/20220125_digestive_tract_integration/v2/h5ad/pooled/pooled_disease.gene_cellbender.post_qc.20220930.h5ad", component="obs"
)

In [13]:
# List the per-cell annotation columns available in the disease table.
obs.columns

Index(['latent_cell_probability', 'latent_RT_efficiency', 'cecilia22_predH',
       'cecilia22_predH_prob', 'cecilia22_predH_uncertain', 'cecilia22_predL',
       'cecilia22_predL_prob', 'cecilia22_predL_uncertain',
       'elmentaite21_pred', 'elmentaite21_pred_prob',
       'elmentaite21_pred_uncertain', 'suo22_pred', 'suo22_pred_prob',
       'suo22_pred_uncertain', 'n_counts', 'log1p_n_counts', 'n_genes',
       'log1p_n_genes', 'percent_mito', 'n_counts_mito', 'percent_ribo',
       'n_counts_ribo', 'percent_hb', 'n_counts_hb', 'percent_top50',
       'n_counts_raw', 'log1p_n_counts_raw', 'n_genes_raw',
       'log1p_n_genes_raw', 'percent_mito_raw', 'n_counts_mito_raw',
       'percent_ribo_raw', 'n_counts_ribo_raw', 'percent_hb_raw',
       'n_counts_hb_raw', 'percent_top50_raw', 'n_counts_spliced',
       'log1p_n_counts_spliced', 'n_genes_spliced', 'log1p_n_genes_spliced',
       'percent_mito_spliced', 'n_counts_mito_spliced', 'percent_ribo_spliced',
       'n_counts_ribo_spl

In [14]:
# List the categories of the suo22 predicted labels in the disease table,
# to compare against the neutrophil-like labels filtered on below
# ("MONOCYTE_I_CXCR4", "MYELOCYTE", "NEUTROPHIL").
obs.suo22_pred.cat.categories.values

array(['B1', 'CD4+T', 'CD8+T', 'CYCLING_B', 'CYCLING_DC', 'CYCLING_EPITHELIUM', 'CYCLING_NK', 'CYCLING_T', 'DC1', 'DC2', 'DOUBLET',
       'DOUBLET_LYMPHOID_MACROPHAGE', 'ENDOTHELIUM_I', 'ENDOTHELIUM_II', 'ENTEROENDOCRINE_I', 'EOSINOPHIL_BASOPHIL', 'EPITHELIUM_I', 'EPITHELIUM_II',
       'FIBROBLAST_I', 'FIBROBLAST_IX', 'FIBROBLAST_VIII', 'FIBROBLAST_XI', 'GLIAL', 'HEPATOCYTE-LIKE', 'HIGH_MITO', 'ILC3', 'KERATINOCYTE',
       'LARGE_PRE_B', 'LATE_ERY', 'LATE_MK', 'LMPP_MLP', 'LOW_QUALITY', 'LOW_QUALITY_MACROPHAGE', 'LOW_QUALITY_MID_ERY_(HIGH_RIBO)',
       'MACROPHAGE_ERY', 'MACROPHAGE_LYVE1_HIGH', 'MACROPHAGE_MHCII_HIGH', 'MAST_CELL', 'MATURE_B', 'MESENCHYMAL_LYMPHOID_TISSUE_ORGANISER',
       'MESOTHELIUM', 'MID_ERY', 'MIGRATORY_DC', 'MONOCYTE_III_IL1B', 'MYOFIBROBLAST', 'MYOFIBROBLAST_I', 'NK', 'OSTEOBLAST', 'OSTEOCLAST', 'PDC',
       'PLASMA_B', 'SMOOTH_MUSCLE', 'TREG', 'TYPE_1_INNATE_T', 'TYPE_3_INNATE_T', 'VSMC_PERICYTE', 'VSMC_PERICYTE_III'], dtype=object)

In [15]:
# Count disease cells per (sample, suo22 predicted label, QC flag) combination
# and keep only the rows whose label is one of the neutrophil-like suo22 labels.
# `reset_index(name="count")` names the counts column directly; renaming a
# column called 0 only has an effect on pandas < 2.0, where the counts Series
# is unnamed (pandas >= 2.0 already names it "count").
suo22_neutrophil_counts = (
    obs[["sampleID", "suo22_pred", "good_qc_cluster_mito80"]]
    .value_counts(sort=False)
    .reset_index(name="count")
    .loc[lambda x: x.suo22_pred.isin(["MONOCYTE_I_CXCR4", "MYELOCYTE", "NEUTROPHIL"])]
)

In [17]:
# Show the per-sample counts of neutrophil-like suo22 labels (disease data).
suo22_neutrophil_counts

Unnamed: 0,sampleID,suo22_pred,good_qc_cluster_mito80,count


In [18]:
# List the categories of the elmentaite21 predicted labels in the disease table.
obs.elmentaite21_pred.cat.categories.values

array(['Activated CD4 T', 'Activated CD8 T', 'Adult Glia', 'BEST2+ Goblet cell', 'BEST4+ epithelial', 'CD8 Tmem', 'CLDN10+ cells', 'CLP',
       'CX3CR1+ CD8 Tmem', 'Colonocyte', 'Contractile pericyte (PLN+)', 'Cycling B cell', 'Cycling plasma cell', 'D cells (SST+)', 'DZ GC cell',
       'EC cells (TAC1+)', 'EECs', 'Enterocyte', 'FCRL4+ Memory B', 'FDC', 'Fetal arterial EC', 'GC B cell', 'Goblet cell', 'I cells (CCK+)', 'ILC3',
       'IgA plasma cell', 'IgG plasma cell', 'IgM plasma cell', 'Immature B', 'L cells (PYY+)', 'LEC1 (ACKR4+)', 'LEC3 (ADGRG3+)', 'LEC6 (ADAMTS4+)',
       'LYVE1+ Macrophage', 'LZ GC cell', 'Lymphoid DC', 'M/X cells (MLN/GHRL+)', 'MAIT cell', 'MMP9+ Inflammatory macrophage', 'Macrophages',
       'Mast cell', 'Mature arterial EC', 'Mature venous EC', 'Memory B', 'Mesoderm 1 (HAND1+)', 'Mesoderm 2 (ZEB2+)', 'Mesothelium',
       'Microfold cell', 'Monocytes', 'NK T cell', 'NK cell', 'Naive B', 'Paneth', 'Pericyte', 'Progenitor (NEUROG3+)', 'Proximal progenitor

In [19]:
# Count disease cells per (sample, elmentaite21 predicted label, QC flag)
# combination, keeping the "Monocytes" and "MPO+ mono-neutrophil" rows.
# `reset_index(name="count")` names the counts column directly; renaming a
# column called 0 only has an effect on pandas < 2.0, where the counts Series
# is unnamed (pandas >= 2.0 already names it "count").
elmentaite21_neutrophil_counts = (
    obs[["sampleID", "elmentaite21_pred", "good_qc_cluster_mito80"]]
    .value_counts(sort=False)
    .reset_index(name="count")
    .loc[lambda x: x.elmentaite21_pred.isin(["Monocytes", "MPO+ mono-neutrophil"])]
)

In [20]:
# Restrict the table to the "MPO+ mono-neutrophil" label, then order rows by
# label, QC flag (True before False), sample ID and count (descending).
elmentaite21_neutrophil_counts[
    elmentaite21_neutrophil_counts.elmentaite21_pred == "MPO+ mono-neutrophil"
].sort_values(
    by=["elmentaite21_pred", "good_qc_cluster_mito80", "sampleID", "count"],
    ascending=[True, False, True, False],
)

Unnamed: 0,sampleID,elmentaite21_pred,good_qc_cluster_mito80,count


In [16]:
# Order the suo22 counts by label, QC flag (True before False), sample ID and
# count (descending) for display.
suo22_neutrophil_counts.sort_values(
    by=["suo22_pred", "good_qc_cluster_mito80", "sampleID", "count"],
    ascending=[True, False, True, False],
)

Unnamed: 0,sampleID,suo22_pred,good_qc_cluster_mito80,count


In [11]:
# Select disease cells that are NOT flagged `good_qc_cluster_mito80` and carry
# a neutrophil-like label from either the suo22 or the elmentaite21 predictions.
neutrophil_obs = obs[
    ~obs.good_qc_cluster_mito80
    & (
        obs.suo22_pred.isin(["MONOCYTE_I_CXCR4", "MYELOCYTE", "NEUTROPHIL"])
        | obs.elmentaite21_pred.isin(["MPO+ mono-neutrophil"])
    )
]

In [12]:
# Show the QC-failing, neutrophil-like cells selected from the disease table.
neutrophil_obs

Unnamed: 0_level_0,latent_cell_probability,latent_RT_efficiency,cecilia22_predH,cecilia22_predH_prob,cecilia22_predH_uncertain,cecilia22_predL,cecilia22_predL_prob,cecilia22_predL_uncertain,elmentaite21_pred,elmentaite21_pred_prob,...,sex,sample_type,sample_category,sample_retrieval,tissue_fraction,cell_fraction,cell_fraction_unified,cell_sorting,technology,include_150722
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [22]:
# Read only the "obs" component of the pooled healthy h5ad via sctk.
# The result is used as a pandas DataFrame in the cells below
# (column selection, `.loc`).
obs_healthy = sk.read_h5ad(
    "/lustre/scratch126/cellgen/team205/nh3/20220125_digestive_tract_integration/v2/h5ad/pooled/pooled_healthy.gene_cellbender.post_qc.20220909.h5ad", component="obs"
)

In [23]:
# Count healthy cells per (sample, suo22 predicted label, QC flag) combination
# and keep only the rows whose label is one of the neutrophil-like suo22 labels.
# NOTE: this reuses the name `suo22_neutrophil_counts`, replacing the table
# computed from the disease data earlier in the notebook.
# `reset_index(name="count")` names the counts column directly; renaming a
# column called 0 only has an effect on pandas < 2.0, where the counts Series
# is unnamed (pandas >= 2.0 already names it "count").
suo22_neutrophil_counts = (
    obs_healthy[["sampleID", "suo22_pred", "good_qc_cluster_mito80"]]
    .value_counts(sort=False)
    .reset_index(name="count")
    .loc[lambda x: x.suo22_pred.isin(["MONOCYTE_I_CXCR4", "MYELOCYTE", "NEUTROPHIL"])]
)

In [24]:
# Order the suo22 counts by label, QC flag (True before False), sample ID and
# count (descending) for display.
suo22_neutrophil_counts.sort_values(
    by=["suo22_pred", "good_qc_cluster_mito80", "sampleID", "count"],
    ascending=[True, False, True, False],
)

Unnamed: 0,sampleID,suo22_pred,good_qc_cluster_mito80,count
77,6-Int-Fresh-Sorted,MONOCYTE_I_CXCR4,True,41
1270,4918STDY7717783,MONOCYTE_I_CXCR4,True,11
1379,4918STDY7717787,MONOCYTE_I_CXCR4,True,24
1419,4918STDY7717788,MONOCYTE_I_CXCR4,True,45
1538,4918STDY7718973,MONOCYTE_I_CXCR4,True,21
1581,4918STDY7718974,MONOCYTE_I_CXCR4,True,23
5977,HCA_A_GT12934997,MONOCYTE_I_CXCR4,True,10
6000,HCA_A_GT12934998,MONOCYTE_I_CXCR4,True,10
6041,HCA_A_GT12934999,MONOCYTE_I_CXCR4,True,7
6138,HCA_A_GT12935002,MONOCYTE_I_CXCR4,True,7


In [26]:
# Count healthy cells per (sample, elmentaite21 predicted label, QC flag)
# combination, keeping the "Monocytes" and "MPO+ mono-neutrophil" rows.
# NOTE: this reuses the name `elmentaite21_neutrophil_counts`, replacing the
# table computed from the disease data earlier in the notebook.
# `reset_index(name="count")` names the counts column directly; renaming a
# column called 0 only has an effect on pandas < 2.0, where the counts Series
# is unnamed (pandas >= 2.0 already names it "count").
elmentaite21_neutrophil_counts = (
    obs_healthy[["sampleID", "elmentaite21_pred", "good_qc_cluster_mito80"]]
    .value_counts(sort=False)
    .reset_index(name="count")
    .loc[lambda x: x.elmentaite21_pred.isin(["Monocytes", "MPO+ mono-neutrophil"])]
)

In [27]:
# Restrict the table to the "MPO+ mono-neutrophil" label, then order rows by
# label, QC flag (True before False), sample ID and count (descending).
elmentaite21_neutrophil_counts[
    elmentaite21_neutrophil_counts.elmentaite21_pred == "MPO+ mono-neutrophil"
].sort_values(
    by=["elmentaite21_pred", "good_qc_cluster_mito80", "sampleID", "count"],
    ascending=[True, False, True, False],
)

Unnamed: 0,sampleID,elmentaite21_pred,good_qc_cluster_mito80,count
504,4918STDY7317587,MPO+ mono-neutrophil,True,17
2245,ERR3245543,MPO+ mono-neutrophil,True,43
7491,HCA_A_GT12934997,MPO+ mono-neutrophil,True,7
7508,HCA_A_GT12934998,MPO+ mono-neutrophil,True,2
7538,HCA_A_GT12934999,MPO+ mono-neutrophil,True,11
503,4918STDY7317587,MPO+ mono-neutrophil,False,4
2244,ERR3245543,MPO+ mono-neutrophil,False,52
7490,HCA_A_GT12934997,MPO+ mono-neutrophil,False,48
7507,HCA_A_GT12934998,MPO+ mono-neutrophil,False,13
7537,HCA_A_GT12934999,MPO+ mono-neutrophil,False,22


In [28]:
# Select healthy cells that are NOT flagged `good_qc_cluster_mito80` and carry
# a neutrophil-like label from either the suo22 or the elmentaite21 predictions.
# NOTE: this reuses the name `neutrophil_obs` from the disease-data selection.
neutrophil_obs = obs_healthy[
    ~obs_healthy.good_qc_cluster_mito80
    & (
        obs_healthy.suo22_pred.isin(["MONOCYTE_I_CXCR4", "MYELOCYTE", "NEUTROPHIL"])
        | obs_healthy.elmentaite21_pred.isin(["MPO+ mono-neutrophil"])
    )
]

In [29]:
# Show the QC-failing, neutrophil-like cells selected from the healthy table.
neutrophil_obs

Unnamed: 0_level_0,latent_cell_probability,latent_RT_efficiency,cecilia22_predH,cecilia22_predH_prob,cecilia22_predH_uncertain,cecilia22_predL,cecilia22_predL_prob,cecilia22_predL_uncertain,elmentaite21_pred,elmentaite21_pred_prob,...,sex,sample_type,sample_category,sample_retrieval,tissue_fraction,cell_fraction,cell_fraction_unified,cell_sorting,technology,include_150722
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGCAACTGCGC-HCA_A_GT12934997,0.999836,0.857807,Monocytes,0.990857,Monocytes,Classical monocytes,0.922057,Classical monocytes,Monocytes,0.780143,...,Female,Organ_donor_resection,Non_pathological,DBD,Epithelium,Total,Total,,10X_5p,healthy_reference
AAACCTGCATGGTAGG-HCA_A_GT12934997,0.999973,1.019605,Monocytes,0.995862,Monocytes,Classical monocytes,0.995586,Classical monocytes,Monocytes,0.414794,...,Female,Organ_donor_resection,Non_pathological,DBD,Epithelium,Total,Total,,10X_5p,healthy_reference
AAAGATGAGGACAGCT-HCA_A_GT12934997,0.999740,0.827556,Monocytes,0.987690,Monocytes,Classical monocytes,0.968681,Classical monocytes,Monocytes,0.879200,...,Female,Organ_donor_resection,Non_pathological,DBD,Epithelium,Total,Total,,10X_5p,healthy_reference
AAAGTAGTCAACGAAA-HCA_A_GT12934997,0.999746,0.844798,Monocytes,0.904525,Monocytes,Classical monocytes,0.408359,Classical monocytes,Monocytes,0.982469,...,Female,Organ_donor_resection,Non_pathological,DBD,Epithelium,Total,Total,,10X_5p,healthy_reference
AACACGTAGAGACTAT-HCA_A_GT12934997,0.999990,1.046917,Monocytes,0.999294,Monocytes,Classical monocytes,0.961076,Classical monocytes,Monocytes,0.095466,...,Female,Organ_donor_resection,Non_pathological,DBD,Epithelium,Total,Total,,10X_5p,healthy_reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTACATTGTCTGCCAG-HT228-fetal-ileum,0.999368,0.764953,Monocytes,0.290352,Uncertain,Classical monocytes,0.554634,Classical monocytes,Monocytes,0.963459,...,Male,Organ_donor_resection,Non_pathological,Unspecified,Full_thickness,Total,Total,,10X_3pv2,healthy_reference
CTCGTCATCACCTTAT-HT228-fetal-ileum,0.999655,0.781690,Monocytes,0.732175,Monocytes,Classical monocytes,0.912063,Classical monocytes,Monocytes,0.941024,...,Male,Organ_donor_resection,Non_pathological,Unspecified,Full_thickness,Total,Total,,10X_3pv2,healthy_reference
TGACAACAGAAGGGTA-HT228-fetal-ileum,0.999645,0.805281,Monocytes,0.999954,Monocytes,Classical monocytes,0.999600,Classical monocytes,Monocytes,1.000000,...,Male,Organ_donor_resection,Non_pathological,Unspecified,Full_thickness,Total,Total,,10X_3pv2,healthy_reference
TTCTACACATGACATC-HT228-fetal-ileum,0.999886,1.197547,Monocytes,0.988781,Monocytes,Classical monocytes,0.988855,Classical monocytes,Monocytes,0.999451,...,Male,Organ_donor_resection,Non_pathological,Unspecified,Full_thickness,Total,Total,,10X_3pv2,healthy_reference
