# Prepare objects for cellxgene - Pan-GI integration project

In [3]:
import scanpy as sc
import pandas as pd
import numpy as np
import sys
import os
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
import ast

In [4]:
# Print the versions of scanpy and its main dependencies so the environment
# used for this run is recorded next to the results.
sc.logging.print_header()

scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.20.1 scipy==1.6.1 pandas==1.3.5 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.8.3 louvain==0.7.0 leidenalg==0.8.3 pynndescent==0.5.2


In [5]:
# Load the healthy myeloid compartment object (scVI output with fine annotation).
healthy = sc.read_h5ad(
    filename='/nfs/team205/ao15/Megagut/Annotations_v3/h5ad/compartments/post_scvi20230126/Myeloid.hvg5000_noCC.scvi_output_nocatcovar.fine_annot.20230126.h5ad',
)

In [6]:
# Load the v3 myeloid annotation table; the first CSV column becomes the index.
healthy_annot = pd.read_csv(
    filepath_or_buffer='/nfs/team205/ao15/Megagut/Annotations_v3/compartment_annotations/csv/Myeloid_annot_v3.csv',
    index_col=0,
)

In [7]:
# Attach the v3 annotation to every healthy cell. The assignment aligns the
# Series on the obs index, so a barcode that is absent from the CSV would
# silently receive NaN; check for that and stop instead of carrying unlabelled
# cells into the merged object.
healthy.obs['level_3_annot'] = healthy_annot['annot_v3']
n_unlabelled_healthy = int(healthy.obs['level_3_annot'].isna().sum())
if n_unlabelled_healthy:
    raise ValueError(
        f"{n_unlabelled_healthy} of {healthy.n_obs} cells in `healthy` have no "
        "annot_v3 label in `healthy_annot` (index mismatch or missing values)"
    )

In [8]:
# Load the pooled healthy neutrophil object.
neutro = sc.read_h5ad(
    filename='/nfs/team205/ao15/Megagut/Annotations_v3/neutrophils/pooled_healthy.gene_cellbender.bad_qc_cluster_mito80.neutrophil.20221013.h5ad',
)

In [9]:
# Load the resolved neutrophil annotation table; the first CSV column becomes the index.
neutro_annot = pd.read_csv(
    filepath_or_buffer='/nfs/team205/ao15/Megagut/Annotations_v3/disease_annot/resolved_annot/Myeloid_neutrophils_annot20231207.csv',
    index_col=0,
)

In [10]:
# Inspect the neutrophil object: its dimensions and the obs fields it carries.
neutro

AnnData object with n_obs × n_vars = 1893 × 36601
    obs: 'latent_cell_probability', 'latent_RT_efficiency', 'cecilia22_predH', 'cecilia22_predH_prob', 'cecilia22_predH_uncertain', 'cecilia22_predL', 'cecilia22_predL_prob', 'cecilia22_predL_uncertain', 'elmentaite21_pred', 'elmentaite21_pred_prob', 'elmentaite21_pred_uncertain', 'suo22_pred', 'suo22_pred_prob', 'suo22_pred_uncertain', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'n_counts_mito', 'percent_ribo', 'n_counts_ribo', 'percent_hb', 'n_counts_hb', 'percent_top50', 'n_counts_raw', 'log1p_n_counts_raw', 'n_genes_raw', 'log1p_n_genes_raw', 'percent_mito_raw', 'n_counts_mito_raw', 'percent_ribo_raw', 'n_counts_ribo_raw', 'percent_hb_raw', 'n_counts_hb_raw', 'percent_top50_raw', 'n_counts_spliced', 'log1p_n_counts_spliced', 'n_genes_spliced', 'log1p_n_genes_spliced', 'percent_mito_spliced', 'n_counts_mito_spliced', 'percent_ribo_spliced', 'n_counts_ribo_spliced', 'percent_hb_spliced', 'n_counts_hb_spli

In [11]:
# Preview the neutrophil annotation table (flagged predicted label vs. resolved label).
neutro_annot

Unnamed: 0_level_0,fine_predicted_labels_uncertflagged,fine_predicted_labels_resolved
index,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCTGCAACTGCGC-HCA_A_GT12934997,Monocyte,Neutrophil_circulating
AAACCTGCATGGTAGG-HCA_A_GT12934997,Monocyte,Neutrophil_circulating
AAAGATGAGGACAGCT-HCA_A_GT12934997,Monocyte,Neutrophil_circulating
AAAGTAGTCAACGAAA-HCA_A_GT12934997,Monocyte,Neutrophil_circulating
AACACGTAGAGACTAT-HCA_A_GT12934997,Monocyte,Neutrophil_circulating
...,...,...
CTACATTGTCTGCCAG-HT228-fetal-ileum,Mono/neutrophil_MPO,Neutrophil_mature
CTCGTCATCACCTTAT-HT228-fetal-ileum,Mono/neutrophil_MPO,Neutrophil_mature
TGACAACAGAAGGGTA-HT228-fetal-ileum,Monocyte,Neutrophil_circulating
TTCTACACATGACATC-HT228-fetal-ileum,Monocyte,Neutrophil_fetal


In [12]:
# Attach the resolved label to every neutrophil cell. The assignment aligns the
# Series on the obs index, so a barcode that is absent from the CSV would
# silently receive NaN; check for that and stop instead of carrying unlabelled
# cells into the merged object.
neutro.obs['level_3_annot'] = neutro_annot['fine_predicted_labels_resolved']
n_unlabelled_neutro = int(neutro.obs['level_3_annot'].isna().sum())
if n_unlabelled_neutro:
    raise ValueError(
        f"{n_unlabelled_neutro} of {neutro.n_obs} cells in `neutro` have no "
        "fine_predicted_labels_resolved label in `neutro_annot` "
        "(index mismatch or missing values)"
    )

In [13]:
# Select the per-cell columns that are retained in the merged object.
# Take an explicit copy: this frame is installed as neutro.obs after the
# original obs is deleted, so it must be an independent table rather than a
# slice of the frame it replaces (this also avoids pandas copy-of-a-slice
# warnings if columns are added to it later).
obs_to_keep = neutro.obs[
    ['donorID_unified', 'level_3_annot', 'log1p_n_counts', 'percent_mito']
].copy()

In [14]:
# Preview the trimmed per-cell metadata selected from neutro.obs.
obs_to_keep

Unnamed: 0_level_0,donorID_unified,level_3_annot,log1p_n_counts,percent_mito
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACCTGCAACTGCGC-HCA_A_GT12934997,D105,Neutrophil_circulating,6.432940,0.000000
AAACCTGCATGGTAGG-HCA_A_GT12934997,D105,Neutrophil_circulating,7.060476,0.000000
AAAGATGAGGACAGCT-HCA_A_GT12934997,D105,Neutrophil_circulating,6.345636,0.000000
AAAGTAGTCAACGAAA-HCA_A_GT12934997,D105,Neutrophil_circulating,6.343881,0.000000
AACACGTAGAGACTAT-HCA_A_GT12934997,D105,Neutrophil_circulating,6.882438,0.000000
...,...,...,...,...
CTACATTGTCTGCCAG-HT228-fetal-ileum,F25,Neutrophil_mature,7.082549,0.504202
CTCGTCATCACCTTAT-HT228-fetal-ileum,F25,Neutrophil_mature,7.309882,0.803213
TGACAACAGAAGGGTA-HT228-fetal-ileum,F25,Neutrophil_circulating,7.782390,1.126408
TTCTACACATGACATC-HT228-fetal-ileum,F25,Neutrophil_fetal,8.425735,0.986409


In [15]:
# Clear the existing per-cell metadata on neutro; the trimmed table held in
# obs_to_keep is assigned back in the next cell.
del neutro.obs

In [16]:
# Install the trimmed metadata table (selected from neutro.obs earlier) as
# neutro's per-cell annotations.
neutro.obs = obs_to_keep

In [17]:
# Largest value stored in neutro.X.
# NOTE(review): presumably a check that X still holds un-normalised counts
# before merging — confirm.
neutro.X.max()

1277.0

In [18]:
# Select the same per-cell columns for the healthy object (note: this rebinds
# obs_to_keep, which previously held the neutrophil subset).
# Take an explicit copy: this frame is installed as healthy.obs after the
# original obs is deleted, so it must be an independent table rather than a
# slice of the frame it replaces (this also avoids pandas copy-of-a-slice
# warnings if columns are added to it later).
obs_to_keep = healthy.obs[
    ['donorID_unified', 'level_3_annot', 'log1p_n_counts', 'percent_mito']
].copy()

In [19]:
# Clear the existing per-cell metadata on healthy; the trimmed table held in
# obs_to_keep is assigned back in a later cell.
del healthy.obs

In [20]:
# Largest value stored in healthy.X.
# NOTE(review): presumably a check that X still holds un-normalised counts
# before merging — confirm.
healthy.X.max()

13567.0

In [21]:
# Install the trimmed metadata as healthy's per-cell annotations.
# obs_to_keep was rebound to the healthy subset a few cells earlier (the same
# name previously held the neutrophil subset), so this cell is only correct
# when run after that rebinding.
healthy.obs = obs_to_keep

In [22]:
# index_unique=None keeps the original barcodes unchanged, so a barcode that
# is present in both objects would produce duplicated obs_names in the merged
# object. Refuse to merge in that case rather than writing an ambiguous index.
shared_barcodes = healthy.obs_names.intersection(neutro.obs_names)
if len(shared_barcodes) > 0:
    raise ValueError(
        f"{len(shared_barcodes)} cell barcodes occur in both `healthy` and `neutro`; "
        "resolve the overlap before concatenating with index_unique=None"
    )

# Stack the cells of both objects; concatenate also adds a 'batch' column to obs.
adata = healthy.concatenate(neutro, index_unique=None)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [23]:
# Inspect the merged object: cell/gene counts and the obs/var fields it carries.
adata

AnnData object with n_obs × n_vars = 39286 × 36601
    obs: 'donorID_unified', 'level_3_annot', 'log1p_n_counts', 'percent_mito', 'batch'
    var: 'gene_ids', 'feature_type', 'mito', 'ribo', 'hb', 'n_counts', 'n_counts_raw', 'n_counts_spliced', 'n_counts_unspliced', 'n_cells', 'n_cells_raw', 'n_cells_spliced', 'n_cells_unspliced', 'cc', 'ig', 'tcr'

In [24]:
# Inspect `healthy` for comparison with the merged object; its repr still lists
# uns/obsm/obsp entries that do not appear in the repr of `adata`.
healthy

AnnData object with n_obs × n_vars = 37393 × 36601
    obs: 'donorID_unified', 'level_3_annot', 'log1p_n_counts', 'percent_mito'
    var: 'gene_ids', 'feature_type', 'mito', 'ribo', 'hb', 'n_counts', 'n_counts_raw', 'n_counts_spliced', 'n_counts_unspliced', 'n_cells', 'n_cells_raw', 'n_cells_spliced', 'n_cells_unspliced', 'cc', 'ig', 'tcr'
    uns: 'age_unified_colors', 'fine_annot_colors', 'neighbors_scvi', 'organ_unified_colors', 'study_colors', 'umap'
    obsm: 'X_scvi', 'X_umap'
    obsp: 'neighbors_scvi_connectivities', 'neighbors_scvi_distances'

In [25]:
# Persist the merged myeloid + neutrophil object to disk.
adata.write_h5ad(
    '/nfs/team205/ao15/Megagut/Annotations_v3/neutrophils/pooled_healthy.Myeloid_withNeutrophils.20240131.h5ad',
)

In [26]:
# Tally the number of cells per level-3 label in the merged object.
adata.obs['level_3_annot'].value_counts()

Macrophage                8594
DC_cDC2                   6381
Mast                      5607
Doublets                  4239
Monocyte                  3792
Macrophage_LYVE1          3627
Macrophage_TREM2          1511
Neutrophil_circulating    1342
DC_cDC1                    946
Macrophage_MMP9            907
DC_pDC                     565
Neutrophil_mature          467
DC_migratory               417
Erythrocytes               350
Mono/neutrophil_MPO        178
DC_langerhans              164
Macrophage_CD5L            100
Eosinophil/basophil         70
Neutrophil_fetal            25
Megakaryocyte/platelet       4
Name: level_3_annot, dtype: int64