In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import session_info
import seaborn as sns
import matplotlib.pyplot as plt
import scienceplots
import pickle

# Specific modules
import scanpy as sc
import anndata as an
from matplotlib.backends.backend_pdf import PdfPages


# Setting some parameters
# NOTE(review): this silences every warning for the rest of the session,
# deprecation notices included — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Make the project's `bin` directory importable; this has to run before the
# custom-function import below.
sys.path.insert(1, str(here('bin')))

# Import custom functions
# NOTE(review): the star import hides which helper names this notebook relies on.
from customPythonFunctions import *

# Show the project root resolved by pyprojroot's here()
print("Main directory path: {}".format(here()))

#plt.style.use(['science','nature','no-latex'])
# Figure settings: on-screen figures at 100 dpi, saved figures at `dpi_fig_save` dpi
dpi_fig_save = 300
sc.set_figure_params(dpi=100, dpi_save=dpi_fig_save, vector_friendly=True)

Main directory path: /scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas


**Setting parameters** 

In [2]:
# Overwrite switches. Presumably they decide whether existing figure / data
# files are regenerated — TODO confirm where they are consumed.
overwriteFigures = True
overwriteData = True

**Load data**

In [4]:
# Read the annotated AnnData object; the path is resolved relative to the project root
annotated_h5ad_path = here("02_fromCellLineages_to_CellTypes/INFLAMMATION_main_annotated_LowQFilt.h5ad")
adata = sc.read_h5ad(annotated_h5ad_path)
adata

AnnData object with n_obs × n_vars = 4435922 × 22838
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'

In [8]:
# Drop the cells whose Level1 label is one of the excluded populations
excluded_level1 = ['Platelets', 'RBC', 'Cycling_cells']
keep_cell = ~adata.obs['Level1'].isin(excluded_level1)
# .copy() turns the filtered view into an independent AnnData object
adata = adata[keep_cell, :].copy()
adata

AnnData object with n_obs × n_vars = 4234171 × 22838
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'

In [9]:
# List the distinct Level1 labels present in the filtered object
list(adata.obs["Level1"].unique())

['T_CD8_NonNaive',
 'T_CD4_Naive',
 'Mono',
 'T_CD4_NonNaive',
 'ILC',
 'B',
 'T_CD8_Naive',
 'DC',
 'pDC',
 'UTC',
 'Plasma',
 'Progenitors']

## Compute marker genes

In [12]:
# Library-size normalisation (target of 1e4 counts per cell) followed by log1p.
# Both calls overwrite adata.X in place, so the cell is guarded: without the
# check, running it a second time would normalise and log-transform the
# already transformed matrix again.
# NOTE(review): the loaded object already carries uns['log1p'] — confirm that
# adata.X still holds raw counts at this point.
# NOTE: the layer name says "10e4" although target_sum is 1e4; the key is kept
# unchanged because the marker computation reads the layer by this name.
if "log1p_10e4_counts" not in adata.layers:
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    # Keep a copy of the transformed matrix as a named layer
    adata.layers["log1p_10e4_counts"] = adata.X.copy()
adata



AnnData object with n_obs × n_vars = 4234171 × 22838
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p', 'DE_Level1'
    layers: 'log1p_10e4_counts'

In [None]:
# Rank genes for every Level1 population with a Wilcoxon test, reading the
# expression values from the log-normalised layer instead of adata.raw.
# pts=True additionally stores the fraction of cells expressing each gene.
# Results are written to adata.uns["DE_Level1"].
de_settings = dict(
    groupby="Level1",
    layer='log1p_10e4_counts',
    use_raw=False,
    method='wilcoxon',
    pts=True,
    key_added="DE_Level1",
)
sc.tl.rank_genes_groups(adata, **de_settings)

## Save markers
# Thresholds applied to the per-population sheets of the Excel workbook
# (the CSV keeps the complete, unfiltered table).
MIN_LOGFOLDCHANGE = 0.25   # keep genes with logfoldchanges above this value
MAX_PVAL_ADJ = 0.05        # keep genes with pvals_adj below this value
MIN_PCT_NZ_GROUP = 0.25    # keep genes with pct_nz_group above this value

# Long-format table with one row per (group, gene) plus the gene symbol
rank_genesDF = sc.get.rank_genes_groups_df(adata, group=None, key = "DE_Level1", gene_symbols="symbol")

rank_genes_csv_path = here('02_cell_annotation/03_characterizing_CellTypes/results/DEgenes_INFLAMMATION_main_annotated_Level1.csv')
rank_genes_excel_path = here('02_cell_annotation/03_characterizing_CellTypes/results/DEgenes_INFLAMMATION_main_annotated_Level1.xlsx')

# Create the output folder up front so the writers do not fail when it is missing
rank_genes_csv_path.parent.mkdir(parents=True, exist_ok=True)

# Existing files are only replaced when overwriteData (parameters cell) is True
if overwriteData or not rank_genes_csv_path.exists():
    rank_genesDF.to_csv(rank_genes_csv_path)

unique_groups = rank_genesDF["group"].unique()
if overwriteData or not rank_genes_excel_path.exists():
    with pd.ExcelWriter(rank_genes_excel_path, engine="openpyxl") as writer:
        for group in unique_groups:
            # One boolean mask per population: membership plus the three thresholds
            is_marker = (
                (rank_genesDF["group"] == group)
                & (rank_genesDF["logfoldchanges"] > MIN_LOGFOLDCHANGE)
                & (rank_genesDF["pvals_adj"] < MAX_PVAL_ADJ)
                & (rank_genesDF["pct_nz_group"] > MIN_PCT_NZ_GROUP)
            )
            # The frame holds a single group here, so it is sorted directly;
            # a groupby/apply would be redundant and depends on apply operating
            # on the grouping column, which newer pandas deprecates.
            group_rank_genesDF = rank_genesDF[is_marker].sort_values(by="logfoldchanges", ascending=False)
            # Excel rejects sheet names longer than 31 characters
            group_rank_genesDF.to_excel(writer, sheet_name=str(group)[:31], index=False)

In [None]:
# Display the session report produced by the session_info package
# (presumably package versions, for reproducibility — TODO confirm)
session_info.show()