In [1]:
# Parameters
# Level1 label used below to subset the atlas and to name the output files.
# NOTE(review): presumably injected by a parameterised runner (e.g. papermill) — confirm.
celltype = "UTC"


In [2]:
# Fail fast when a required notebook parameter has not been defined.
# At cell (module) level locals() and globals() are the same mapping, so a
# direct lookup in globals() replaces the original eval() of the variable name.
for v in ['celltype']:
    if v in globals():
        print(f"{v} = {globals()[v]}")
    else:
        raise ValueError(f"{v} not provided")

celltype = UTC


In [3]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import session_info
import seaborn as sns
import matplotlib.pyplot as plt
import scienceplots
import pickle

# Specific modules
import scanpy as sc
import anndata as an
from matplotlib.backends.backend_pdf import PdfPages


# Setting some parameters
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Put the project's `bin` directory on the import path; this has to run before
# the custom-function import below.
sys.path.insert(1, str(here('bin')))

# Import custom functions
# NOTE(review): star import — none of the names it brings in is used by the cells
# visible in this notebook; confirm whether the import is still needed.
from customPythonFunctions import *

print("Main directory path: {}".format(here()))

# Plot style from `scienceplots`, currently disabled:
#plt.style.use(['science','nature','no-latex'])
# Resolution used when figures are saved; on-screen figures use dpi=100.
dpi_fig_save = 300
sc.set_figure_params(dpi=100, dpi_save=dpi_fig_save, vector_friendly=True)

Main directory path: /scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas


**Setting parameters** 

In [4]:
# Overwrite switches for figures and data files.
# NOTE(review): neither flag is read by any cell visible in this notebook — the
# save cells below write their files unconditionally; confirm whether they should be honoured.
overwriteFigures = True
overwriteData = True

**Load data**

In [5]:
# Read the annotated AnnData object (.h5ad) from the project directory,
# then display its summary as the cell output.
adata = sc.read_h5ad(
    here("02_fromCellLineages_to_CellTypes/INFLAMMATION_main_annotated_LowQFilt.h5ad")
)
adata

AnnData object with n_obs × n_vars = 4435922 × 22838
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [6]:
# Subset by celltype
# Keep only the cells whose Level1 label equals the `celltype` parameter;
# .copy() yields an independent object instead of a view on `adata`.
adata_sub = adata[adata.obs['Level1'] == celltype, :].copy()

# A label that matches no cells would otherwise only fail later, during
# normalisation or marker ranking, with a far less obvious error.
if adata_sub.n_obs == 0:
    raise ValueError(f"No cells with Level1 == {celltype!r}; check the 'celltype' parameter")
adata_sub

AnnData object with n_obs × n_vars = 115097 × 22838
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

## Compute marker genes

In [7]:
# Normalise each cell to 1e4 total counts, log1p-transform, and keep a copy of
# the result in a layer. Both scanpy calls overwrite adata_sub.X in place, so the
# guard makes the cell safe to re-run: without it a second execution would
# normalise and log-transform already transformed values.
# NOTE(review): the layer key says "10e4" while target_sum is 1e4; the key is
# kept unchanged because the marker-gene cell refers to it by this name.
if "log1p_10e4_counts" not in adata_sub.layers:
    sc.pp.normalize_total(adata_sub, target_sum=1e4)
    sc.pp.log1p(adata_sub)
    adata_sub.layers["log1p_10e4_counts"] = adata_sub.X.copy()
adata_sub

AnnData object with n_obs × n_vars = 115097 × 22838
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'
    layers: 'log1p_10e4_counts'

In [8]:
# Compute markers
# Rank genes for every Level2 group with a Wilcoxon test on the log-normalised
# layer (not on .raw). pts=True additionally stores the fraction of expressing
# cells, which the export step filters on (`pct_nz_group`). Results are written
# to adata_sub.uns under the key "DE_Level2".
sc.tl.rank_genes_groups(
    adata_sub,
    groupby="Level2",
    method="wilcoxon",
    layer="log1p_10e4_counts",
    use_raw=False,
    pts=True,
    key_added="DE_Level2",
)

## Save markers
# Long-format table with the ranking of every Level2 group (group=None keeps all
# groups); written unfiltered to CSV, index included.
rank_genesDF = sc.get.rank_genes_groups_df(adata_sub, group=None, key="DE_Level2", gene_symbols="symbol")
rank_genesDF.to_csv(here('02_fromCellLineages_to_CellTypes/DE_markerGenes/DEgenes_INFLAMMATION_main_annotated_Level2_{}.csv'.format(celltype)))

# Excel workbook with one sheet per Level2 group, holding only the genes that
# pass the marker thresholds, sorted by decreasing log fold change.
# NOTE(review): Excel limits sheet names to 31 characters; confirm that no
# Level2 label is longer, otherwise to_excel raises.
rank_genes_excel_path = here('02_fromCellLineages_to_CellTypes/DE_markerGenes/DEgenes_INFLAMMATION_main_annotated_Level2_{}.xlsx'.format(celltype))
unique_groups = rank_genesDF["group"].unique()
with pd.ExcelWriter(rank_genes_excel_path, engine="openpyxl") as writer:
    for group in unique_groups:
        # One boolean mask instead of four successive re-assignments.
        is_marker = (
            (rank_genesDF["group"] == group)
            & (rank_genesDF["logfoldchanges"] > 0.25)
            & (rank_genesDF["pvals_adj"] < 0.05)
            & (rank_genesDF["pct_nz_group"] > 0.25)
        )
        # The frame holds a single group here, so a plain sort is enough. The
        # previous groupby("group").apply(...) relied on pandas passing the
        # grouping column to the applied function, which is deprecated since
        # pandas 2.2.
        group_rank_genesDF = rank_genesDF[is_marker].sort_values(by="logfoldchanges", ascending=False)
        group_rank_genesDF.to_excel(writer, sheet_name=group, index=False)

In [4]:
# Generate for manuscript
# Re-load the full marker table from disk, keep the genes passing the marker
# thresholds and export them as a single CSV / Excel table.
# NOTE(review): this cell reads from and writes to
# '02_cell_annotation/03_characterizing_CellTypes/results/', whereas the cell that
# computes the markers writes its CSV to '02_fromCellLineages_to_CellTypes/DE_markerGenes/'.
# Its execution count also indicates it was run out of order with the other cells.
# Confirm which location is current, so a Restart & Run All does not pick up a stale file.

rank_genesDF = pd.read_csv(here('02_cell_annotation/03_characterizing_CellTypes/results/DEgenes_INFLAMMATION_main_annotated_Level2_{}.csv'.format(celltype)), index_col="Unnamed: 0")

# Marker thresholds: log fold change > 0.25, adjusted p-value < 0.05 and
# pct_nz_group > 0.25.
is_marker = (
    (rank_genesDF["logfoldchanges"] > 0.25)
    & (rank_genesDF["pvals_adj"] < 0.05)
    & (rank_genesDF["pct_nz_group"] > 0.25)
)
# Order by group, then by decreasing log fold change within each group. This
# replaces groupby("group").apply(sort_values), which depended on the grouping
# column being passed to the applied function (deprecated since pandas 2.2);
# only the relative order of exact ties within a group may differ.
rank_genesDF = rank_genesDF[is_marker].sort_values(
    by=["group", "logfoldchanges"], ascending=[True, False]
)

rank_genesDF.to_csv(here('02_cell_annotation/03_characterizing_CellTypes/results/DEgenes_INFLAMMATION_main_annotated_Level2_{}_unique.csv'.format(celltype)), index=False)
rank_genesDF.to_excel(here('02_cell_annotation/03_characterizing_CellTypes/results/DEgenes_INFLAMMATION_main_annotated_Level2_{}_unique.xlsx'.format(celltype)), index=False)

In [9]:
# Record the session's package versions with the notebook output for reproducibility.
session_info.show()