In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import session_info
import seaborn as sns
import matplotlib.pyplot as plt
import scienceplots
import pickle

# Specific modules
import scanpy as sc
import anndata as an
from matplotlib.backends.backend_pdf import PdfPages


# Setting some parameters
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Make the project's 'bin' directory importable; it must be on sys.path before the star import below.
sys.path.insert(1, str(here('bin')))

# Import custom functions
# NOTE(review): the star import hides which helpers this notebook relies on; generateID2SymbolDF
# (used in the marker-gene cell) presumably comes from here — consider importing names explicitly.
from customPythonFunctions import *

print("Main directory path: {}".format(here()))

# Optional publication style from the scienceplots package, currently disabled.
#plt.style.use(['science','nature','no-latex'])
# Resolution (dots per inch) used both as scanpy's default save dpi and in the explicit savefig calls.
dpi_fig_save = 300
sc.set_figure_params(dpi=100, dpi_save=dpi_fig_save, vector_friendly=True)

Main directory path: /scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas


**Setting parameters** 

In [2]:
# Run-control flags for this notebook.
# NOTE(review): overwriteFigures and overwriteData are not read by any cell visible in this
# section — confirm they are consumed further down, otherwise they can be removed.
overwriteFigures = True
overwriteData = True
# When True, the pre-computed clustering table is unpickled and its columns are copied into adata.obs.
clusters_computed = True

In [3]:
# Cell-group label; it is interpolated into the input h5ad file name and into the output file names.
cellGroup = 'DC'
# Current working directory of the kernel; every results/ path in this notebook is built from it,
# so the notebook has to be started from the cell group's own folder.
workDir = os.getcwd()

In [4]:
# Clustering resolutions to explore (they select the 'leiden_res_<resolution>' columns downstream).
# NOTE(review): the marker-gene cell reassigns this name to [0.5] before looping, so this list is
# not what that loop iterates over unless the reassignment is removed.
resolutions_of_interest = [0.25, 0.5, 0.75, 1]

**Load data**

In [5]:
# Read the scVI h5ad file of this cell group from the results folder
adata_path = here("{}/results/03_{}_scVI_UMAP_clinical_allGenes.h5ad".format(workDir, cellGroup))
adata = sc.read_h5ad(adata_path)
# Replace the 'log1p' metadata with a single entry, base=None (only if logscale is computed)
adata.uns['log1p'] = {'base': None}
adata

AnnData object with n_obs × n_vars = 47276 × 22838
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'total_counts_plt', 'log1p_total_counts_plt', 'pct_counts_plt', 'doublet_score', 'predicted_doublet', 'S_score', 'G2M_score', 'phase'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'mt', 'ribo', 'hb', 'plt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable'
    uns: '_scvi_m

In [6]:
# Attach the pre-computed clustering columns to adata.obs (skipped when clusters_computed is False).
if clusters_computed:
    # NOTE: unpickling executes code stored in the file — only load pickles written by this project.
    with open(here('{}/results/INFLAMMATION_clustering_resolutions_to_explore.pkl'.format(workDir)), 'rb') as f:
        clusterDF = pickle.load(f)
    # Copy the table column by column into adata.obs.
    # Assumes clusterDF is a pandas DataFrame indexed by cell barcode; the assignment then aligns on
    # that index and cells missing from clusterDF would silently become NaN — TODO confirm.
    for column_name in clusterDF.columns:
        adata.obs[column_name] = clusterDF[column_name]

# Show the object summary. This has to be a top-level expression: inside the `if` block the
# notebook never renders it.
adata

## Compute marker genes

In [7]:
# Output folder for all marker-gene tables and figures; created together with any missing parents
marker_genes_folder = here('{}/results/figures/marker_genes/'.format(workDir))
marker_genes_folder.mkdir(parents=True, exist_ok=True)

In [11]:
# Marker-gene workflow, run once per clustering resolution:
#   (0) create a per-resolution output folder,
#   (1) rank genes per Leiden cluster (Wilcoxon test on the 'log1p_10e4_counts' layer),
#   (2) export the ranking as CSV (unfiltered) and as Excel (filtered, one sheet per cluster),
#   (3) plot the top-ranked genes of every cluster as feature plots on the scVI UMAP and as a dot plot.
n_markers_to_plot = 10  # number of top-ranked genes taken from each cluster for the plots
# NOTE(review): this reassignment overrides the resolutions_of_interest list from the parameters
# cell; remove it to process every configured resolution.
resolutions_of_interest = [0.5]

# Thresholds applied to the Excel export only (the CSV keeps every gene)
excel_min_logfc = 0.25       # keep genes with logfoldchanges above this value
excel_max_padj = 0.05        # keep genes with pvals_adj below this value
excel_min_pct_group = 0.25   # keep genes with pct_nz_group above this value

print("******* Computing marker genes *******")
for res in resolutions_of_interest:
    print("Computing markers for resolution = " + str(res))
    # (0) Generate resolution subfolder
    resolution_folder = here('{}/res_{}/'.format(marker_genes_folder, res))
    os.makedirs(resolution_folder, exist_ok=True)

    # (1) Compute markers: CellGroup markers
    de_key = "de_res_{}".format(res)
    leiden_cluster = 'leiden_res_{}'.format(res)
    sc.tl.rank_genes_groups(adata,
                            groupby=leiden_cluster,
                            layer='log1p_10e4_counts',
                            method='wilcoxon',
                            pts=True,  # also record the fraction of expressing cells (pct_nz_* columns)
                            key_added=de_key,
                            use_raw = False)

    ## (2) Save markers
    rank_genesDF = sc.get.rank_genes_groups_df(adata, group=None, key = de_key, gene_symbols="symbol")
    ### (2.1) As csv (no filtering)
    rank_genesDF.to_csv(here('{}/{}_marker_genes_res{}.csv'.format(resolution_folder, cellGroup, res)))
    ### (2.2) As excel (one sheet per cluster, filtered and sorted by decreasing logfoldchanges)
    rank_genes_excel_path = here('{}/{}_marker_genes_res{}.xlsx'.format(resolution_folder, cellGroup, res))
    unique_groups = rank_genesDF["group"].unique()
    with pd.ExcelWriter(rank_genes_excel_path, engine="openpyxl") as writer:
        for group in unique_groups:
            passes_filters = (
                (rank_genesDF["group"] == group)
                & (rank_genesDF["logfoldchanges"] > excel_min_logfc)
                & (rank_genesDF["pvals_adj"] < excel_max_padj)
                & (rank_genesDF["pct_nz_group"] > excel_min_pct_group)
            )
            # The frame holds a single cluster at this point, so a plain sort replaces the
            # former groupby/apply round trip.
            group_rank_genesDF = rank_genesDF[passes_filters].sort_values(by="logfoldchanges", ascending=False)
            group_rank_genesDF.to_excel(writer, sheet_name=str(group), index=False)

    ## (3) Plot markers
    # First n_markers_to_plot entries of each cluster's ranked gene list
    computed_marker_genes = {}
    for cluster in adata.obs[leiden_cluster].cat.categories:
        marker_genes = adata.uns[de_key]['names'][cluster][:n_markers_to_plot]
        computed_marker_genes[f"Cluster_{cluster}"] = marker_genes

    # Pool the markers of all clusters, dropping duplicates while keeping first-seen order.
    # (A set would shuffle the genes differently in every kernel session, making the figures
    # non-reproducible.)
    list_of_marker_genes = list(dict.fromkeys(
        gene_id
        for value in computed_marker_genes.values()
        for gene_id in value.tolist()
    ))

    # Translate gene identifiers (adata.var index) into symbols; unknown identifiers map to None
    list_of_marker_genes_symbol = [
        adata.var.loc[gene_id, "symbol"] if gene_id in adata.var.index else None
        for gene_id in list_of_marker_genes
    ]

    # Project helper; the code below reads its 'gene_id' and 'symbol' columns.
    ID2SymbolDF = generateID2SymbolDF(varDF = adata.var, symbolList = list_of_marker_genes_symbol,
                                  ID_col = 'index', symbols_col = 'symbol', HUGOstatus_col = 'HUGO_status', behaviour = 'all')

    ## (3.1) FeaturePlot
    # NOTE(review): both `s` and `size` are passed; which of the two sets the dot size depends on
    # scanpy's handling — confirm and keep only one.
    fig = sc.pl.embedding(adata = adata,
                          basis="X_umap_scVI",
                          color=ID2SymbolDF["gene_id"], title= ID2SymbolDF["symbol"], s=10,
                          show=True,
                          return_fig=True,
                          vmin="p1", vmax="p99",
                          size=30,
                          legend_loc="on data",
                          use_raw=False)
    # Save through the returned figure instead of relying on pyplot's implicit "current figure"
    fig.savefig(here('{}/FeaturePlot_{}_res{}_Clusters_top{}Markers.pdf'.format(resolution_folder, cellGroup, res, n_markers_to_plot)), bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save)

    ## (3.2) DotPlot
    # Scope the figure size to this plot only. Calling sc.set_figure_params here would reset the
    # global dpi / dpi_save chosen at the top of the notebook and leak the 15x7 figsize into the
    # feature plots of the next resolution and into every later cell.
    # NOTE(review): sc.pl.dotplot also takes an explicit `figsize` argument; confirm whether the
    # rcParams value is actually honoured by the dot plot.
    with plt.rc_context({"figure.figsize": (15, 7)}):
        # With show=False and return_fig=False the call hands back a dict of matplotlib axes
        dotplot_axes = sc.pl.dotplot(adata,
                                     var_names = ID2SymbolDF["gene_id"],
                                     groupby=leiden_cluster, standard_scale='var',
                                     use_raw = False, dendrogram=True, show=False, return_fig=False)
    # replace ensembl ids by gene symbol in plot
    _ = dotplot_axes['mainplot_ax'].set_xticklabels(ID2SymbolDF["symbol"])
    dotplot_axes['mainplot_ax'].figure.savefig(here('{}/Dotplot_{}_res{}_Clusters_top{}Markers.pdf'.format(resolution_folder, cellGroup, res, n_markers_to_plot)), bbox_inches='tight', pad_inches=0, dpi=dpi_fig_save)

******* Computing marker genes *******
Computing markers for resolution = 0.25
Computing markers for resolution = 0.5


Exception ignored in: <function ZipFile.__del__ at 0x7f51da1b09d0>
Traceback (most recent call last):
  File "/scratch_isilon/groups/singlecell/shared/conda_env/inflammation_atlas_R1/lib/python3.10/zipfile.py", line 1821, in __del__
    self.close()
  File "/scratch_isilon/groups/singlecell/shared/conda_env/inflammation_atlas_R1/lib/python3.10/zipfile.py", line 1838, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


Computing markers for resolution = 0.75
Computing markers for resolution = 1
