In [None]:
from pathlib import Path
import re
import warnings
import itertools

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from scipy.sparse import csr_matrix
import scipy.stats as spss
import matplotlib_venn

import bioquest as bq
import sckit as sk

In [None]:
# Notebook-wide setup: silence every warning category and make sure the
# output folder for this analysis step exists before anything is written.
warnings.filterwarnings("ignore")
OUTPUT_DIR = "output/SC03.GSEA"
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)
# sk.export is a project helper (not shown here); it is configured to target
# OUTPUT_DIR with the 'pdf' format only.
export = sk.export(od=OUTPUT_DIR, formats=('pdf',))

In [None]:
# Load the AnnData object stored under output/SC02.score
# (presumably written by the preceding scoring step -- confirm).
adata = sc.read_h5ad("output/SC02.score/adata.h5ad")

In [None]:
def gsea(adata,key,value,gene_sets,n_jobs=8):
    """Run phenotype-permutation GSEA for one group of cells against all others.

    Cells whose ``adata.obs[key]`` equals ``value`` are labelled 0 and every
    other cell is labelled 1. The cells are reordered so that the label-0
    cells come first, gene names are upper-cased, and the genes x cells
    expression table is passed to ``gseapy.gsea`` together with the labels.

    All labelling and renaming happens on a private copy, so the ``adata``
    passed in by the caller is left untouched (no ``TempGroup`` column is
    added to it and its ``var_names`` keep their original case).

    Parameters
    ----------
    adata
        AnnData-like object providing ``obs``, ``var_names``, ``to_df()`` and
        positional row indexing.
    key : str
        Column of ``adata.obs`` holding the grouping variable.
    value
        The level of ``key`` to contrast against the rest.
    gene_sets
        Forwarded unchanged to ``gseapy.gsea`` (``gene_sets`` argument).
    n_jobs : int
        Forwarded to ``gseapy.gsea`` as ``threads``.

    Returns
    -------
    The object returned by ``gseapy.gsea``.

    Raises
    ------
    ValueError
        If no cell, or every cell, matches ``value``: a two-group comparison
        is impossible in that case.
    """
    is_target = np.asarray(adata.obs[key] == value)
    n_target = int(is_target.sum())
    if n_target == 0 or n_target == is_target.size:
        raise ValueError(
            f"Cannot contrast {key}=={value!r} against the rest: "
            f"{n_target} of {is_target.size} cells match."
        )

    # Imported only after the cheap input validation above has passed.
    import gseapy

    # Stable sort on the negated mask puts the target cells (label 0) first
    # while preserving the original cell order inside each group.
    order = np.argsort(~is_target, kind="stable")
    # Explicit copy: the labels and renamed genes must not leak back into the
    # caller's object.
    subset = adata[order, :].copy()
    subset.obs["TempGroup"] = pd.Categorical(
        np.where(is_target[order], 0, 1), categories=[0, 1], ordered=True
    )
    subset.var_names = [gene.upper() for gene in subset.var_names]

    res = gseapy.gsea(data=subset.to_df().T, # row -> genes, column-> samples
        gene_sets=gene_sets,
        cls=subset.obs["TempGroup"],
        permutation_num=1000,
        permutation_type='phenotype',
        outdir=None,
        method='s2n', # signal_to_noise
        threads= n_jobs)

    return res

In [None]:
# Run GSEA against GO_Biological_Process_2021 once per CellType category,
# tag each result table with its cell type, then stack the tables and write
# them to a single CSV under OUTPUT_DIR.
alist = []
for x in adata.obs["CellType"].cat.categories:
    res = gsea(adata, key="CellType", value=x, gene_sets="GO_Biological_Process_2021")
    # Highest NES first; the "Name" column is dropped from the output table.
    ranked = res.res2d.sort_values(by="NES", ascending=False)
    ranked = ranked.drop(columns="Name")
    ranked.insert(0, "CellType", x)
    alist.append(ranked)
df = pd.concat(alist)
df.to_csv(f"{OUTPUT_DIR}/CellType_GOBP.csv", index=False)

In [None]:
# Reload the combined per-cell-type GSEA table from the CSV in OUTPUT_DIR;
# the cells below work from this on-disk copy.
df=pd.read_csv(f"{OUTPUT_DIR}/CellType_GOBP.csv")

In [None]:
# Preview the first two rows of the reloaded table.
df.head(2)

In [None]:
def dotplot(frame,
            x="CellType",
            output_dir=None,
            fname=None,
            gene_sets=None,
            organism='human',
            pvalue_threshold=1.0,
            figsize=(6, 10),
            top_term=6,
            dotsize=5,
            ):
    """Draw a gseapy dot plot of enrichment results and optionally save it as PDF.

    Parameters
    ----------
    frame : pandas.DataFrame
        Enrichment table. It must contain a ``Term`` column, the
        ``FDR q-val`` column used for colouring, and the column named by ``x``.
        The frame itself is not modified.
    x : str
        Column of ``frame`` used for the x axis (one group per distinct value).
    output_dir : str or path-like, optional
        Folder for the saved figure; created if missing. Defaults to the
        current working directory when ``fname`` is given.
    fname : str, optional
        If truthy, the figure is written to ``<output_dir>/<fname>_enrich.pdf``.
    gene_sets : dict, optional
        Mapping of long gene-set library names to short labels, applied with
        ``DataFrame.replace`` to every matching cell of ``frame``. Defaults to
        GO BP/MF/CC 2021 plus the KEGG library selected by ``organism``.
    organism : str
        ``"mouse"`` selects ``KEGG_2019_Mouse``; anything else selects
        ``KEGG_2021_Human``. Only used when ``gene_sets`` is None.
    pvalue_threshold : float
        NOTE(review): accepted but currently not forwarded to gseapy, so
        gseapy's own default cutoff applies. Wiring it in would change
        existing figures -- confirm the intended behaviour before doing so.
    figsize, top_term, dotsize
        Forwarded to ``gseapy.dotplot`` as ``figsize``, ``top_term`` and
        ``size``.

    Returns
    -------
    None
    """
    import gseapy
    if gene_sets is None:
        kegg_release = "2019_Mouse" if organism == "mouse" else "2021_Human"
        gene_sets = {"GO_Biological_Process_2021": "GOBP",
                     "GO_Molecular_Function_2021": "GOMF",
                     "GO_Cellular_Component_2021": "GOCC",
                     f"KEGG_{kegg_release}": "KEGG",
                     }

    # replace() returns a new frame, so the caller's table stays untouched.
    res = frame.replace(gene_sets)
    # bq.st.removes is a project helper (not shown); presumably it strips the
    # parenthesised suffix (e.g. a GO accession) from each term -- confirm.
    res.loc[:, "Term"] = bq.st.removes(string=res.Term, pattern=r"\(.+\)")
    ax = gseapy.dotplot(res,
                        column="FDR q-val",
                        x=x,  # honour the caller's grouping column
                        size=dotsize,
                        top_term=top_term,
                        figsize=figsize,
                        title='',
                        xticklabels_rot=45,  # rotate xtick labels
                        show_ring=False,  # no outer ring around the dots
                        marker='o',
                        )
    ax.set_xlabel(xlabel="")
    if fname:
        target_dir = Path("." if output_dir is None else output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        # Save the figure that owns the returned axes rather than pyplot's
        # "current" figure, which is not guaranteed to be the same object.
        ax.figure.savefig(target_dir / f"{fname}_enrich.pdf",
                          bbox_inches='tight', dpi=300)


In [None]:
# Dot plot of the reloaded GSEA table grouped by CellType (top_term=3);
# with fname set, the figure is saved as CellType_enrich.pdf under OUTPUT_DIR.
dotplot(df,x="CellType",figsize=(6, 10),dotsize=5,top_term=3,output_dir=OUTPUT_DIR,fname="CellType")