In [1]:
from eco_helper.core import *
import eco_helper.core.settings as settings
import os, glob
import shutil
import pandas as pd
from alive_progress import alive_bar
import gseapy as gp

In [2]:
# Directory with the exported gene sets. It is passed below as both the input
# and the output location of the gseapy runs and as the location of the raw
# results that the assemble functions read.
directory = "/data/users/noahkleinschmidt/EcoTyper/results/exports"

In [28]:
def enrichr( directory : str, outdir : str, gene_sets : list or str = "KEGG_2021_Human", organism : str = "human" ):
    """
    Perform gene set enrichment using `gseapy enrichr` for each cell type and each cell-state therein.

    Each regular file in the input directory is submitted to `gseapy.enrichr`. The reports that
    gseapy writes into a temporary directory are concatenated and saved as one tab-separated
    file per input file, named `<input file name><settings.enrichr_results_suffix>`.
    Input files for which gseapy produced no report are skipped (no output file is written).
    The temporary directory is removed when the function finishes, also if an error occurred.

    Parameters
    ----------
    directory : str
        The path to the directory where extracted gene sets are stored in separate text files.
        Note, if both prerank and enrichr sets were extracted, this will automatically adjust to the `gseapy_enrichr` subdirectory if necessary.
    outdir : str
        The path to the output directory.
    gene_sets : list or str
        The gene sets to use for enrichment analysis. By default the latest KEGG gene sets are used.
    organism : str
        The organism to use for enrichment analysis. By default "human" is assumed.
    """

    if isinstance( gene_sets, str ):
        gene_sets = [gene_sets]

    directory, tmpdir, outdir = _enrichr_prep_directories(directory, outdir)

    try:
        # Only regular files are gene lists. The output and temporary directories
        # may be located inside the input directory and must not be submitted.
        files = [
                    name for name in sorted( os.listdir( directory ) )
                    if os.path.isfile( os.path.join( directory, name ) )
                ]

        with alive_bar( len( files ), title = "Performing gseapy enrichr" ) as bar:
            for file in files:

                # prepare the filenames for enrichment analysis
                infile = os.path.join( directory, file )
                outfile = f"{file}{settings.enrichr_results_suffix}"

                # remove reports left over from a previous input (or an aborted run)
                # so that they cannot be merged into the results of this input
                for stale in glob.glob( os.path.join( tmpdir, "*.txt" ) ):
                    os.remove( stale )

                # perform enrichr analysis
                gp.enrichr(
                            gene_list = infile,
                            outdir = tmpdir,
                            gene_sets = gene_sets,
                            organism = organism,
                            no_plot = True,
                        )

                # merge the temporary report files into one file for this input
                reports = sorted( glob.glob( os.path.join( tmpdir, "*.txt" ) ) )
                if reports:
                    final = pd.concat( [ pd.read_csv( report, sep = "\t" ) for report in reports ] )
                    final.to_csv( os.path.join( outdir, outfile ), sep = "\t", index = False )

                bar()
    finally:
        # remove the temporary directory without going through a shell,
        # so paths containing spaces or special characters are handled correctly
        shutil.rmtree( tmpdir, ignore_errors = True )

def _enrichr_prep_directories(directory, outdir):
    """
    Prepare the input and output directories for enrichr analysis. This will also create the tmpdir.

    Parameters
    ----------
    directory : str
        The path to the directory where extracted gene sets are stored in separate text files.
        Note, if both prerank and enrichr sets were extracted, this will automatically adjust to the `gseapy_enrichr` subdirectory if necessary.
    outdir : str
        The path to the output directory.

    Returns
    -------
    directory : str
        The (possibly adjusted) input directory.
    tmpdir : str
        The `__tmp` directory inside the output directory. It is created if it does not exist.
    outdir : str
        The (possibly adjusted) output directory.
    """
    # descend into the enrichr-specific subdirectory if one exists
    if settings.enrichr_outdir in os.listdir( directory ):
        directory = os.path.join( directory, settings.enrichr_outdir )

    # never write results next to the input files, use a dedicated subdirectory instead
    if outdir == directory:
        outdir = os.path.join( outdir, settings.gseapy_outdir )

    # makedirs also creates the output directory itself in case it does not exist yet
    # (a plain mkdir would fail with a missing parent directory)
    tmpdir = os.path.join( outdir, "__tmp" )
    os.makedirs( tmpdir, exist_ok = True )

    return directory, tmpdir, outdir


# Run enrichr with the KEGG and Reactome gene sets. The same path is passed as input and
# output directory, so `_enrichr_prep_directories` decides where the results are written.
enrichr( directory, directory, gene_sets = ["KEGG_2021_Human", "Reactome_2016"] )

TypeError: unhashable type: 'types.SimpleNamespace'

In [19]:
# Path that is handed to `CellTypeCollection` further below to obtain the cell types.
source_results = "/data/users/noahkleinschmidt/EcoTyper/results/drop_hepato"

def assemble_enrichr_results( directory : str, cell_types : CellTypeCollection, outdir : str = None, remove_raw : bool = True ):
    """
    Assemble the per-state enrichr output text files into a single file per cell type. 

    For each cell type all files matching `<cell_type>_*` in the directory are read as
    tab-separated tables, a column named `settings.state_col` holding the state is added
    as first column, and the concatenated table is written to
    `<outdir>/<cell_type><settings.enrichr_results_suffix>`.
    Cell types without any matching file are skipped.

    Parameters
    ----------
    directory : str
        The directory storing the raw enrichr output files for each cell type and state.
    cell_types : CellTypeCollection
        The cell types to use for assembling the results.
    outdir : str
        The output directory to store the assembled results. 
        If not specified, the results will be stored in the same directory as the raw enrichr results.
    remove_raw : bool
        If True, the raw enrichr results will be removed after assembling.
    """
    if outdir is None:
        outdir = directory
    for cell_type in cell_types:

        # get the corresponding enrichr files (regular files only)
        files = [
                    path for path in sorted( glob.glob( os.path.join( directory, cell_type + "_*" ) ) )
                    if os.path.isfile( path )
                ]

        # nothing to assemble for this cell type (concatenating an empty list would raise)
        if not files:
            continue

        # get the states associated with each file by splitting through <cell_type>_<state>.txt.enrichr.txt
        # (the text between the last underscore and the first dot that follows it)
        states = [ i[ i.rfind( "_" ) + 1 : ].split(".")[0] for i in files ]

        # now read the dataframes and add the state
        dfs = [ pd.read_csv( file, sep = "\t" ) for file in files ]
        for df, state in zip(dfs, states): 
            df.insert( 0, settings.state_col, state ) 

        # concatenate and save the final dataframe
        final = pd.concat( dfs )
        outfile = os.path.join( outdir, f"{cell_type}{settings.enrichr_results_suffix}" )
        final.to_csv( outfile, sep = "\t", index = False )

        if remove_raw:
            # delete file by file instead of building a shell command,
            # so paths with spaces or special characters are handled correctly
            for file in files:
                os.remove( file )
        
# Build the cell type collection from `source_results` and assemble the enrichr results.
# `remove_raw` is left at its default (True), so the per-state files are deleted afterwards.
cell_types = CellTypeCollection( source_results )
assemble_enrichr_results( directory, cell_types ) 

In [5]:
def prerank( directory : str, outdir : str, gene_sets : list or str = "KEGG_2021_Human", organism : str = "human", min_size : int = 5, max_size = 500, permutations : int = 1000, **kwargs ):
    """
    Perform gene set enrichment using `gseapy prerank` for each cell type and each cell-state therein.

    Each regular file in the input directory is submitted to `gseapy.prerank`. The CSV reports
    that gseapy writes into a temporary directory are concatenated and saved as one tab-separated
    file per input file, named `<input file name><settings.prerank_results_suffix>`.
    Input files for which gseapy produced no report are skipped (no output file is written).
    The temporary directory is removed when the function finishes, also if an error occurred.

    Parameters
    ----------
    directory : str
        The path to the directory where extracted gene sets are stored in separate text files.
        Note, if both prerank and enrichr sets were extracted, this will automatically adjust to the `gseapy_prerank` subdirectory if necessary.
    outdir : str
        The path to the output directory.
    gene_sets : list or str
        The gene sets to use for enrichment analysis. By default the latest KEGG gene sets are used.
    organism : str
        The organism to use for enrichment analysis. By default "human" is assumed.
    min_size : int
        The minimum number of genes required to be found in a gene set. 
    max_size : int
        The maximum number of genes allowed to be found in a gene set.
    permutations : int
        The number of permutations to use for the permutation test.
    **kwargs
        Any additional keyword arguments to pass to `gseapy.prerank`.
    """

    if isinstance( gene_sets, str ):
        gene_sets = [gene_sets]

    directory, tmpdir, outdir = _prerank_prep_directories(directory, outdir)

    try:
        # Only regular files are ranked gene lists. The output and temporary directories
        # may be located inside the input directory and must not be submitted.
        files = [
                    name for name in sorted( os.listdir( directory ) )
                    if os.path.isfile( os.path.join( directory, name ) )
                ]

        with alive_bar( len( files ), title = "Performing gseapy prerank" ) as bar:
            for file in files:

                # prepare the filenames for enrichment analysis
                infile = os.path.join( directory, file )
                outfile = f"{file}{settings.prerank_results_suffix}"

                # remove reports left over from a previous input (or an aborted run)
                # so that they cannot be merged into the results of this input
                for stale in glob.glob( os.path.join( tmpdir, "*.csv" ) ):
                    os.remove( stale )

                # perform prerank analysis
                gp.prerank(
                            rnk = infile,
                            outdir = tmpdir,
                            gene_sets = gene_sets,
                            organism = organism,
                            min_size = min_size,
                            max_size = max_size,
                            permutation_num = permutations,
                            no_plot = True,
                            **kwargs
                        )

                # merge the temporary report files into one file for this input
                # NOTE: prerank makes CSV files not TSV files!
                reports = sorted( glob.glob( os.path.join( tmpdir, "*.csv" ) ) )
                if reports:
                    final = pd.concat( [ pd.read_csv( report, sep = "," ) for report in reports ] )
                    final.to_csv( os.path.join( outdir, outfile ), sep = "\t", index = False )

                bar()
    finally:
        # remove the temporary directory without going through a shell,
        # so paths containing spaces or special characters are handled correctly
        shutil.rmtree( tmpdir, ignore_errors = True )

def _prerank_prep_directories(directory, outdir):
    """
    Prepare the input and output directories for prerank analysis. This will also create the tmpdir.

    Parameters
    ----------
    directory : str
        The path to the directory where extracted gene sets are stored in separate text files.
        Note, if both prerank and enrichr sets were extracted, this will automatically adjust to the `gseapy_prerank` subdirectory if necessary.
    outdir : str
        The path to the output directory.

    Returns
    -------
    directory : str
        The (possibly adjusted) input directory.
    tmpdir : str
        The `__tmp` directory inside the output directory. It is created if it does not exist.
    outdir : str
        The (possibly adjusted) output directory.
    """
    # descend into the prerank-specific subdirectory if one exists
    if settings.prerank_outdir in os.listdir( directory ):
        directory = os.path.join( directory, settings.prerank_outdir )

    # never write results next to the input files, use a dedicated subdirectory instead
    if outdir == directory:
        outdir = os.path.join( outdir, settings.gseapy_outdir )

    # makedirs also creates the output directory itself in case it does not exist yet
    # (a plain mkdir would fail with a missing parent directory)
    tmpdir = os.path.join( outdir, "__tmp" )
    os.makedirs( tmpdir, exist_ok = True )

    return directory, tmpdir, outdir


# Run prerank with the KEGG and Reactome gene sets. The same path is passed as input and
# output directory, so `_prerank_prep_directories` decides where the results are written.
prerank( directory, directory, gene_sets = ["KEGG_2021_Human", "Reactome_2016"] )

TypeError: unhashable type: 'types.SimpleNamespace'

In [6]:
def assemble_prerank_results( directory : str, cell_types : CellTypeCollection, outdir : str = None, remove_raw : bool = True ):
    """
    Assemble the per-state prerank output text files into a single file per cell type. 

    For each cell type all files matching `<cell_type>_*` in the directory are read as
    tab-separated tables, a column named `settings.state_col` holding the state is added
    as first column, and the concatenated table is written to
    `<outdir>/<cell_type><settings.prerank_results_suffix>`.
    Cell types without any matching file are skipped.

    Parameters
    ----------
    directory : str
        The directory storing the raw prerank output files for each cell type and state.
    cell_types : CellTypeCollection
        The cell types to use for assembling the results.
    outdir : str
        The output directory to store the assembled results. 
        If not specified, the results will be stored in the same directory as the raw prerank results.
    remove_raw : bool
        If True, the raw prerank results will be removed after assembling.
    """
    if outdir is None:
        outdir = directory
    for cell_type in cell_types:

        # get the corresponding prerank files (regular files only)
        files = [
                    path for path in sorted( glob.glob( os.path.join( directory, cell_type + "_*" ) ) )
                    if os.path.isfile( path )
                ]

        # nothing to assemble for this cell type (concatenating an empty list would raise)
        if not files:
            continue

        # get the states associated with each file by splitting through <cell_type>_<state>.txt.prerank.txt
        # (the text between the last underscore and the first dot that follows it)
        states = [ i[ i.rfind( "_" ) + 1 : ].split(".")[0] for i in files ]

        # now read the dataframes and add the state
        dfs = [ pd.read_csv( file, sep = "\t" ) for file in files ]
        for df, state in zip(dfs, states): 
            df.insert( 0, settings.state_col, state ) 

        # concatenate and save the final dataframe
        final = pd.concat( dfs )
        outfile = os.path.join( outdir, f"{cell_type}{settings.prerank_results_suffix}" )
        final.to_csv( outfile, sep = "\t", index = False )

        if remove_raw:
            # delete file by file instead of building a shell command,
            # so paths with spaces or special characters are handled correctly
            for file in files:
                os.remove( file )

# Assemble the prerank results per cell type.
# NOTE(review): this relies on `cell_types` being the CellTypeCollection created in the
# enrichr assembly cell. The recorded TypeError below ("'module' object is not iterable")
# suggests that at execution time the name referred to a module instead, possibly one
# brought in by the `eco_helper.core` star import -- confirm and define it before this call.
assemble_prerank_results( directory, cell_types )

TypeError: 'module' object is not iterable