In [1]:
from scipy.stats import hypergeom
import statsmodels.stats.multitest as multi
import pandas as pd
import os
import glob
import requests


In [2]:
# Differential-expression result tables, one tab-separated file per pairwise comparison.
# The comparison label is parsed from each filename later on
# (the text between 'burow_de_results_' and '.txt').
degfiles = [
    '../../../her7/10_gene_exp/gene_expression_jen_local/10_gene_exp/burow_de_results_her19_v_herm.txt',
    '../../../her7/10_gene_exp/gene_expression_jen_local/10_gene_exp/burow_de_results_male_v_her19.txt',
    '../../../her7/10_gene_exp/gene_expression_jen_local/10_gene_exp/burow_de_results_male_v_herm.txt',
]

In [3]:
# Inputs
# KEGG module membership table: '<module>\t<KO>,<KO>,...' per line
keggfile = '../../../../dbs/kegg/KEGG_MODULE_to_KO_Jun-30-2024.txt'
# per-gene functional annotations (tab-separated, '#' lines are comments)
annfile = '../../../her7/02_functional_annotations/eggnog_annotations.txt'

# Output: tab-separated table of KEGG module enrichment results
outfile = '../../../her7/11_kegg_enrichment/functional_enrichment_results_KEGG_modules_wminexp.txt'

In [4]:
# --- enrichment test settings ---
# a module is tested for a DEG set only when the number of DEG-set genes
# assigned to it lies within [lowCount, highCount]
lowCount = 2
highCount = 10_000_000

# --- gene filters ---
# minimum baseMean for a gene to count as expressed
min_exp_thres = 3

# differential expression: padj must be below de_padj_thres and
# |log2FoldChange| must exceed de_lfc_thres (2**0.58 is roughly a 1.5-fold change)
de_padj_thres = 0.1
de_lfc_thres = 0.58

# Initialize the gene-set containers (expressed genes and per-comparison DEG sets)

In [5]:
# Containers filled while reading the DE result tables:
#   expset  - genes passing the expression filter in at least one comparison
#   degsets - '<comparison>_up' / '<comparison>_down' -> set of gene ids
expset = set()
degsets = {}

for degfile in degfiles:
    # comparison label = filename text between the fixed prefix and the extension
    comparison = degfile.split('burow_de_results_')[1].split('.txt')[0]
    up_key = comparison + '_up'
    down_key = comparison + '_down'

    df = pd.read_csv(degfile, sep='\t', index_col=0)
    df.index.name = 'gene_id'

    # Boolean row masks; the significance mask is shared by both directions
    is_expressed = df['baseMean'] >= min_exp_thres
    is_signif = df['padj'] < de_padj_thres

    exp_df = df[is_expressed]
    de_up_df = df[(df['log2FoldChange'] > de_lfc_thres) & is_signif]
    de_down_df = df[(df['log2FoldChange'] < -de_lfc_thres) & is_signif]

    # expressed genes accumulate across all comparisons
    expset.update(exp_df.index.tolist())

    # up-regulated genes for this comparison
    degsets[up_key] = set(de_up_df.index.tolist())

    # down-regulated genes; report the comparisons in which the tracked gene shows up
    degsets[down_key] = set()
    for gene in de_down_df.index.tolist():
        if gene == 'Ceric.05G026200':
            print(gene, comparison)
        degsets[down_key].add(gene)


Ceric.05G026200 male_v_her19
Ceric.05G026200 male_v_herm


In [6]:
# Number of Ceratopteris genes with baseMean >= min_exp_thres (3) in at least one
# of the comparisons (expset is the union over all DE result tables)
len(expset)

29839

In [7]:
# total genes in Ceratopteris: count the lines of a DE results table that do not
# contain 'baseMean' (i.e. everything except the header line)
# NOTE: `degfile` is the loop variable left over from the DEG-loading cell, so this
# reads the last entry of `degfiles`; presumably every table lists the same genes - TODO confirm
! grep -v baseMean {degfile} | wc -l

36857


In [8]:
# KO id -> set of KEGG modules containing it (filled when the module table is read)
isaDict = {}
# NOTE(review): keggDesc and keggOrder are not used in the cells visible here
keggDesc, keggOrder = {}, {}

# Per-set bookkeeping, keyed by 'total' (background) and by each DEG set name:
#   codeDict[set][module] -> genes of that set assigned to the module
#   lociDict[set]         -> genes of that set assigned to any module
codeDict = {'total': {}}
lociDict = {'total': set()}

print('Number of genes in each DEG set:')
for degset, members in degsets.items():
    print(degset, len(members))
    codeDict[degset] = {}
    lociDict[degset] = set()

Number of genes in each DEG set:
her19_v_herm_up 209
her19_v_herm_down 1982
male_v_her19_up 6222
male_v_her19_down 5568
male_v_herm_up 4868
male_v_herm_down 4885


# Load the KO-to-module mapping and the KEGG module descriptions

In [9]:
# --- KO -> module membership ------------------------------------------------
# Each usable line of keggfile is '<module>\t<KO>,<KO>,...'. Lines that do not
# split into exactly two tab-separated fields are ignored.
# The context manager closes the file even if parsing raises.
with open(keggfile) as fi:
    for line in fi:
        fields = line.rstrip().split('\t')
        if len(fields) != 2:
            continue

        mod, kos = fields
        for ko in kos.split(','):
            isaDict.setdefault(ko, set()).add(mod)

# --- module id -> description (KEGG REST API) -------------------------------
mod_list_url = 'http://rest.kegg.jp/list/module'
modDict = {}

# timeout: a stalled connection must not hang the notebook indefinitely.
# raise_for_status: without it an HTTP error body would be parsed as if it were
# the module list and silently leave modDict empty.
resp = requests.get(mod_list_url, timeout=60)
resp.raise_for_status()

for line in resp.text.split('\n'):
    col = line.split('\t')
    if len(col) != 2:
        # blank trailing line or anything that is not '<module>\t<description>'
        continue

    mod, modDesc = col
    modDict[mod] = modDesc

    # sanity check: show one known module so a successful download is visible
    if mod == 'M00371':
        print(mod,modDesc)
    


M00371 Castasterone biosynthesis, campesterol => castasterone


# Parse the KEGG KO annotations in the annotation file and assign genes to modules

In [10]:
# Optional peek at the first lines of the annotation file (disabled):
#! head -6 {annfile}

In [11]:
# Assign every expressed, KO-annotated gene to its KEGG modules and record the
# membership for the background ('total') and for each DEG set containing the gene.
# The context manager closes the file even if a malformed line raises.
with open(annfile) as fi:
    for line in fi:
        # skip comment lines, and blank lines (which would otherwise fail on the
        # locus parsing below)
        if not line.strip() or line.startswith('#'):
            continue

        col = line.rstrip().split('\t')
        # locus id = 'Ceric.' + second dot-separated field of the first column
        # (assumes ids look like 'Ceric.<locus>.<suffix>' - TODO confirm)
        locus = 'Ceric.' + col[0].split('.')[1]

        # skip if locus is not expressed
        if locus not in expset:
            continue

        # 12th column holds the comma-separated KO terms, '-' when there are none
        # (presumably the eggNOG KEGG_ko column - verify against the file header)
        keggcol = col[11].split(',')
        if keggcol == ['-']:
            continue

        # collect every module that contains at least one of the gene's KOs
        keggSet = set()
        for kegg in keggcol:
            kegg = kegg.split(':')[1]   # drop the prefix before ':' (e.g. 'ko:K00001' -> 'K00001')
            keggSet.update(isaDict.get(kegg, ()))

        for kegg in keggSet:
            # background membership
            codeDict['total'].setdefault(kegg, set()).add(locus)
            lociDict['total'].add(locus)

            # membership within each DEG set the gene belongs to
            for degset in degsets:
                if locus in degsets[degset]:
                    codeDict[degset].setdefault(kegg, set()).add(locus)
                    lociDict[degset].add(locus)

In [12]:
####################################
### Perform hypergeometric tests ###
####################################

# One row per (DEG set, KEGG module) test. Rows are collected in a plain list and
# turned into a DataFrame once at the end; appending with df.loc[len(df)] inside
# the loop re-allocates the frame on every row.
columns = ['set','kegg','x','N','n','M','pval','genelist','kegglist']
kegglist = ''   # placeholder column, always written empty
records = []

# M is the population size (ie no. genes total in any kegg category [skipping genes not expressed])
M = len(lociDict['total'])

for degset in degsets:
    # N is the sample size (ie no. genes in degset that fall in any kegg category)
    N = len(lociDict[degset])

    for kegg in codeDict[degset]:
        members = codeDict[degset][kegg]

        # x is the number of drawn "successes" (ie no. genes in degset and in kegg category)
        x = len(members)
        if x > highCount or x < lowCount:
            continue

        # sorted so the written gene list does not depend on set iteration order
        genelist = ', '.join(sorted(members))

        # n is the number of successes in the population (ie no. genes in kegg category [skipping genes not expressed])
        n = len(codeDict['total'][kegg])

        # P(X >= x) under the hypergeometric null; sf(k) is P(X > k), hence x-1
        # https://alexlenail.medium.com/understanding-and-implementing-the-hypergeometric-test-in-python-a7db688a7458
        # https://github.com/jdrudolph/goenrich
        pval = hypergeom.sf(x-1, M, n, N)

        # append the module description when KEGG provided one
        label = kegg + ' - ' + modDict[kegg] if kegg in modDict else kegg

        records.append([degset, label, x, N, n, M, pval, genelist, kegglist])

df = pd.DataFrame(records, columns=columns)


In [13]:
# Debug helper (disabled): genes assigned to module M00371 in the DEG set that
# `degset` last pointed to
#codeDict[degset]['M00371']

In [14]:
# ---------------------------------------------------------------------------
# Multiple-testing correction and output
# Benjamini-Hochberg FDR is applied to all tests at once (every DEG set pooled).
# Nothing is computed or written when no test was performed.
# ---------------------------------------------------------------------------
if len(df) > 0:
    mt_result = multi.multipletests(
        df['pval'].tolist(),
        alpha=0.05,
        method='fdr_bh',
        is_sorted=False,
        returnsorted=False,
    )
    # second element of the result holds the corrected p-values
    adjpval = mt_result[1]
    df['adjpval'] = adjpval

    # fraction of module genes in the background and in the DEG set
    df['totalfreq'] = df['n'] / df['M']
    df['seqfreq'] = df['x'] / df['N']

    # final column order of the written table
    column_order = ['set','kegg','x','N','seqfreq','n','M','totalfreq','pval','adjpval','genelist','kegglist']
    df = df[column_order]

    df.to_csv(outfile, sep='\t', index=False)
