In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pymodulon.core import IcaData
from pymodulon import example_data
from pymodulon.io import save_to_json, load_json_model
from pymodulon.enrichment import *

# Build the Yarrowia lipolytica IcaData object

### Create ICA object

In [None]:
# Read the ICA inputs from ../data/ICA_data: the M and A tables, the expression
# table, and the sample metadata (tab-separated).
M = pd.read_csv('../data/ICA_data/M.csv', index_col='gene_id')
A = pd.read_csv('../data/ICA_data/A.csv', index_col='Unnamed: 0')
tpm = pd.read_csv('../data/ICA_data/log_tpm_norm.csv', index_col='gene_id')
sample_table = pd.read_csv('../data/ICA_data/metadata.tsv', sep='\t', index_col='ID')
#gene_table = pd.read_csv('../data/Annotations/yarrowia_NCBI_table.csv',index_col='Locus tag')

# Strip the 'gene-' prefix from the row labels of both gene-indexed tables.
# Passing a plain list yields an unnamed index, as a direct `.index = [...]` does.
M = M.set_axis([label.replace('gene-', '') for label in M.index], axis=0)
tpm = tpm.set_axis([label.replace('gene-', '') for label in tpm.index], axis=0)

TRN = pd.read_csv('../data/Annotations/yarrowiaTRN.csv', index_col='Unnamed: 0')


In [None]:
'''
Generate gene table
'''

from pymodulon.gene_util import *

# Homologue map: rename the 'subject' column to 'locus_tag' so it can be joined
# onto the gene table, and keep only the columns listed below.
bbh_results = (
    pd.read_csv('../data/blast/bbh/homologue_maps.csv', index_col='Unnamed: 0')
    .rename(columns={'subject': 'locus_tag'})
    [['gene', 'locus_tag', 'PID', 'COV', 'BBH', 'sacc_gene_name', 'sacc_gene_product']]
)

# Gene table parsed from the W29 GFF file, indexed by locus_tag.
yl_gene_table = gff2pandas('../data/Sequences/W29.gff', index='locus_tag')


In [None]:
# Left-join the homologue columns onto the gene table (every gene-table row is kept),
# then make locus_tag the index again.
yl_gene_table = pd.merge(
    yl_gene_table,
    bbh_results,
    how='left',
    on='locus_tag',
).set_index('locus_tag')


In [None]:
# Display the gene table after the homologue merge.
yl_gene_table

### Get COG categories

In [None]:
from Bio import Entrez, SeqIO
import pandas as pd

# Contact e-mail set on the Bio.Entrez module; it applies to the Entrez requests made below.
Entrez.email = "kkrishnan@ucsd.edu"

def fetch_protein_fasta(accession):
    """
    Download one protein record in FASTA format through Entrez.efetch.

    Parameters:
      accession: identifier passed as `id` to Entrez.efetch (db="protein").

    Returns:
      The text read from the response, or None when the request or the read fails.
      Failures are reported with a warning instead of being dropped silently.
      KeyboardInterrupt / SystemExit are NOT swallowed, so a long download loop
      can still be interrupted.
    """
    import warnings

    try:
        handle = Entrez.efetch(db="protein", id=accession, rettype="fasta", retmode="text")
        try:
            return handle.read()
        finally:
            # Always release the underlying connection, even if read() fails.
            handle.close()
    except Exception as err:
        # Best-effort by design: one failed accession should not abort a batch.
        warnings.warn(f"Could not fetch protein {accession!r}: {err}")
        return None

# # Extract unique protein accessions from your dataframe
# protein_ids = yl_gene_table['ncbi_protein'].dropna().unique()

# # Download protein sequences to a file for eggNOG-mapper
# with open("yarrowia_proteins.faa", "w") as out_f:
#     for acc in protein_ids:
#         fasta = fetch_protein_fasta(acc)
#         if fasta:
#             out_f.write(fasta)


### Map COGs

In [None]:
# Load the eggNOG annotation table (the first 4 lines of the file are skipped),
# keep the annotation columns of interest, and rename '#query' to 'locus_tag'
# so the table can be joined onto the gene table.
cogs = (
    pd.read_csv('../data/eggnog/eggnog.tsv', sep='\t', skiprows=4)
    [['#query', 'COG_category', 'Description', 'Preferred_name', 'GOs',
      'PFAMs', 'KEGG_Pathway', 'KEGG_Module', 'BRITE']]
    .rename(columns={'#query': 'locus_tag'})
)

# Left-join the annotations and restore locus_tag as the index.
yl_gene_table = pd.merge(
    yl_gene_table,
    cogs,
    how='left',
    on='locus_tag',
).set_index('locus_tag')


### Map CLIB122 genes to W29 genes

In [None]:
from compare_genomes import reciprocal_best_hits

from Bio import SeqIO

def extract_proteins_from_gbff(gbff_path, output_fasta):
    """
    Write the translated CDS features of a GenBank file to a protein FASTA file.

    Parameters:
      gbff_path: path handed to SeqIO.parse with format "genbank".
      output_fasta: path of the FASTA file to create (overwritten if it exists).

    Only features of type "CDS" that carry a "translation" qualifier are written,
    one two-line record each (">id" then the sequence).

    The record id is the first available of locus_tag, protein_id, gene; "unknown"
    is used only when none of them is present. Falling back to other qualifiers
    keeps ids distinct for CDS features that lack a locus_tag, instead of emitting
    several records that all share the ">unknown" header.
    """
    # Qualifiers tried, in order, for the FASTA header.
    id_qualifiers = ("locus_tag", "protein_id", "gene")

    with open(output_fasta, "w") as out_f:
        for record in SeqIO.parse(gbff_path, "genbank"):
            for feature in record.features:
                qualifiers = feature.qualifiers
                if feature.type != "CDS" or "translation" not in qualifiers:
                    continue
                protein_seq = qualifiers["translation"][0]
                # qualifiers.get(key) is falsy for both a missing key and an empty list.
                gene_id = next(
                    (qualifiers[key][0] for key in id_qualifiers if qualifiers.get(key)),
                    "unknown",
                )
                out_f.write(f">{gene_id}\n{protein_seq}\n")

# Example usage:
#extract_proteins_from_gbff("../data/Sequences/CLIB122.gbff", "CLIB122_proteins.faa")
#extract_proteins_from_gbff("../data/Sequences/W29.gbff", "W29_proteins.faa")

#df = reciprocal_best_hits("genomeA_proteins.faa", "genomeB_proteins.faa")

In [None]:
# df = reciprocal_best_hits("CLIB122_proteins.faa", "W29_proteins.faa")

# df

In [None]:
#df.to_csv('cl_w29_protein_maps.tsv',sep='\t')

# Reload the saved protein map (tab-separated, first column used as the index)
# and display it.
df = pd.read_table('cl_w29_protein_maps.tsv', index_col=0)
df


Create a new TRN mapping



In [None]:
trn = pd.read_csv('../data/Annotations/yarrowiaTRN.csv',index_col='Unnamed: 0')

# Build the gene_a -> gene_b lookup once instead of scanning `df` twice per TRN row.
# keep='first' reproduces "first matching row wins" for a gene_a that occurs more than once.
w29_lookup = (
    df.dropna(subset=['gene_a'])
      .drop_duplicates(subset='gene_a', keep='first')
      .set_index('gene_a')['gene_b']
)

# TRN genes with no gene_a match receive the placeholder 'NA'.
# NOTE(review): all unmapped targets therefore share the single gene_id 'NA' in the
# remapped TRN; confirm downstream enrichment is meant to see them that way.
has_match = trn['gene_id'].isin(w29_lookup.index)
gene_map = trn['gene_id'].map(w29_lookup).where(has_match, 'NA').tolist()

# Replace the original gene ids by the mapped ones, keeping the column name 'gene_id'.
trn['W29_gene_map'] = gene_map
trn = trn[['regulator','W29_gene_map']].rename(columns={'W29_gene_map':'gene_id'})




In [None]:
# Display the expression table loaded from log_tpm_norm.csv.
tpm

### Create ICA data object

In [None]:
from pymodulon.core import IcaData

# Build the IcaData object from the M and A tables and the remapped TRN;
# threshold_method='dagostino' is passed straight through to IcaData.
ica_data = IcaData(M,A,trn=trn,threshold_method='dagostino')

# Attach the remaining tables after construction.
# NOTE(review): the assignment order is kept as written in case IcaData's setters
# validate against each other — confirm before reordering.
ica_data.gene_table = yl_gene_table
ica_data.X = tpm
ica_data.sample_table = sample_table


Perform regulon enrichments


In [None]:
from enrichment import *

# For every iModulon keep the first row returned by compute_trn_enrichment.
# NOTE(review): the first row is assumed to be the best-ranked hit — confirm the
# sort order of compute_trn_enrichment.
# The Series are collected in a dict and the table is assembled once afterwards;
# inserting columns into a DataFrame one at a time inside the loop fragments it.
top_hits = {}
sacc_enrich = []

for imod in ica_data.M.columns:
    enrichs = compute_trn_enrichment(
        ica_data.view_imodulon(imod).index.tolist(),
        ica_data.gene_names,
        trn
    )

    if enrichs.empty:
        # No enrichment for this iModulon: its column is added as NaN by the reindex below.
        sacc_enrich.append('N.A')          # or np.nan if you prefer
        continue

    ser = enrichs.iloc[0]                  # first row as a Series
    top_hits[imod] = ser                   # statistics -> column
    sacc_enrich.append(ser.name)           # row label -> list

# reindex restores one column per iModulon, in M's column order, NaN where no hit exists.
imod_table = pd.DataFrame(top_hits).reindex(columns=ica_data.M.columns)

# imod_table  -> columns = iModulon IDs, index = the fields of the enrichment row
# sacc_enrich -> list of first-row labels (or 'N.A' for empty enrichs), one per iModulon


In [None]:
# Append the first-row label collected for each iModulon as an extra 'Yeast_TF' row.
# NOTE(review): these labels are strings, so adding them as a row next to the numeric
# statistics presumably leaves every column as object dtype — confirm this is acceptable
# where the statistics are read back later.
imod_table.loc['Yeast_TF',:] = sacc_enrich

# Write the table as a tab-separated file in the current working directory.
imod_table.to_csv('yeast_enrichments.tsv',sep='\t')

In [None]:
# Independent snapshot of the enrichment table. The name `imod_table` is rebound to a
# different table in a later cell while this snapshot is still read afterwards, so take
# a deep copy: a shallow copy would keep sharing data with the original object.
enrichment_details = imod_table.copy(deep=True)

enrichment_details

In [None]:
def compute_row_threshold(row, percentile=99.75, min_count=100):
    """
    Compute an absolute-value cutoff T for one row of data.

    Rules:
      1. Let abs_vals = absolute values of `row`.
      2. Let T_perc = np.percentile(abs_vals, percentile).
      3. Count how many entries satisfy |value| > T_perc.
         - If that count is < min_count, return T_perc.
         - Otherwise, return the (min_count)-th largest absolute value, so that
           fewer than min_count entries lie strictly above the returned cutoff.

    Parameters:
      row: pandas Series or any array-like of numbers (flattened before use).
      percentile: percentile in [0, 100] forwarded to np.percentile.
      min_count: positive integer.

    Raises:
      ValueError: if `row` is empty or `min_count` is smaller than 1.
    """
    # np.asarray accepts a Series as well as plain lists / arrays.
    abs_vals = np.abs(np.asarray(row)).ravel()

    if abs_vals.size == 0:
        raise ValueError("row must contain at least one value")
    if min_count < 1:
        raise ValueError("min_count must be a positive integer")

    # 1) Percentile cutoff of the absolute values.
    T_perc = np.percentile(abs_vals, percentile)

    # 2) How many entries are strictly greater than that percentile?
    count_above = np.count_nonzero(abs_vals > T_perc)

    if count_above < min_count:
        return T_perc

    # count_above >= min_count implies at least min_count entries exist, so the
    # min_count-th largest value is always defined here. np.partition places the
    # element of sorted rank `kth` at position `kth` without a full sort.
    kth = abs_vals.size - min_count
    return np.partition(abs_vals, kth)[kth]
        
        
# One cutoff per column of M: applying the function down each column of M is the
# same as applying it across each row of M.T.
# NOTE(review): `thresholds` is rebound to a list in a later cell and is not
# assigned to ica_data here.
thresholds = ica_data.M.apply(compute_row_threshold, axis=0)

In [None]:
### Plot COG categories

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Tally, for every iModulon, the two most frequent COG_category values among its genes.
majority_cogs = []
second_majority_cogs = []
majority_counts = []
second_majority_counts = []

# Create a list of all imodulon IDs
all_imodulon_ids = range(ica_data.M.shape[1])

for imod_id in all_imodulon_ids:
    imod_df = ica_data.view_imodulon(imod_id)
    # Deliberately NOT named `cogs`: that name holds the eggNOG annotation table
    # loaded in an earlier cell, and overwriting it breaks re-running that merge.
    imod_cogs = imod_df['COG_category'].dropna()
    cog_counts = Counter(imod_cogs)

    # Pad with (None, 0) so iModulons with zero or one category need no special case.
    top_two = cog_counts.most_common(2) + [(None, 0)] * 2
    (first_cog, first_count), (second_cog, second_count) = top_two[:2]

    majority_cogs.append(first_cog)
    majority_counts.append(first_count)
    second_majority_cogs.append(second_cog)
    second_majority_counts.append(second_count)

# Assemble the table once, with the final values (no placeholder columns needed).
imodulon_cog_df = pd.DataFrame({
    'imodulon': list(all_imodulon_ids),
    'majority_cog': majority_cogs,
    'second_majority_cog': second_majority_cogs,
    'majority_count': majority_counts,
    'second_majority_count': second_majority_counts,
})
# Display the updated DataFrame for inspection
imodulon_cog_df[['imodulon', 'majority_cog', 'majority_count', 'second_majority_cog', 'second_majority_count']].head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare data
# Prepare data: order rows by numeric iModulon id and reset to a positional index,
# so the bar positions below never depend on the frame's original index labels.
plot_df = imodulon_cog_df.copy()
plot_df['imodulon'] = plot_df['imodulon'].astype(int)
plot_df = plot_df.sort_values('imodulon').reset_index(drop=True)

# Total gene count per iModulon: every member gene. Dividing by
# majority_count + second_majority_count instead would force the two bars of an
# iModulon to sum to 1, which is not a "fraction of genes".
plot_df['total_count'] = [
    ica_data.view_imodulon(imod_id).shape[0] for imod_id in plot_df['imodulon']
]
plot_df['imodulon'] = plot_df['imodulon'].astype(str)

# Compute fraction
plot_df['majority_fraction'] = plot_df['majority_count'] / plot_df['total_count']
plot_df['second_majority_fraction'] = plot_df['second_majority_count'] / plot_df['total_count']

# Get unique COGs from both columns and give each one a fixed colour
all_cogs = pd.unique(plot_df[['majority_cog', 'second_majority_cog']].values.ravel('K'))
all_cogs = [c for c in all_cogs if pd.notnull(c)]
cog_palette = dict(zip(all_cogs, sns.color_palette('tab20', len(all_cogs))))

fig, ax = plt.subplots(figsize=(18, 6))

x = range(len(plot_df))
bar_width = 0.4

# (COG column, fraction column, horizontal offset, legend prefix, extra bar styling).
# Majority bars are drawn first, then second-majority bars, which fixes the legend order.
bar_series = [
    ('majority_cog', 'majority_fraction', -bar_width / 2, 'Majority', {'alpha': 0.8}),
    ('second_majority_cog', 'second_majority_fraction', bar_width / 2, 'Second',
     {'alpha': 0.4, 'hatch': '//'}),
]
for cog_col, fraction_col, offset, label_prefix, bar_style in bar_series:
    # enumerate gives the bar position; the index label is intentionally ignored.
    for pos, (_, row) in enumerate(plot_df.iterrows()):
        cog = row[cog_col]
        if pd.notnull(cog):
            ax.bar(pos + offset, row[fraction_col], width=bar_width,
                   color=cog_palette[cog], label=f'{label_prefix}: {cog}', **bar_style)

# Remove duplicate legend entries (the first handle seen for each label is kept)
handles, labels = ax.get_legend_handles_labels()
unique = dict()
for h, l in zip(handles, labels):
    if l not in unique:
        unique[l] = h
ax.legend(unique.values(), unique.keys(), loc='upper right', bbox_to_anchor=(1.15, 1), fontsize='small')

ax.set_xticks(x)
ax.set_xticklabels(plot_df['imodulon'], rotation=90)
ax.set_xlabel('iModulon')
ax.set_ylabel('Fraction of Genes')
ax.set_title('Fraction of Genes Assigned to Majority and Second Majority COGs per iModulon')
ax.set_ylim(0, 1)
plt.tight_layout()
plt.show()

In [None]:
# Genes of iModulon 32 whose absolute gene_weight exceeds 0.05.
imod_32_genes = ica_data.view_imodulon(32)
imod_32_genes[imod_32_genes['gene_weight'].abs() > 0.05]

Rename iModulons and create imodulon table based on trn + COG enrichments

In [None]:
# Load the iModulon annotation table, indexed by its 'iModulon' column.
# This rebinds `imod_table`; the enrichment table built earlier stays available
# as `enrichment_details`.
imod_table = pd.read_csv('../data/Annotations/imodulon_table.csv',index_col='iModulon')

# Preview the first rows.
imod_table.head()

In [None]:
from pymodulon.util import explained_variance

# Per-iModulon summary values, computed for every label in imod_table's index:
# number of member genes, explained variance, and the threshold stored on ica_data.
ngenes = [ica_data.view_imodulon(name).shape[0] for name in imod_table.index]
exp_var = [explained_variance(ica_data, imodulons=name) for name in imod_table.index]
thresholds = [ica_data.thresholds[name] for name in imod_table.index]

# The lists are in index order, so they can be assigned as columns directly.
imod_table['ngenes'] = ngenes
imod_table['explained_variance'] = exp_var
imod_table['threshold'] = thresholds



In [None]:
# Copy each enrichment statistic (one row of enrichment_details) into a column of the
# iModulon table; pandas aligns the values on the iModulon labels.
for stat in ['pvalue', 'precision', 'recall', 'f1score', 'TP',
             'regulon_size', 'gene_set_size', 'qvalue', 'n_regs']:
    imod_table[stat] = enrichment_details.loc[stat]


imod_table




In [None]:
# Display the enrichment snapshot for comparison with the table above.
enrichment_details

In [None]:
# Store the assembled iModulon table on the IcaData object.
ica_data.imodulon_table = imod_table

### Save the ica_data object

In [None]:
# Write the IcaData object to a JSON file with pymodulon's save_to_json.
save_to_json(ica_data,'../data/yarrowia2025.json')

In [None]:
# Display the iModulon table as stored on the IcaData object.
ica_data.imodulon_table