# Import Packages

In [1]:
# built-in
from ast import literal_eval
import os.path

# third-party (pip install required)
import numpy as np
import pandas as pd
from pymodulon.compare import compare_ica
from pymodulon.core import IcaData
from pymodulon.io import load_json_model, save_to_json
from pymodulon.util import explained_variance
from tqdm.notebook import tqdm

In [2]:
# Enter the location of your data here.
# Each value is a directory; file names are appended to it with os.path.join in later cells.
# PRECISE-1K directory (metadata_qc.csv and precise1k.json.gz are read from here)
P1K_PATH = '../../data/precise1k/'
# K-12 Modulome directory (M.csv, A.csv and metadata are read from here; outputs are written here)
K12_PATH = '../../data/k12_modulome/'
# Annotation directory (gene_info.csv and TRN.csv are read from here)
ANNOTATION_PATH = '../../data/annotation'

# Merge Metadata

The sample metadata from PRECISE-1K and the K-12 Modulome still need to be combined into a single table.

In [3]:
# PRECISE-1K metadata is used as-is (first CSV column becomes the index)
p1k_metadata = pd.read_csv(os.path.join(P1K_PATH, 'metadata_qc.csv'), index_col=0)

# K-12 metadata is loaded, cut down to the columns of interest, and relabelled so
# that its column names line up with the ones used by the PRECISE-1K table
k12_metadata = (
    pd.read_csv(os.path.join(K12_PATH, 'metadata_qc.csv'), index_col=0)
    .loc[:, [
        'Run', 'ReleaseDate', 'LibraryLayout', 'Platform', 'Model', 'BioProject', 'BioSample',
        'ScientificName', 'passed_fastqc', 'passed_reads_mapped_to_CDS',
        'passed_global_correlation', 'GEO Sample', 'GEO Series', 'PMID', 'additional_notes',
        'aerobicity', 'approx_OD', 'base_media', 'carbon_source', 'condition', 'culture_type',
        'dilution_rate', 'full_name', 'growth_phase', 'nitrogen_source', 'pH', 'project',
        'reference_condition', 'strain_description', 'supplement', 'temperature', 'time',
        'passed_replicate_correlations',
    ]]
    .rename(columns={
        'ReleaseDate': 'run_date',
        'Model': 'Sequencing Machine',
        'GEO Series': 'GEO',
        'additional_notes': 'Additional Details',
        'base_media': 'Base Media',
        'carbon_source': 'Carbon Source (g/L)',
        'culture_type': 'Culture Type',
        'nitrogen_source': 'Nitrogen Source (g/L)',
        'temperature': 'Temperature (C)',
        'strain_description': 'Strain Description',
        'supplement': 'Supplement',
    })
)

# Running tally of how many rows have been seen for each "<project>__<condition>" key
current_count = {}

def rep_id(row):
    """Return the 1-based replicate number of ``row`` within its project/condition.

    The number is the count of calls made so far (including this one) with the
    same ``row['project']`` / ``row['condition']`` pair, tracked in the
    module-level ``current_count`` dict, so the result depends on call order.
    """
    key = f"{row['project']}__{row['condition']}"
    current_count[key] = current_count.get(key, 0) + 1
    return current_count[key]

def sample_id(row):
    """Build the sample identifier ``<project>__<condition>__<rep_id>`` for ``row``.

    ``row`` must provide the keys ``'project'``, ``'condition'`` and ``'rep_id'``.
    """
    project = row['project']
    condition = row['condition']
    replicate = row['rep_id']
    return '__'.join(f'{part}' for part in (project, condition, replicate))

# Number the replicates first, then derive sample_id (which reads the new rep_id column)
k12_metadata = (
    k12_metadata
    .assign(rep_id=lambda frame: frame.apply(rep_id, axis=1))
    .assign(sample_id=lambda frame: frame.apply(sample_id, axis=1))
)

# Stack the two metadata tables row-wise, PRECISE-1K rows first
merged_metadata = pd.concat([p1k_metadata, k12_metadata])

In [4]:
# Write the combined metadata into the K-12 directory; this file is passed as
# sample_table when the IcaData object is created below
merged_metadata.to_csv(os.path.join(K12_PATH, 'metadata_qc_with_p1k.csv'))

# Create IcaData

In [5]:
# Build the IcaData object from the K-12 M and A matrices, the gene annotation
# table, the merged sample metadata (metadata_qc_with_p1k.csv) and the TRN file.
# NOTE(review): optimize_cutoff=True presumably asks pymodulon to tune its
# gene-weight thresholds using the TRN — confirm against the pymodulon docs.
ica_data = IcaData(
    M=os.path.join(K12_PATH,'M.csv'),
    A=os.path.join(K12_PATH,'A.csv'),
    gene_table=os.path.join(ANNOTATION_PATH,'gene_info.csv'),
    sample_table=os.path.join(K12_PATH,'metadata_qc_with_p1k.csv'),
    trn=os.path.join(ANNOTATION_PATH,'TRN.csv'),
    optimize_cutoff=True
)



# Explained Variance

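Computing the explained variance requires the X matrix, so it is temporarily attached to the `ica_data` object here and cleared again once the values have been stored.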

In [6]:
# Attach the X matrix read from log_tpm_norm_with_p1k_ctrl.csv (first CSV column is the index)
ica_data.X = pd.read_csv(os.path.join(K12_PATH, 'log_tpm_norm_with_p1k_ctrl.csv'), index_col=0)

In [7]:
# One explained_variance call per iModulon (each with a single-element imodulons
# list), collected in ica_data.imodulon_names order
exp_vars = [explained_variance(ica_data, imodulons=[imod]) for imod in ica_data.imodulon_names]

In [8]:
# Store the values positionally as a new 'exp_var' column.
# NOTE(review): assumes the imodulon_table rows are in imodulon_names order — confirm.
ica_data.imodulon_table['exp_var'] = exp_vars

In [9]:
# Drop the temporarily attached matrix by resetting the private _x attribute.
# NOTE(review): presumably _x is the backing field of ica_data.X and clearing it
# keeps the matrix out of the saved object — confirm against pymodulon.
ica_data._x = None

# iModulon Size

In [10]:
# Size of an iModulon = number of rows returned by view_imodulon for it
ica_data.imodulon_table['imodulon_size'] = [
    len(ica_data.view_imodulon(name)) for name in ica_data.imodulon_names
]

# Previous Dataset Comparison

## Load Previous Datasets

In [11]:
# Load the three previously published models that the new decomposition is compared against.
# Note: only the PRECISE-1K path is built from a config constant (P1K_PATH);
# the PRECISE and PRECISE 2.0 paths are hard-coded here.
precise1 = load_json_model('../../data/precise/precise.json.gz')
precise2 = load_json_model('../../data/precise2/precise2.json.gz')
precise1k = load_json_model(os.path.join(P1K_PATH, 'precise1k.json.gz'))

## Run Correlation Comparisons

In [12]:
# Compare the M matrix of every previous dataset against the new M matrix with
# both correlation methods and collect one record per reported match.
# The row order is the same as running the six comparisons one after another:
# dataset by dataset, pearson before spearman.
previous_datasets = [
    ('PRECISE', precise1),
    ('PRECISE 2.0', precise2),
    ('PRECISE-1K', precise1k),
]
correlation_methods = ['pearson', 'spearman']

match_rows = []
for dataset_name, previous_data in previous_datasets:
    for corr_method in correlation_methods:
        # Element 0 of compare_ica's return value is the collection of matches;
        # each match is indexed as (previous iModulon, new iModulon, correlation)
        matches = compare_ica(previous_data.M, ica_data.M, method=corr_method)[0]
        match_rows.extend(
            {
                'dataset': dataset_name,
                'iM': match[0],
                # despite the key name, this is the iModulon from ica_data
                'p1k_im': match[1],
                'method': corr_method,
                'corr': match[2],
            }
            for match in matches
        )

# Explicit columns keep the frame well-formed (and filterable on 'p1k_im')
# even if no comparison reported a match
match_df = pd.DataFrame(match_rows, columns=['dataset', 'iM', 'p1k_im', 'method', 'corr'])

In [13]:
# For every iModulon, record its best-matching iModulon in each previous dataset.
# Iterate over a snapshot of the index rather than iterrows(): the loop adds
# columns to imodulon_table, and the row contents are not needed.
for im in list(ica_data.imodulon_table.index):
    im_match = match_df[match_df['p1k_im'] == im]
    if im_match.empty:
        continue
    for dataset, dataset_match_df in im_match.groupby('dataset'):
        # The previous-dataset iModulon with the single highest correlation
        # (across both methods) wins; idxmax returns the row label, which is
        # unique because match_df is built with a default integer index.
        best_label = dataset_match_df['corr'].idxmax()
        max_corr_im = dataset_match_df.loc[best_label, 'iM']
        max_corr_im_df = dataset_match_df[dataset_match_df['iM'] == max_corr_im]

        # Column "<dataset>" holds the matched name; "<dataset>_<method>" holds
        # the correlation for each method that reported this pair
        ica_data.imodulon_table.loc[im, dataset] = max_corr_im
        for corr_method, corr in zip(max_corr_im_df['method'], max_corr_im_df['corr']):
            ica_data.imodulon_table.loc[im, f"{dataset}_{corr_method}"] = corr

## Re-run TRN Enrichment per P1K Parameters

In [14]:
# Annotation columns that are copied verbatim from the matched PRECISE-1K
# iModulon, regardless of how (or whether) its enrichment is recomputed
always_copy_cols = [
    'enrichment_category', 'system_category', 'functional_category', 'function', 'confidence',
    'note', 'single_gene_dominant_technical', 'tcs', 'regulon_discovery', 'ko'
]

# For every iModulon with a PRECISE-1K match: copy the annotations, then redo the
# regulator enrichment on ica_data using the parameters stored in the PRECISE-1K row.
# iModulons without a match are skipped entirely (and print no progress line).
for i, (im, im_row) in enumerate(ica_data.imodulon_table.iterrows()):
    if pd.notna(im_row['PRECISE-1K']):
        p1k_im_row = precise1k.imodulon_table.loc[im_row['PRECISE-1K']]
        print(f"({i+1}/{ica_data.imodulon_table.shape[0]}) {im} -> {im_row['PRECISE-1K']}")
        ica_data.imodulon_table.loc[im, always_copy_cols] = precise1k.imodulon_table.loc[im_row['PRECISE-1K'], always_copy_cols]
        # Case 1: PRECISE-1K recorded compute_trn_enrichment parameters -> reuse them.
        # The evidence value is stored as the string form of a Python literal,
        # hence literal_eval.
        if pd.notna(p1k_im_row['trn_enrich_max_regs']):
            enrich_res = ica_data.compute_trn_enrichment(im, max_regs=int(p1k_im_row['trn_enrich_max_regs']),
                                            evidence=literal_eval(p1k_im_row['trn_enrich_evidence']), force=True,
                                            method=p1k_im_row['trn_enrich_method'])
            # Keep only the result for the regulator assigned in PRECISE-1K; if it is
            # absent, move on (the annotation columns above have already been copied,
            # but no regulator/statistics columns are written for this iModulon)
            enrich_res_with_reg = enrich_res[enrich_res['regulator'] == p1k_im_row['regulator']]
            if enrich_res_with_reg.empty:
                continue
            # Write the statistics from the first matching enrichment row
            ica_data.imodulon_table.loc[
                im,
                ['regulator', 'pvalue', 'qvalue', 'precision', 'recall', 'f1score', 'TP', 'regulon_size', 'n_regs']
            ] = enrich_res_with_reg.iloc[0][[
                'regulator', 'pvalue', 'qvalue', 'precision', 'recall', 'f1score', 'TP', 'regulon_size', 'n_regs'
            ]]
            # Record the parameters that were used (stored in their original, unparsed form)
            ica_data.imodulon_table.loc[
                im,
                [
                    'trn_enrich_max_regs',
                    'trn_enrich_evidence',
                    'trn_enrich_method'
                ]
            ] = [
                p1k_im_row['trn_enrich_max_regs'],
                p1k_im_row['trn_enrich_evidence'],
                p1k_im_row['trn_enrich_method']
            ]
        # Case 2: PRECISE-1K recorded compute_regulon_enrichment evidence instead ->
        # recompute the enrichment against the PRECISE-1K regulator directly
        elif pd.notna(p1k_im_row['compute_regulon_evidence']):
            enrich_res = ica_data.compute_regulon_enrichment(im, p1k_im_row['regulator'], evidence=literal_eval(p1k_im_row['compute_regulon_evidence']))
            ica_data.imodulon_table.loc[
                im,
                ['pvalue', 'precision', 'recall', 'f1score', 'TP', 'regulon_size', 'n_regs']
            ] = enrich_res[
                ['pvalue', 'precision', 'recall', 'f1score', 'TP', 'regulon_size', 'n_regs']
            ]
            ica_data.imodulon_table.loc[im, 'regulator'] = p1k_im_row['regulator']
            ica_data.imodulon_table.loc[im, 'compute_regulon_evidence'] = p1k_im_row['compute_regulon_evidence']

(1/194) 0 -> Phage Shock
(2/194) 1 -> Magnesium
(3/194) 2 -> Tyr/Trp/Phe
(4/194) 3 -> Maltose
(5/194) 4 -> UC-1
(6/194) 5 -> Microaerobic
(7/194) 6 -> Cysteine-1
(8/194) 7 -> Curli-1
(9/194) 8 -> Xylose
(10/194) 9 -> cydB/appC KO
(11/194) 10 -> Glycolate
(12/194) 11 -> Lysine/T2SS
(13/194) 12 -> ykgR
(14/194) 13 -> Thiamine-1
(15/194) 14 -> Cra
(16/194) 15 -> Methionine
(17/194) 16 -> ytiC
(18/194) 17 -> FucR/AllR/AraC
(19/194) 18 -> DNA Damage
(20/194) 19 -> Molybdenum
(21/194) 20 -> Nitrate/Nitrite
(22/194) 21 -> BasR
(23/194) 22 -> GadXW
(25/194) 24 -> IS1
(26/194) 25 -> Rhamnose
(27/194) 26 -> Dipeptide
(28/194) 27 -> ROS TALE Del-1
(29/194) 28 -> Cytochrome c
(30/194) 29 -> Pyrimidine
(31/194) 30 -> Membrane
(32/194) 31 -> DhaR
(33/194) 32 -> GlcNAc
(34/194) 33 -> Tryptophanase
(35/194) 34 -> ROS TALE Del-2
(36/194) 35 -> crp KO-1
(37/194) 36 -> Fnr-1
(38/194) 37 -> cyoB/kdpE/qseB KO
(39/194) 38 -> Arginine
(40/194) 39 -> Fimbriae
(41/194) 40 -> PaaX
(42/194) 41 -> FlhDC-2
(43/194

# Rename iModulons

In [15]:
# Step 1: every iModulon with a PRECISE-1K match takes over the PRECISE-1K name
rename_dict = {
    im: p1k_name
    for im, p1k_name in ica_data.imodulon_table['PRECISE-1K'].items()
    if pd.notna(p1k_name)
}
ica_data.rename_imodulons(rename_dict)

# Step 2: any name that is still not a string is replaced by its str() form
rename_dict2 = {
    im: str(im)
    for im in ica_data.imodulon_names
    if not isinstance(im, str)
}
ica_data.rename_imodulons(rename_dict2)



# Re-Save M/A Matrices and iModulon Table

In [16]:
# Write the annotated iModulon table and the A/M matrices back to disk.
# Note: A.csv and M.csv are the same paths the IcaData object was loaded from,
# so the original input files are overwritten by this cell.
ica_data.imodulon_table.to_csv(os.path.join(K12_PATH,'imodulon_table.csv'))
ica_data.A.to_csv(os.path.join(K12_PATH,'A.csv'))
ica_data.M.to_csv(os.path.join(K12_PATH,'M.csv'))

# Save IcaData Object

This will save your iModulon table, your thresholds, and any other information stored in the ica_data object.

In [17]:
# Serialize the whole IcaData object under the base name 'k12_modulome' in K12_PATH.
# NOTE(review): compress=True presumably produces a gzip-compressed .json.gz file,
# like the models loaded earlier — confirm against pymodulon.io.save_to_json.
save_to_json(ica_data, os.path.join(K12_PATH,'k12_modulome'), compress=True)