In [1]:
from pathlib import Path
import sys
import os.path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import f_oneway, gaussian_kde, mannwhitneyu, pearsonr, spearmanr
import seaborn as sns
from tqdm.notebook import tqdm
from pymodulon.compare import compare_ica
from pymodulon.core import IcaData
from pymodulon.io import load_json_model, save_to_json
from pymodulon.plotting import *

# Apply seaborn's 'whitegrid' style to all plots drawn later in the notebook.
sns.set_style('whitegrid')

In [11]:
# Load the expression matrix X; the first CSV column becomes the row index.
# Per the displayed frame further down, rows are gene locus tags (b-numbers)
# and columns are sample IDs (p1k_*).
PRECISE_1K_X = pd.read_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X.csv', index_col=0)

In [28]:
# Load the round-1 A matrix (one row per component, one column per sample);
# the first CSV column becomes the row index.
PRECISE_1K_round_1_A = pd.read_csv('./First_run_100_400/A.csv', index_col=0)

In [29]:
# Load the round-1 M matrix (one row per gene, one column per component);
# the first CSV column becomes the row index.
PRECISE_1K_round_1_M = pd.read_csv('./First_run_100_400/M.csv', index_col=0)

In [30]:
# Sanity check: M should be (number of genes, number of components).
PRECISE_1K_round_1_M.shape

(4257, 138)

In [31]:
# Sanity check: A should be (number of components, number of samples);
# its first dimension must equal M's second for the product M·A to exist.
PRECISE_1K_round_1_A.shape

(138, 981)

In [39]:
# Reconstruct the expression matrix from the round-1 decomposition (M · A).
# The product is taken on the raw arrays, so the new frame starts out with
# default integer row and column labels.
PRECISE_1K_round_1_X = pd.DataFrame(
    PRECISE_1K_round_1_M.to_numpy() @ PRECISE_1K_round_1_A.to_numpy()
)

In [41]:
# The reconstruction was built from raw arrays, so it carries no gene/sample
# labels. Copying them from PRECISE_1K_X is only valid when M's rows and A's
# columns are in exactly the same order as X's rows and columns. Check that
# first, so a reordered input cannot silently mislabel genes or samples
# (every later step aligns on these labels).
if not PRECISE_1K_round_1_M.index.equals(PRECISE_1K_X.index):
    raise ValueError('Row labels of M do not match the index of PRECISE_1K_X')
if not PRECISE_1K_round_1_A.columns.equals(PRECISE_1K_X.columns):
    raise ValueError('Column labels of A do not match the columns of PRECISE_1K_X')

PRECISE_1K_round_1_X.index = PRECISE_1K_X.index
PRECISE_1K_round_1_X.columns = PRECISE_1K_X.columns

In [42]:
# Display the labelled round-1 reconstruction for visual inspection.
PRECISE_1K_round_1_X

Unnamed: 0,p1k_00001,p1k_00002,p1k_00003,p1k_00004,p1k_00005,p1k_00006,p1k_00007,p1k_00008,p1k_00009,p1k_00010,...,p1k_01046,p1k_01047,p1k_01048,p1k_01049,p1k_01050,p1k_01051,p1k_01052,p1k_01053,p1k_01054,p1k_01055
b0002,-0.027273,0.027273,1.564507,1.653390,-0.287961,-0.380383,1.417525,1.045576,-0.159997,-0.091815,...,-0.345989,-0.393756,-0.572686,-0.535354,-0.824235,-0.758202,0.902402,1.134798,0.311306,0.654154
b0003,-0.061034,0.061034,1.281678,1.305206,0.038670,-0.015367,1.095235,0.971011,0.449889,0.438837,...,-0.942800,-1.107614,-1.283457,-1.266260,-0.932614,-1.106618,-0.335626,-0.356554,-0.512918,-0.532777
b0004,-0.051774,0.051774,1.265402,1.299039,0.093675,0.076650,1.123239,0.992066,0.324929,0.306852,...,-0.467706,-0.718050,-0.794174,-0.923241,-0.551854,-0.688924,0.215852,0.271678,0.039320,-0.077632
b0005,0.062439,-0.062439,-0.581828,-0.601353,-0.303133,-0.283822,-0.565018,-0.678933,-0.216678,-0.290265,...,-1.641739,-2.377659,-2.365223,-2.112472,-1.359921,-1.983520,-1.118181,-1.531733,-1.641081,-1.551894
b0006,-0.002532,0.002532,0.058648,0.028509,0.182000,0.219244,-0.220072,-0.201030,0.172858,0.164312,...,-0.084561,-0.498233,-0.335719,-0.505013,-0.622039,-0.445848,-0.415734,-1.224948,-1.360534,-1.196386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b4747,0.094248,-0.094248,0.450694,0.406225,0.212918,0.285921,0.891436,0.650269,0.660096,0.674446,...,1.164704,1.043154,0.931296,1.191928,1.678528,1.441241,1.121452,1.219583,1.386652,1.455498
b4748,-0.024745,0.024745,-1.294694,-1.145757,0.111523,0.145740,-1.020024,-0.967290,-0.745186,-0.750796,...,2.667818,1.928184,1.902512,1.561423,2.111489,2.228372,2.914478,1.139789,1.463314,1.551137
b4751,0.026774,-0.026774,0.166797,0.405473,-0.213673,-0.024479,-0.260607,-0.022577,0.160923,0.089919,...,-0.362010,0.161287,-0.114691,0.200970,0.402051,0.161794,-1.431401,-0.283186,0.200778,0.069117
b4755,0.025545,-0.025545,1.050473,1.081670,-0.108489,-0.061924,1.250258,1.057307,0.484142,0.597362,...,1.141307,1.111832,1.529429,0.582942,0.808067,1.805616,0.566726,1.543155,1.073590,1.014377


In [15]:
# Display the original expression matrix for comparison with the reconstruction.
PRECISE_1K_X

Unnamed: 0,p1k_00001,p1k_00002,p1k_00003,p1k_00004,p1k_00005,p1k_00006,p1k_00007,p1k_00008,p1k_00009,p1k_00010,...,p1k_01046,p1k_01047,p1k_01048,p1k_01049,p1k_01050,p1k_01051,p1k_01052,p1k_01053,p1k_01054,p1k_01055
b0002,-0.053993,0.053993,0.879043,1.089600,0.182453,-0.078873,-0.837092,-0.502703,0.421312,0.412354,...,-0.868029,-2.012060,-1.862453,-1.950586,-1.642161,-1.666869,-0.639387,-1.552748,-1.892598,-1.831359
b0003,-0.061973,0.061973,1.063330,1.477889,0.392573,0.234671,-0.763015,-0.120989,0.546181,0.520841,...,-1.501588,-2.532496,-2.515189,-2.443481,-2.258418,-2.312108,-1.124294,-2.060022,-2.348524,-2.122772
b0004,-0.036972,0.036972,0.003132,0.356702,-0.096756,-0.334913,-1.714913,-1.322707,-0.234147,-0.250669,...,-0.691635,-1.692102,-1.608955,-1.723764,-1.376420,-1.251488,-0.529185,-1.577833,-1.383521,-1.527079
b0005,-0.104967,0.104967,-1.118145,-1.277963,-0.181546,-0.364057,-2.121475,-1.994770,-0.386684,-0.634659,...,-2.925632,-4.626965,-4.067417,-2.964371,-3.075562,-3.101967,-2.210546,-3.821019,-4.563928,-2.946659
b0006,0.042242,-0.042242,-0.123592,-0.107778,-0.173177,-0.089888,-0.208133,-0.003044,-0.034707,-0.027806,...,0.190536,-0.618740,-0.175381,-0.317349,-0.466331,-0.169979,-0.402588,-0.915991,-0.804728,-0.858978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b4747,0.619141,-0.619141,-0.619141,-0.619141,0.202715,-0.619141,0.734510,-0.619141,0.790289,0.284033,...,-0.619141,0.766947,-0.619141,-0.619141,-0.619141,-0.619141,-0.619141,-0.619141,-0.619141,1.164489
b4748,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.421363,0.000000,0.000000,0.000000,...,5.441998,5.734377,5.614587,5.648036,5.957326,6.657140,3.672372,4.138619,5.809142,4.465101
b4751,-0.039552,0.039552,-1.290021,-1.298327,-0.481621,-0.609865,-1.680827,-1.239934,-0.134430,-0.363082,...,0.274369,-0.356653,-0.371515,-0.064824,-0.504051,0.323457,-1.267812,-0.534742,0.039449,-0.051373
b4755,-0.235166,0.235166,2.621649,3.256277,-0.302443,-0.096198,2.518406,3.268205,1.379971,0.910647,...,-0.633352,-0.039091,-0.136657,0.734843,1.699093,-2.399592,-2.399592,3.413940,1.914515,1.564518


In [35]:
def ICA_explained_variance_each_component(
    M_matrix, A_matrix, log_tpm, genes=None, samples=None, imodulons=None
):
    """Return the incremental explained variance of each ICA component.

    Components are added one at a time, in the order given by ``imodulons``,
    to a running reconstruction ``sum_k M[:, k] * A[k, :]``. After each
    addition the fraction of variance recovered is
    ``1 - ||reconstruction - baseline||^2 / ||baseline||^2``, where
    ``baseline`` is ``log_tpm`` with each sample's (column's) mean over genes
    subtracted. The value reported for a component is the increase of that
    fraction caused by adding it, so the reported values sum to the total
    fraction explained by all selected components. Because the increments are
    cumulative, they can depend on the order of ``imodulons``.

    Parameters
    ----------
    M_matrix : pandas.DataFrame
        Gene weights; indexed by gene label, one column per component.
    A_matrix : pandas.DataFrame
        Component activities; indexed by component label, one column per
        sample. Component labels must match ``M_matrix.columns`` (same values
        and type), since both are looked up with ``.loc``.
    log_tpm : pandas.DataFrame
        Expression matrix, genes as rows and samples as columns.
    genes : str or list-like, optional
        Gene label(s) to restrict to. Defaults to ``log_tpm.index``.
    samples : str or list-like, optional
        Sample label(s) to restrict to. Defaults to ``log_tpm.columns``.
    imodulons : str, int or list-like, optional
        Component label(s) to evaluate, in order. Defaults to
        ``M_matrix.columns``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'iModulon'`` and ``'Explained Variance'``, one row per
        component in the order supplied (not sorted by explained variance).

    Notes
    -----
    If the centered data has zero norm, the fractions are undefined
    (division by zero).
    """
    # Resolve selectors: default to everything, wrap a single label in a list.
    if genes is None:
        genes = log_tpm.index
    elif isinstance(genes, str):
        genes = [genes]

    if samples is None:
        samples = log_tpm.columns
    elif isinstance(samples, str):
        samples = [samples]

    if imodulons is None:
        imodulons = M_matrix.columns
    elif isinstance(imodulons, (str, int)):
        imodulons = [imodulons]

    # Account for normalization procedures before ICA (X=SA-x_mean):
    # per-sample means are taken over ALL genes before subsetting.
    baseline = log_tpm.subtract(log_tpm.mean(axis=0), axis=1)
    baseline = baseline.loc[genes, samples]

    # Total (squared Frobenius) variation that the components try to recover.
    base_err = np.linalg.norm(baseline) ** 2

    reconstruction = np.zeros(baseline.shape)
    previous_fraction = 0
    explained_variance = []

    # Build each rank-1 contribution on the fly instead of caching all of them:
    # only one genes x samples array is alive at a time.
    for k in imodulons:
        reconstruction = reconstruction + np.outer(
            M_matrix.loc[genes, k].values,
            A_matrix.loc[k, samples].values,
        )
        residual_err = np.linalg.norm(reconstruction - baseline) ** 2
        fraction = 1 - residual_err / base_err
        explained_variance.append((k, fraction - previous_fraction))
        previous_fraction = fraction

    # Keep the caller's component order; do not sort by explained variance.
    return pd.DataFrame(
        explained_variance, columns=['iModulon', 'Explained Variance']
    )

In [37]:
# Cast A's component labels to strings. The explained-variance function takes
# component labels from M_matrix.columns and looks them up in A_matrix with
# .loc, so both must have the same type. Presumably M's columns were read as
# strings from the CSV header while A's index was parsed as integers —
# TODO confirm. This cell must run before the explained-variance call.
PRECISE_1K_round_1_A.index = PRECISE_1K_round_1_A.index.astype('str')

In [38]:
# Incremental explained variance of every round-1 component, evaluated
# against the original expression matrix.
ICA_explained_variance = ICA_explained_variance_each_component(
    M_matrix=PRECISE_1K_round_1_M,
    A_matrix=PRECISE_1K_round_1_A,
    log_tpm=PRECISE_1K_X,
)
ICA_explained_variance

Unnamed: 0,iModulon,Explained Variance
0,0,0.005008
1,1,0.002744
2,2,0.005003
3,3,0.001608
4,4,0.019949
...,...,...
133,153,0.002990
134,155,0.000874
135,156,0.001429
136,157,0.002933


In [43]:
# Total fraction of variance explained by all round-1 components together
# (the per-component increments add up to the final cumulative fraction).
ICA_explained_variance['Explained Variance'].sum()

0.6602858204981917

In [45]:
# Residual expression left after removing the round-1 reconstruction.
# The subtraction aligns on row and column labels.
PRECISE_1K_round_2_for_ICA = PRECISE_1K_X.sub(PRECISE_1K_round_1_X)

In [46]:
# Write the residual matrix to disk (row index included); presumably this is
# the input for a second ICA round — verify against the downstream pipeline.
PRECISE_1K_round_2_for_ICA.to_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X_round_2.csv')