In [1]:
from pathlib import Path
import sys
import os.path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import f_oneway, gaussian_kde, mannwhitneyu, pearsonr, spearmanr
import seaborn as sns
from tqdm.notebook import tqdm
from pymodulon.compare import compare_ica
from pymodulon.core import IcaData
from pymodulon.io import load_json_model, save_to_json
from pymodulon.plotting import *

# Apply seaborn's 'whitegrid' plotting style.
sns.set_style('whitegrid')

In [9]:
# Load the P1K (minicoli removed) X matrix from CSV; the first CSV column
# becomes the row index.
PRECISE_1K_X = pd.read_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X.csv', index_col=0)

In [10]:
# Load the round-3 X matrix (first CSV column -> row index). It is the
# `log_tpm` argument of the round-3 explained-variance call and the matrix the
# round-3 reconstruction is subtracted from to build the round-4 input.
PRECISE_1K_round_3_X = pd.read_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X_round_3.csv', index_col=0)

In [11]:
# Load A.csv from ./Third_run_60_180 (first CSV column -> row index).
# Presumably the ICA activity matrix (components x samples) -- TODO confirm.
PRECISE_1K_round_3_A = pd.read_csv('./Third_run_60_180/A.csv', index_col=0)

In [12]:
# Load M.csv from ./Third_run_60_180 (first CSV column -> row index).
# Presumably the ICA gene-weight matrix (genes x components) -- TODO confirm.
PRECISE_1K_round_3_M = pd.read_csv('./Third_run_60_180/M.csv', index_col=0)

In [13]:
# Inspect the dimensions of M (rows, columns).
PRECISE_1K_round_3_M.shape

(4257, 15)

In [14]:
# Inspect the dimensions of A (rows, columns); its row count must equal M's
# column count for the matrix product in the next cell.
PRECISE_1K_round_3_A.shape

(15, 981)

In [15]:
# Reconstruct the part of the data captured by the round-3 decomposition as the
# matrix product M . A. The product is taken on plain arrays, so it is purely
# positional and the resulting frame carries default integer labels.
PRECISE_1K_round_4_X = pd.DataFrame(
    PRECISE_1K_round_3_M.to_numpy() @ PRECISE_1K_round_3_A.to_numpy()
)

In [16]:
# Label the reconstruction with the labels of its own factors: row i of M . A
# belongs to row i of M, and column j belongs to column j of A. Copying labels
# positionally from another frame is only correct if that frame happens to
# share the exact row/column order, and a mismatch would silently corrupt the
# label-aligned subtraction performed later in the notebook.
PRECISE_1K_round_4_X.index = PRECISE_1K_round_3_M.index
PRECISE_1K_round_4_X.columns = PRECISE_1K_round_3_A.columns

In [17]:
# Display the labelled round-3 reconstruction (M . A).
PRECISE_1K_round_4_X

Unnamed: 0,p1k_00001,p1k_00002,p1k_00003,p1k_00004,p1k_00005,p1k_00006,p1k_00007,p1k_00008,p1k_00009,p1k_00010,...,p1k_01046,p1k_01047,p1k_01048,p1k_01049,p1k_01050,p1k_01051,p1k_01052,p1k_01053,p1k_01054,p1k_01055
b0002,0.002985,-0.002985,0.843168,0.726196,0.140232,0.191548,1.257439,1.304228,0.366706,0.314833,...,0.006641,0.035132,0.270115,0.141938,-0.056517,-0.425008,-0.162854,-0.061392,0.095353,0.037035
b0003,0.021339,-0.021339,-0.014433,0.021868,-0.034729,-0.012274,-0.116637,-0.111456,-0.092886,-0.065961,...,0.108098,0.103761,0.105433,0.031847,0.083441,0.346343,0.121052,0.078861,0.003884,0.038401
b0004,0.010429,-0.010429,-0.176078,-0.156474,-0.073562,-0.095445,-0.266331,-0.270335,-0.160271,-0.143396,...,0.132650,0.027795,-0.003069,-0.012343,0.002600,0.368039,0.149675,-0.076229,-0.087501,-0.093322
b0005,-0.027436,0.027436,-0.037715,0.019915,-0.019846,-0.066566,-0.029172,-0.036180,-0.029949,-0.067325,...,-0.049234,-0.093817,-0.052879,0.051431,0.075849,-0.121052,0.280243,-0.087494,-0.061676,0.050657
b0006,-0.006622,0.006622,-0.124135,-0.139906,-0.033335,0.004262,-0.164456,-0.153767,-0.039725,-0.056505,...,-0.146088,-0.083509,-0.064295,-0.114165,0.010251,-0.233109,-0.212143,-0.150582,-0.193285,-0.186067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b4747,0.025048,-0.025048,-0.000994,-0.034278,0.028132,-0.021253,-0.082164,-0.064346,-0.152217,-0.121305,...,-0.007988,0.099133,0.098743,-0.084958,-0.161593,0.263152,0.137278,0.158355,0.089770,0.134781
b4748,-0.033070,0.033070,0.193472,0.177487,0.023155,0.151234,0.093736,0.060691,0.225145,0.143572,...,0.491769,0.540566,0.799825,0.254554,0.579371,0.364192,0.651310,0.116848,0.473295,0.182170
b4751,-0.020998,0.020998,-0.403576,-0.414432,0.025226,0.041417,-0.352435,-0.351039,0.093141,0.065791,...,-0.025345,-0.191150,-0.217604,-0.153812,-0.183998,-0.176516,-0.176154,-0.274253,-0.157265,-0.163471
b4755,-0.027936,0.027936,0.660154,1.008781,-0.046057,-0.002513,0.708439,1.169890,0.376894,0.015254,...,-2.338082,-1.513758,-2.386956,0.257476,0.919787,-4.989745,-2.808898,1.253362,-0.634701,-0.364697


In [14]:
# Display the round-3 input matrix.
PRECISE_1K_round_3_X

Unnamed: 0,p1k_00001,p1k_00002,p1k_00003,p1k_00004,p1k_00005,p1k_00006,p1k_00007,p1k_00008,p1k_00009,p1k_00010,...,p1k_01046,p1k_01047,p1k_01048,p1k_01049,p1k_01050,p1k_01051,p1k_01052,p1k_01053,p1k_01054,p1k_01055
b0002,-0.026720,0.026720,-0.685464,-0.563789,0.470414,0.301510,-2.254617,-1.548279,0.581309,0.504169,...,-0.522040,-1.618304,-1.289768,-1.415232,-0.817926,-0.908667,-1.541790,-2.687546,-2.203905,-2.485513
b0003,-0.000939,0.000939,-0.218348,0.172683,0.353903,0.250039,-1.858250,-1.092000,0.096292,0.082004,...,-0.558788,-1.424882,-1.231732,-1.177221,-1.325804,-1.205490,-0.788669,-1.703468,-1.835606,-1.589995
b0004,0.014802,-0.014802,-1.262271,-0.942337,-0.190432,-0.411562,-2.838152,-2.314773,-0.559076,-0.557521,...,-0.223929,-0.974052,-0.814781,-0.800523,-0.824566,-0.562565,-0.745037,-1.849511,-1.422841,-1.449446
b0005,-0.167406,0.167406,-0.536316,-0.676610,0.121587,-0.080235,-1.556457,-1.315838,-0.170005,-0.344394,...,-1.283893,-2.249306,-1.702193,-0.851898,-1.715641,-1.118447,-1.092365,-2.289286,-2.922847,-1.394765
b0006,0.044774,-0.044774,-0.182239,-0.136287,-0.355177,-0.309132,0.011939,0.197986,-0.207565,-0.192119,...,0.275097,-0.120508,0.160339,0.187664,0.155708,0.275869,0.013146,0.308957,0.555806,0.337408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b4747,0.524893,-0.524893,-1.069835,-1.025366,-0.010203,-0.905062,-0.156926,-1.269409,0.130193,-0.390413,...,-1.783845,-0.276207,-1.550436,-1.811069,-2.297669,-2.060382,-1.740593,-1.838724,-2.005793,-0.291009
b4748,0.024745,-0.024745,1.294694,1.145757,-0.111523,-0.145740,2.441387,0.967290,0.745186,0.750796,...,2.774181,3.806192,3.712075,4.086613,3.845837,4.428768,0.757894,2.998831,4.345828,2.913964
b4751,-0.066326,0.066326,-1.456818,-1.703800,-0.267948,-0.585387,-1.420220,-1.217357,-0.295353,-0.453001,...,0.636378,-0.517940,-0.256824,-0.265795,-0.906102,0.161663,0.163589,-0.251556,-0.161330,-0.120490
b4755,-0.260712,0.260712,1.571176,2.174608,-0.193954,-0.034274,1.268147,2.210898,0.895829,0.313285,...,-1.774659,-1.150923,-1.666086,0.151901,0.891026,-4.205208,-2.966318,1.870785,0.840926,0.550141


In [19]:
def ICA_explained_variance_each_component(
    M_matrix, A_matrix, log_tpm, genes=None, samples=None, imodulons=None
):
    """Compute the incremental explained variance of each iModulon.

    Components are added to a running reconstruction one at a time, in the
    order given by ``imodulons``. Each component is credited with the change
    in explained variance caused by adding it, so the credits sum to the
    explained variance of the full reconstruction. Because the credit is
    incremental it depends on the order of ``imodulons`` and may be negative
    when adding a component increases the reconstruction error.

    Parameters
    ----------
    M_matrix : pandas.DataFrame
        Gene weights. Rows are selected by gene label, columns by iModulon
        label.
    A_matrix : pandas.DataFrame
        Activities. Rows are selected with the same iModulon labels used for
        the columns of ``M_matrix`` (so the two label sets must match,
        including their type); columns are selected by sample label.
    log_tpm : pandas.DataFrame
        Expression matrix the decomposition is compared against. Each column
        is centred on its own mean before the comparison.
    genes : str or list-like, optional
        Gene label(s) to restrict to. Defaults to ``log_tpm.index``.
    samples : str or list-like, optional
        Sample label(s) to restrict to. Defaults to ``log_tpm.columns``.
    imodulons : str, int or list-like, optional
        iModulon label(s) to evaluate, in order. Defaults to
        ``M_matrix.columns``.

    Returns
    -------
    pandas.DataFrame
        One row per requested iModulon, in input order, with columns
        ``'iModulon'`` and ``'Explained Variance'``.
    """
    # Normalise the selectors: default to everything, wrap a lone label in a list.
    if genes is None:
        genes = log_tpm.index
    elif isinstance(genes, str):
        genes = [genes]

    if samples is None:
        samples = log_tpm.columns
    elif isinstance(samples, str):
        samples = [samples]

    if imodulons is None:
        imodulons = M_matrix.columns
    elif isinstance(imodulons, (str, int)):
        imodulons = [imodulons]

    # Account for normalization procedures before ICA (X=SA-x_mean): centre
    # every column on its mean over ALL rows first, then restrict to the
    # requested genes and samples.
    baseline = log_tpm.subtract(log_tpm.mean(axis=0), axis=1)
    baseline = baseline.loc[genes, samples].to_numpy()

    base_err = np.linalg.norm(baseline) ** 2
    reconstruction = np.zeros(baseline.shape)
    previous_fraction = 0
    explained_variance = []

    # Add one rank-1 component at a time, in the caller's order. Each product
    # is folded into the running sum immediately, so only one genes x samples
    # array per component is alive at any time.
    for k in imodulons:
        reconstruction = reconstruction + np.dot(
            M_matrix.loc[genes, k].values.reshape(len(genes), 1),
            A_matrix.loc[k, samples].values.reshape(1, len(samples)),
        )
        sa_err = np.linalg.norm(reconstruction - baseline) ** 2
        current_fraction = 1 - sa_err / base_err
        explained_variance.append((k, current_fraction - previous_fraction))
        previous_fraction = current_fraction

    return pd.DataFrame(
        explained_variance, columns=['iModulon', 'Explained Variance']
    )

In [20]:
# Cast A's row labels to str. The explained-variance function looks up each
# row of A using M's column labels, so both label sets must have the same type
# (presumably M's labels come from the CSV header as strings -- TODO confirm).
PRECISE_1K_round_3_A.index = PRECISE_1K_round_3_A.index.astype('str')

In [21]:
# Per-component explained variance of the round-3 decomposition, measured
# against the round-3 input matrix; the last line displays the table.
ICA_explained_variance = ICA_explained_variance_each_component(PRECISE_1K_round_3_M, PRECISE_1K_round_3_A, PRECISE_1K_round_3_X)
ICA_explained_variance

Unnamed: 0,iModulon,Explained Variance
0,0,0.001911
1,2,0.006271
2,5,0.005613
3,6,0.050073
4,7,0.006506
5,8,0.010322
6,9,0.025831
7,10,0.004896
8,11,0.013471
9,14,0.003274


In [22]:
# Sum of the per-component increments; they telescope, so this is the
# explained variance of the reconstruction that uses all round-3 components.
ICA_explained_variance['Explained Variance'].sum()

0.17144329165003513

In [23]:
# Residual: round-3 input minus the round-3 reconstruction (which is stored in
# PRECISE_1K_round_4_X despite its name). This residual is the round-4 input.
# DataFrame subtraction aligns on row and column labels, so both frames must
# carry identical labels; unmatched labels would yield NaN rows/columns.
PRECISE_1K_round_4_for_ICA = PRECISE_1K_round_3_X - PRECISE_1K_round_4_X

In [25]:
# Write the residual matrix to CSV (overwrites any existing file at this path).
PRECISE_1K_round_4_for_ICA.to_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X_round_4.csv')

In [26]:
# Load A.csv from ./Fourth_run_40_120 (first CSV column -> row index).
# Presumably produced by running ICA on the residual saved above -- TODO confirm.
PRECISE_1K_round_4_A = pd.read_csv('./Fourth_run_40_120/A.csv', index_col=0)

In [27]:
# Load M.csv from ./Fourth_run_40_120 (first CSV column -> row index).
PRECISE_1K_round_4_M = pd.read_csv('./Fourth_run_40_120/M.csv', index_col=0)

In [29]:
# Cast A's row labels to str so they can be looked up with M's column labels
# inside the explained-variance function (same reason as for round 3).
PRECISE_1K_round_4_A.index = PRECISE_1K_round_4_A.index.astype('str')

In [30]:
# Per-component explained variance of the round-4 decomposition, measured
# against the residual matrix it was fitted on.
# NOTE: this rebinds ICA_explained_variance, replacing the round-3 table.
ICA_explained_variance = ICA_explained_variance_each_component(PRECISE_1K_round_4_M, PRECISE_1K_round_4_A, PRECISE_1K_round_4_for_ICA)
ICA_explained_variance

Unnamed: 0,iModulon,Explained Variance
0,0,0.004265
1,1,0.003367
2,3,0.045871
3,5,0.025371
4,8,0.008389
5,9,0.005154
6,11,0.010865
7,15,0.034418
8,17,0.023539
9,19,0.004517


In [31]:
# Sum of the per-component increments for round 4, i.e. the explained variance
# of the reconstruction that uses all round-4 components.
ICA_explained_variance['Explained Variance'].sum()

0.16575621684232644