In [1]:
from pathlib import Path
import sys
import os.path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import f_oneway, gaussian_kde, mannwhitneyu, pearsonr, spearmanr
import seaborn as sns
from tqdm.notebook import tqdm
from pymodulon.compare import compare_ica
from pymodulon.core import IcaData
from pymodulon.io import load_json_model, save_to_json
from pymodulon.plotting import *

sns.set_style('whitegrid')

In [9]:
# Expression matrix X; the first CSV column becomes the row index.
# In this section it is only used as the source of row/column labels for the M·A reconstruction.
PRECISE_1K_X = pd.read_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X.csv', index_col=0)

In [3]:
# NOTE(review): the variable is named "round_1" but the file read is "..._X_round_2.csv" —
# confirm which round this matrix belongs to. It is the matrix the M·A reconstruction is
# subtracted from below, and the log_tpm input of the explained-variance calculation.
PRECISE_1K_round_1_X = pd.read_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X_round_2.csv', index_col=0)

In [4]:
# A matrix from the './Second_run_100_300' directory; the first CSV column becomes the row index.
# Below it is indexed as A.loc[component, samples], i.e. rows are components and columns are samples.
PRECISE_1K_round_2_A = pd.read_csv('./Second_run_100_300/A.csv', index_col=0)

In [5]:
# M matrix from the './Second_run_100_300' directory; the first CSV column becomes the row index.
# Below it is indexed as M.loc[genes, component], i.e. rows are genes and columns are components.
PRECISE_1K_round_2_M = pd.read_csv('./Second_run_100_300/M.csv', index_col=0)

In [6]:
# Sanity check: M's column count must equal A's row count for the M·A product below.
PRECISE_1K_round_2_M.shape

(4257, 39)

In [7]:
# Sanity check: A's row count must equal M's column count for the M·A product below.
PRECISE_1K_round_2_A.shape

(39, 981)

In [8]:
# Reconstruction M·A of the expression matrix from the factor matrices. The product is
# taken on raw arrays, so the resulting frame starts out with default integer labels.
PRECISE_1K_round_3_X = pd.DataFrame(
    PRECISE_1K_round_2_M.to_numpy() @ PRECISE_1K_round_2_A.to_numpy()
)

In [12]:
# Labels are copied positionally from PRECISE_1K_X: this assumes the rows of M and the
# columns of A are in the same order as PRECISE_1K_X's genes and samples — TODO confirm.
PRECISE_1K_round_3_X.index, PRECISE_1K_round_3_X.columns = (
    PRECISE_1K_X.index,
    PRECISE_1K_X.columns,
)

In [13]:
# Display the labelled M·A reconstruction (pandas abbreviates large frames when rendering).
PRECISE_1K_round_3_X

Unnamed: 0,p1k_00001,p1k_00002,p1k_00003,p1k_00004,p1k_00005,p1k_00006,p1k_00007,p1k_00008,p1k_00009,p1k_00010,...,p1k_01046,p1k_01047,p1k_01048,p1k_01049,p1k_01050,p1k_01051,p1k_01052,p1k_01053,p1k_01054,p1k_01055
b0002,-0.014017,0.014017,-1.914910,-1.675813,0.216899,0.198383,-3.408499,-2.506678,-0.110318,-0.239731,...,-0.734850,-1.378984,-1.084010,-0.585458,0.203832,-0.028175,-1.613630,-2.649980,-1.607055,-2.115792
b0003,-0.021909,0.021909,-0.625126,-0.613259,0.056251,0.071905,-0.918980,-0.631812,-0.027859,-0.071401,...,-0.216764,-0.434510,-0.409541,-0.201786,0.027814,-0.106116,-0.447766,-1.181622,-0.893502,-0.958997
b0004,-0.014176,0.014176,-0.780912,-0.798788,0.025850,0.032977,-1.034554,-0.804468,-0.117243,-0.152881,...,-0.153024,-0.215061,-0.136551,0.048602,0.187511,-0.002985,-0.491158,-1.090319,-0.761851,-0.857883
b0005,0.002599,-0.002599,0.174107,0.169016,0.128384,0.088054,0.054385,0.119672,0.186349,0.216845,...,0.104155,0.056705,0.001697,0.044525,0.333072,0.360302,0.277160,-0.136686,-0.412401,-0.174655
b0006,0.009992,-0.009992,-0.198707,-0.199412,-0.080043,-0.091652,-0.134069,-0.198928,-0.128049,-0.125911,...,-0.145852,-0.083912,-0.112395,-0.160210,-0.190101,-0.280443,0.046822,0.022769,0.081008,0.038246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b4747,0.420749,-0.420749,-0.693528,-1.032428,-0.461308,-1.160604,-0.240815,-1.074750,-0.165695,-0.218122,...,-2.369883,-0.469869,-1.706593,-2.003250,-2.275730,-2.368352,-1.180977,-1.195541,-2.147764,-1.101610
b4748,0.096726,-0.096726,-0.734249,-0.895356,-0.190589,-0.247392,-0.526779,-0.643773,-0.035025,0.032432,...,-0.210687,-0.283628,-0.537124,-0.324407,-0.756185,0.082537,-1.239489,-0.815961,-0.118758,-0.714928
b4751,-0.012768,0.012768,-0.581351,-0.563326,-0.114843,-0.109069,-0.581293,-0.281062,-0.244368,-0.182828,...,-0.083149,-0.341674,-0.295502,-0.254148,-0.320754,-0.008041,0.028794,-0.444246,-0.305261,-0.378353
b4755,0.017471,-0.017471,0.659830,0.674734,0.079145,0.145165,0.709273,0.639747,0.259307,0.249642,...,0.269297,0.678678,0.855909,0.762350,0.831065,0.913207,0.173676,0.989758,1.056406,0.892750


In [14]:
# Display the input matrix for side-by-side comparison with the reconstruction.
PRECISE_1K_round_1_X

Unnamed: 0,p1k_00001,p1k_00002,p1k_00003,p1k_00004,p1k_00005,p1k_00006,p1k_00007,p1k_00008,p1k_00009,p1k_00010,...,p1k_01046,p1k_01047,p1k_01048,p1k_01049,p1k_01050,p1k_01051,p1k_01052,p1k_01053,p1k_01054,p1k_01055
b0002,-0.026720,0.026720,-0.685464,-0.563789,0.470414,0.301510,-2.254617,-1.548279,0.581309,0.504169,...,-0.522040,-1.618304,-1.289768,-1.415232,-0.817926,-0.908667,-1.541790,-2.687546,-2.203905,-2.485513
b0003,-0.000939,0.000939,-0.218348,0.172683,0.353903,0.250039,-1.858250,-1.092000,0.096292,0.082004,...,-0.558788,-1.424882,-1.231732,-1.177221,-1.325804,-1.205490,-0.788669,-1.703468,-1.835606,-1.589995
b0004,0.014802,-0.014802,-1.262271,-0.942337,-0.190432,-0.411562,-2.838152,-2.314773,-0.559076,-0.557521,...,-0.223929,-0.974052,-0.814781,-0.800523,-0.824566,-0.562565,-0.745037,-1.849511,-1.422841,-1.449446
b0005,-0.167406,0.167406,-0.536316,-0.676610,0.121587,-0.080235,-1.556457,-1.315838,-0.170005,-0.344394,...,-1.283893,-2.249306,-1.702193,-0.851898,-1.715641,-1.118447,-1.092365,-2.289286,-2.922847,-1.394765
b0006,0.044774,-0.044774,-0.182239,-0.136287,-0.355177,-0.309132,0.011939,0.197986,-0.207565,-0.192119,...,0.275097,-0.120508,0.160339,0.187664,0.155708,0.275869,0.013146,0.308957,0.555806,0.337408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b4747,0.524893,-0.524893,-1.069835,-1.025366,-0.010203,-0.905062,-0.156926,-1.269409,0.130193,-0.390413,...,-1.783845,-0.276207,-1.550436,-1.811069,-2.297669,-2.060382,-1.740593,-1.838724,-2.005793,-0.291009
b4748,0.024745,-0.024745,1.294694,1.145757,-0.111523,-0.145740,2.441387,0.967290,0.745186,0.750796,...,2.774181,3.806192,3.712075,4.086613,3.845837,4.428768,0.757894,2.998831,4.345828,2.913964
b4751,-0.066326,0.066326,-1.456818,-1.703800,-0.267948,-0.585387,-1.420220,-1.217357,-0.295353,-0.453001,...,0.636378,-0.517940,-0.256824,-0.265795,-0.906102,0.161663,0.163589,-0.251556,-0.161330,-0.120490
b4755,-0.260712,0.260712,1.571176,2.174608,-0.193954,-0.034274,1.268147,2.210898,0.895829,0.313285,...,-1.774659,-1.150923,-1.666086,0.151901,0.891026,-4.205208,-2.966318,1.870785,0.840926,0.550141


In [15]:
def ICA_explained_variance_each_component(
    M_matrix, A_matrix, log_tpm, genes=None, samples=None, imodulons=None
):
    """Compute the incremental variance explained by each iModulon.

    Components are added one at a time, in the order given by ``imodulons``,
    to a running reconstruction ``sum_k M[:, k] * A[k, :]``. After each
    addition the reconstructed fraction ``1 - ||MA - X||^2 / ||X||^2`` is
    evaluated against the column-centered expression matrix ``X``; the value
    reported for a component is the increase of that fraction caused by
    adding it. The increments therefore sum to the fraction explained by all
    listed components together, and each increment depends on the order in
    which components are added.

    Parameters
    ----------
    M_matrix : pandas.DataFrame
        Indexed as ``M_matrix.loc[genes, k]`` (rows: genes, columns: iModulons).
    A_matrix : pandas.DataFrame
        Indexed as ``A_matrix.loc[k, samples]`` (rows: iModulons, columns:
        samples). Its row labels must match ``M_matrix``'s column labels.
    log_tpm : pandas.DataFrame
        Expression matrix (rows: genes, columns: samples).
    genes : str or list-like, optional
        Gene label(s) to restrict to. Defaults to ``log_tpm.index``.
    samples : str or list-like, optional
        Sample label(s) to restrict to. Defaults to ``log_tpm.columns``.
    imodulons : str, int or list-like, optional
        iModulon label(s) to evaluate, in order. Defaults to
        ``M_matrix.columns``.

    Returns
    -------
    pandas.DataFrame
        One row per iModulon, in input order, with columns ``'iModulon'`` and
        ``'Explained Variance'``.
    """
    # Check inputs
    if genes is None:
        genes = log_tpm.index
    elif isinstance(genes, str):
        genes = [genes]

    if samples is None:
        samples = log_tpm.columns
    elif isinstance(samples, str):
        samples = [samples]

    if imodulons is None:
        imodulons = M_matrix.columns
    elif isinstance(imodulons, (str, int)):
        imodulons = [imodulons]

    # Account for normalization procedures before ICA (X=SA-x_mean):
    # subtract each sample's (column's) mean over all genes, then subset.
    baseline = log_tpm.subtract(log_tpm.mean(axis=0), axis=1)
    baseline = baseline.loc[genes, samples].to_numpy()

    base_err = np.linalg.norm(baseline) ** 2
    reconstruction = np.zeros(baseline.shape)
    previous_rec_var = 0
    explained_variance = []

    # Single pass: add each component's rank-1 contribution to the running
    # reconstruction and record the gain. Only one genes x samples
    # contribution is alive at a time, instead of one per component.
    for k in imodulons:
        reconstruction += np.outer(
            M_matrix.loc[genes, k].values,
            A_matrix.loc[k, samples].values,
        )
        sa_err = np.linalg.norm(reconstruction - baseline) ** 2
        rec_var = 1 - sa_err / base_err
        explained_variance.append((k, rec_var - previous_rec_var))
        previous_rec_var = rec_var

    # Rows stay in the order of `imodulons` (not sorted by explained variance)
    return pd.DataFrame(
        explained_variance, columns=['iModulon', 'Explained Variance']
    )

In [18]:
# Cast A's row labels to strings. The explained-variance function takes component labels
# from M.columns and looks them up with A.loc[k, ...]; presumably M's labels are strings
# (parsed from the CSV header) while A's were parsed as integers — TODO confirm.
PRECISE_1K_round_2_A.index = PRECISE_1K_round_2_A.index.astype('str')

In [19]:
# Per-component explained variance of the second-run decomposition, measured against
# the matrix loaded from "..._X_round_2.csv".
ICA_explained_variance = ICA_explained_variance_each_component(
    M_matrix=PRECISE_1K_round_2_M,
    A_matrix=PRECISE_1K_round_2_A,
    log_tpm=PRECISE_1K_round_1_X,
)
ICA_explained_variance

Unnamed: 0,iModulon,Explained Variance
0,0,0.028979
1,1,0.004706
2,3,0.025546
3,4,0.005348
4,5,0.003686
5,6,0.002883
6,7,0.001984
7,9,0.004752
8,11,0.001251
9,12,0.015894


In [20]:
# The per-component increments telescope, so their sum is the fraction of variance
# reconstructed by all components together.
ICA_explained_variance.loc[:, 'Explained Variance'].sum()

0.4368269977132547

In [21]:
# Residual left after removing the M·A reconstruction from the input matrix.
# The subtraction aligns the two frames on their row and column labels.
PRECISE_1K_round_3_for_ICA = PRECISE_1K_round_1_X.sub(PRECISE_1K_round_3_X)

In [23]:
# Persist the residual matrix (index included); judging by the file name this is
# presumably the input for a third ICA round — confirm.
PRECISE_1K_round_3_for_ICA.to_csv('./P1K_minicoli_removed/P1K_minicoli_removed_X_round_3.csv')