## Apply ICA to gene dependencies

Independent Component Analysis (ICA) is a dimensionality reduction method that separates a multivariate signal into additive, independent components.

We apply ICA to the GeneEffect scores and save the resulting latent representations (the ICA components) so that downstream analyses can compare them with BetaVAE.


In [9]:
import sys
import pathlib
import pandas as pd
import plotnine as gg
from sklearn.decomposition import FastICA

sys.path.insert(0, "../utils/")
from data_loader import load_model_data

In [10]:
# Input locations: gene-effect scores and the gene dictionary, both parquet files
# that live in the data-download step's output directory.
data_directory = pathlib.Path("../0.data-download/data").resolve()
dependency_file = (data_directory / "CRISPRGeneEffect.parquet").resolve()
gene_dict_file = (data_directory / "CRISPR_gene_dictionary.parquet").resolve()

# Output locations. The directory is created up front so the `to_parquet`
# calls further down do not fail on a fresh checkout where `results/` is absent.
output_dir = pathlib.Path("results").resolve()
output_dir.mkdir(parents=True, exist_ok=True)
ica_output_file = (output_dir / "ica_latent.parquet.gz").resolve()
output_ica_weights_file = (output_dir / "ICA_weight_matrix_gsea.parquet").resolve()

In [11]:
# Number of independent components to extract; passed as `n_components` to FastICA below.
ica_components = 50

In [12]:
# Load data
# `load_model_data` is a project helper imported from ../utils/data_loader. It is
# given the gene-effect and gene-dictionary parquet paths, and its two return values
# are unpacked as the dependency table and the gene dictionary table.
# Later cells rely on `dependency_df` having a "ModelID" column.
# NOTE(review): the shape printed under this cell is presumably emitted by the helper — confirm.
dependency_df, gene_dict_df = load_model_data(dependency_file, gene_dict_file)

(1150, 18444)


# Perform ICA

In [13]:
# Fit FastICA on every column except the ModelID identifier.
# The seed is fixed so repeated runs yield the same decomposition.
gene_effect_features = dependency_df.drop(columns=["ModelID"])
ica = FastICA(random_state=0, n_components=ica_components)
ica.fit(gene_effect_features)

In [14]:
# Project each row of the dependency table (ModelID column excluded) into ICA space
# using the already-fitted model, then wrap the resulting array in a DataFrame.
ica_projection = ica.transform(dependency_df.drop(columns=["ModelID"]))
dependency_df_transformed = pd.DataFrame(ica_projection)

In [15]:
# Recode column space and add back model IDs.
dependency_df_transformed.columns = [
    f"ICA_{x}" for x in range(dependency_df_transformed.shape[1])
]

# `pd.concat(axis="columns")` aligns rows by index label, not by position. The ICA
# scores carry a fresh 0..n-1 index, whereas `dependency_df` keeps whatever index
# the loader produced. Resetting both sides to a positional index guarantees that
# row i of the scores is paired with the ModelID of row i, instead of silently
# producing NaNs or extra rows when the labels differ.
dependency_df_transformed = pd.concat(
    [
        dependency_df["ModelID"].reset_index(drop=True),
        dependency_df_transformed.reset_index(drop=True),
    ],
    axis="columns",
)

# NOTE: the ".gz" suffix in the file name does not select the codec; `to_parquet`
# uses its `compression` argument (pandas default: snappy) regardless of extension.
dependency_df_transformed.to_parquet(ica_output_file, index=False)

print(dependency_df_transformed.shape)
dependency_df_transformed.head(3)

(1150, 51)


Unnamed: 0,ModelID,ICA_0,ICA_1,ICA_2,ICA_3,ICA_4,ICA_5,ICA_6,ICA_7,ICA_8,...,ICA_40,ICA_41,ICA_42,ICA_43,ICA_44,ICA_45,ICA_46,ICA_47,ICA_48,ICA_49
0,ACH-000001,-1.89482,-0.420639,0.152816,0.683013,-0.576386,-1.254303,0.875927,-1.027659,0.147783,...,-0.392907,-0.136282,-3.179023,0.041179,0.131887,0.356428,-0.911628,-0.139935,0.427778,-1.135748
1,ACH-000004,0.19919,0.878038,0.234728,-2.292786,0.262948,-0.014035,-0.237673,-1.491049,-0.696543,...,-3.667632,0.400782,0.015032,0.764805,1.560523,0.082876,0.558015,0.268848,-0.079606,0.221561
2,ACH-000005,0.102404,0.134753,-0.001711,-0.578072,-0.197917,-1.19091,-0.06503,-0.879704,-1.128394,...,-3.43967,0.167856,-0.19318,3.324927,0.527463,0.296716,0.149641,-0.156201,-0.212288,0.140653


In [16]:
# Obtain weights, which can be used in GSEA
ica_weights = pd.DataFrame(ica.mixing_, index=dependency_df.drop(columns=["ModelID"]).columns.tolist())
ica_weights.columns = [f"ICA_{x}" for x in range(ica_weights.shape[1])]


ica_weights = ica_weights.reset_index().rename(columns={"index": "genes"})


ica_weights.to_parquet(output_ica_weights_file, index=False)


print(ica_weights.shape)
ica_weights.head(3)

(2718, 51)


Unnamed: 0,genes,ICA_0,ICA_1,ICA_2,ICA_3,ICA_4,ICA_5,ICA_6,ICA_7,ICA_8,...,ICA_40,ICA_41,ICA_42,ICA_43,ICA_44,ICA_45,ICA_46,ICA_47,ICA_48,ICA_49
0,CYB5R4,0.002743,0.004387,-0.002686,0.013452,-0.024221,-0.012599,-0.004682,0.001535,-0.008149,...,-0.025868,0.024477,0.01824,0.006149,-0.009292,-0.000889,0.037068,0.019506,0.003236,-0.018785
1,CNEP1R1,0.000538,0.000177,-0.007442,0.011053,-0.015358,-0.00489,-0.00838,0.005434,-0.021387,...,0.014664,0.012351,0.040456,-0.010147,0.022489,-0.005652,0.010396,0.026636,-0.021068,-0.012807
2,ARHGEF12,0.014816,-0.023335,-0.010923,0.002226,-0.002431,0.004405,0.001888,0.002457,0.007297,...,-0.007788,-0.004481,0.003721,-0.024953,0.000335,0.014994,0.003732,-0.022538,-0.007353,-0.000842
