## Apply NMF to gene dependencies

Non-negative Matrix Factorization (NMF) is a dimensionality reduction technique that factors a non-negative matrix into two non-negative matrices. This is particularly useful when the data is inherently non-negative.

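We apply NMF to the GeneEffect scores and save the resulting latent representations (NMF components) for downstream comparative analyses (e.g., comparison with BetaVAE). Because NMF requires non-negative input, the scores are first shifted by their global minimum whenever that minimum is negative.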

In [1]:
import sys
import pathlib
import pandas as pd
import plotnine as gg
from sklearn.decomposition import NMF

sys.path.insert(0, "../utils/")
from data_loader import load_model_data

In [2]:
# Input locations: files written by the data-download step.
data_directory = pathlib.Path("../0.data-download/data").resolve()
dependency_file = (data_directory / "CRISPRGeneEffect.parquet").resolve()
gene_dict_file = (data_directory / "CRISPR_gene_dictionary.parquet").resolve()

# Output locations: create the directory up front so the later to_parquet
# calls do not fail on a fresh checkout where "results" does not exist yet.
output_dir = pathlib.Path("results").resolve()
output_dir.mkdir(parents=True, exist_ok=True)
nmf_output_file = (output_dir / "nmf_latent.parquet.gz").resolve()
output_nmf_weights_file = (output_dir / "NMF_weight_matrix_gsea.parquet").resolve()

In [3]:
# Number of latent components requested from NMF (passed as n_components below).
nmf_components = 50

In [4]:
# Load the gene-effect table and the gene dictionary through the project helper
# imported from ../utils/data_loader.
# NOTE(review): any filtering or reshaping done inside load_model_data is not
# visible from this notebook — confirm against the helper before relying on shapes.
dependency_df, gene_dict_df = load_model_data(dependency_file, gene_dict_file)

(1150, 18444)


In [5]:
# NMF only accepts non-negative input, so separate the numeric gene columns
# from the ModelID identifier and shift them if any value is below zero.
gene_effect_df = dependency_df.drop(columns=["ModelID"])
min_value = gene_effect_df.min().min()

# Subtracting a negative global minimum moves the smallest entry to exactly 0;
# when nothing is negative the values are used unchanged.
dependency_df_non_negative = (
    gene_effect_df - min_value if min_value < 0 else gene_effect_df
)

# Perform NMF

In [6]:
# Fit NMF with a fixed random_state so repeated runs use the same seed.
nmf = NMF(n_components=nmf_components, random_state=0)
# W: the transformed input returned by fit_transform (one row per input row,
# one column per component); it becomes the per-model latent table below.
W = nmf.fit_transform(dependency_df_non_negative)
# H: the fitted components_ matrix (one row per component, one column per
# input column); it is exported later as the per-gene weight matrix.
H = nmf.components_

In [7]:
# Wrap the W matrix (models in NMF space) in a DataFrame. Rows keep the order
# of the NMF input; index and column labels are the default integer ranges.
dependency_df_transformed = pd.DataFrame(W)

In [8]:
# Build the labelled latent table straight from W so this cell can be re-run
# safely (relabelling the existing frame in place would rename ModelID on a
# second execution).
latent_columns = [f"NMF_{x}" for x in range(W.shape[1])]

dependency_df_transformed = pd.concat(
    [
        # pd.concat aligns on index for axis="columns"; W carries no index, so
        # reset the ModelID index to match rows by position rather than by label.
        dependency_df["ModelID"].reset_index(drop=True),
        pd.DataFrame(W, columns=latent_columns),
    ],
    axis="columns",
)

# Persist the latent representation (ModelID + one column per NMF component).
dependency_df_transformed.to_parquet(nmf_output_file, index=False)

print(dependency_df_transformed.shape)
dependency_df_transformed.head(3)

(1150, 51)


Unnamed: 0,ModelID,NMF_0,NMF_1,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,...,NMF_40,NMF_41,NMF_42,NMF_43,NMF_44,NMF_45,NMF_46,NMF_47,NMF_48,NMF_49
0,ACH-000001,0.2098,0.0,0.0,0.014303,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009268,0.00485,0.982635
1,ACH-000004,0.215258,0.004779,0.0,0.0,0.0,0.004609,0.0,0.002082,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001776,0.012277,0.973513
2,ACH-000005,0.215434,0.000937,0.0,0.0,0.0,0.012313,0.0,0.000643,0.000318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002394,0.013144,0.974102


In [9]:
# Export the H matrix as a genes x components table of weights for GSEA.
component_labels = [f"NMF_{x}" for x in range(H.shape[0])]

nmf_weights = (
    pd.DataFrame(
        H.T,
        index=dependency_df_non_negative.columns.tolist(),
        columns=component_labels,
    )
    # Move the gene names out of the index into a regular "genes" column.
    .reset_index()
    .rename(columns={"index": "genes"})
)

# Persist the weight matrix without the positional index.
nmf_weights.to_parquet(output_nmf_weights_file, index=False)

print(nmf_weights.shape)
nmf_weights.head(3)

(2718, 51)


Unnamed: 0,genes,NMF_0,NMF_1,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,...,NMF_40,NMF_41,NMF_42,NMF_43,NMF_44,NMF_45,NMF_46,NMF_47,NMF_48,NMF_49
0,SARS2,0.071005,0.0,21.368498,11.065316,0.0,7.578911,0.0,29.019315,61.728789,...,0.0,0.0,9.783745,4.862253,1.096877,3.659648,3.391358,3.903741,3.06248,5.001846
1,LARP4,0.322014,16.745962,7.208615,3.910373,16.466121,0.478806,13.895235,10.180814,31.927719,...,25.773102,3.64141,0.0,0.901202,5.477956,3.124627,4.322935,3.423482,3.464876,5.39967
2,APH1A,4.372661,3.957601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,7.401898,4.173206,3.360864,2.882472,1.78311,3.483496,4.29828,4.546053
