In [1]:

import pathlib 
import optuna
import pandas as pd
import numpy as np
import random
import torch
import sys
import blitzgsea as blitz
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA, FastICA, NMF
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
from scipy.stats import ttest_ind

script_directory = pathlib.Path("../2.train-VAE/utils/").resolve()
sys.path.insert(0, str(script_directory))
from betavae import BetaVAE, train_vae, weights
from betatcvae import BetaTCVAE, train_tc_vae, tc_weights
from vanillavae import VanillaVAE, train_vvae, vanilla_weights
from optimize_utils import get_optimize_args, objective, get_optimizer
from optimize_utils_tcvae import get_optimize_args_tc, objective_tc, get_optimizer_tc
from optimize_utils_vvae import get_optimize_args_vvae, objective_vvae, get_optimizer_vvae

script_directory = pathlib.Path("../utils/").resolve()
sys.path.insert(0, str(script_directory))
from data_loader import load_train_test_data, load_model_data

In [2]:
# Load command line arguments
# One options object per VAE flavour, each produced by its own project helper
# (the plain, `_tc` and `_vvae` variants of `get_optimize_args`).
# NOTE(review): none of these three objects is read again in the cells visible
# here — confirm they are still needed.
args = get_optimize_args()
tc_args = get_optimize_args_tc()
vvae_args = get_optimize_args_vvae()

# Load data
data_directory = pathlib.Path("../0.data-download/data").resolve()

# Ask the loader for every split plus the gene statistics, with
# `zero_one_normalize=True` (presumably scales values into [0, 1] — confirm in
# data_loader). Four objects come back: train, test, validation, gene stats.
# NOTE(review): the variable `load_gene_stats` shares its name with the keyword
# argument that requests it; it holds the returned statistics, not a flag.
train_df, test_df, val_df, load_gene_stats = load_train_test_data(
    data_directory, train_or_test="all", load_gene_stats=True, zero_one_normalize=True
)
# DataFrame wrapper around the training split; the sklearn models below are fit
# on it and the VAE constructors read its column count.
train_data = pd.DataFrame(train_df)

# Dependency table and gene dictionary, both read from parquet files that sit
# next to the split data.
dependency_file = pathlib.Path(f"{data_directory}/CRISPRGeneEffect.parquet").resolve()
gene_dict_file = pathlib.Path(f"{data_directory}/CRISPR_gene_dictionary.parquet").resolve()
dependency_df, gene_dict_df= load_model_data(dependency_file, gene_dict_file)
# Make sure the gene dictionary is a DataFrame before it is indexed by column.
gene_dict_df = pd.DataFrame(gene_dict_df)

(1150, 18444)


In [3]:
# Training split that the VAE weight-extraction helpers receive alongside the model
data_directory = pathlib.Path("../0.data-download/data").resolve()
weight_df = load_train_test_data(data_directory, train_or_test="train")

# Names of the dependency columns whose gene is flagged as passing QC
gene_list_passed_qc = (
    gene_dict_df["dependency_column"].loc[gene_dict_df["qc_pass"]].tolist()
)

# Restrict the weight data to those QC-passing columns and preview the result
weight_data = weight_df.filter(items=gene_list_passed_qc, axis=1)
weight_data.head()

Unnamed: 0,FDX2 (112812),NSMCE4A (54780),AASDHPPT (60496),RAD50 (10111),PRMT5 (10419),ALG6 (29929),EXOC1 (55763),RHNO1 (83695),ATG12 (9140),ITPK1 (3705),...,CDC25B (994),CNOT4 (4850),WDR83 (84292),UBALD2 (283991),DOT1L (84444),DNM1L (10059),PET117 (100303755),RNF168 (165918),SENP8 (123228),SECISBP2 (79048)
0,-0.880565,-0.782422,-0.778251,-0.187087,-1.177537,-0.213722,-0.403387,-0.028371,0.04306,-0.078512,...,-0.172492,0.196713,-0.185486,-0.357539,-0.062151,-0.181394,-0.100426,-0.425072,0.003515,-0.336083
1,-0.74912,-0.516381,-0.080714,0.056364,-0.466129,-0.245435,-0.899518,-0.135167,-0.259756,-0.167318,...,-0.256712,0.465617,-0.28465,-0.20333,-0.107819,-0.684306,0.141742,-0.104726,-0.087244,-0.022045
2,-0.575482,-0.353394,-0.083243,0.076057,-0.953753,-0.25106,-0.902062,-0.093195,-0.043493,-0.141663,...,-0.446836,0.05456,-0.494846,0.071733,0.25825,-0.370277,-0.059265,-0.713488,0.090763,-0.299328
3,-0.47595,-0.310926,0.048373,0.064619,-1.014308,-0.407698,-0.765044,-0.196072,0.05962,-0.307384,...,-0.2292,-0.077849,0.027445,-0.074084,-0.541231,-0.420726,-0.092938,-1.170949,0.135269,-0.194019
4,-0.604045,-0.210099,-0.567472,-0.106073,-0.733926,-0.223492,-0.325947,-0.015823,0.244816,-0.422609,...,-0.049858,0.20208,-0.217986,-0.402349,-0.482002,-0.8513,-0.654365,-0.428541,0.077339,0.096456


In [4]:
# Build one float32 tensor per split (train / test / validation)
train_tensor, test_tensor, val_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (train_df, test_df, val_df)
)

In [5]:
# Directory that receives this notebook's result files; "./results" is resolved
# to an absolute path against the current working directory.
output_dir = pathlib.Path("./results").resolve()
# Create it (and any missing parents) up front; an existing directory is fine.
output_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# Function to perform dimensionality reduction and extract z matrices for sklearn models
def extract_z_matrix_sklearn(model, data, model_name, num_components, output_dir):
    """Fit ``model`` on ``data`` and return the latent (z) matrix as a DataFrame.

    Parameters
    ----------
    model
        Any object exposing ``fit_transform(data)`` (e.g. PCA, FastICA, NMF).
    data : pandas.DataFrame
        Samples in rows, features in columns. Its row index becomes the
        ``ModelID`` column of the result.
    model_name : str
        Prefix for the latent column names. The value ``"nmf"`` additionally
        shifts ``data`` so its minimum is 0 before fitting.
    num_components : int
        Number of latent columns to label; must equal the number of columns
        ``model.fit_transform`` returns.
    output_dir
        Unused. Kept so existing call sites keep working; nothing is written
        to disk by this function.

    Returns
    -------
    pandas.DataFrame
        ``ModelID`` first, followed by ``{model_name}_0 .. {model_name}_{k-1}``.
    """
    if model_name == "nmf":
        # NMF needs non-negative input: shift everything up by the global minimum.
        # `data - min_value` builds a new frame, so the caller's data is untouched.
        min_value = data.min().min()
        if min_value < 0:
            data = data - min_value

    # Fit the model and transform the data
    z_matrix = model.fit_transform(data)

    z_matrix_df = pd.DataFrame(
        z_matrix, columns=[f"{model_name}_{i}" for i in range(num_components)]
    )

    # Label rows with the index of the data that was actually decomposed. The
    # previous version always read the notebook-global `train_data.index`, which
    # mislabels (or fails on) any other input. The global is only kept as a
    # fallback for inputs that carry no pandas index.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        row_ids = data.index
    else:
        row_ids = train_data.index
    z_matrix_df.insert(0, "ModelID", row_ids)

    return z_matrix_df

In [36]:
def extract_weights(model, model_name, H=None):
    """Return the component loadings of a fitted sklearn decomposition as a gene table.

    Parameters
    ----------
    model
        Fitted estimator exposing ``components_`` (components in rows, features
        in columns).
    model_name : str
        Must be ``"pca"``, ``"ica"`` or ``"nmf"``.
    H
        Unused; accepted only so existing call sites keep working.

    Returns
    -------
    pandas.DataFrame
        A ``genes`` column followed by one column per component, named
        ``"0"``, ``"1"``, ... (strings).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. Previously this
        path fell through to an ``UnboundLocalError``.
    """
    if model_name not in ("pca", "ica", "nmf"):
        raise ValueError(
            f"extract_weights only supports 'pca', 'ica' and 'nmf', got {model_name!r}"
        )

    # Gene labels come from the notebook-global dependency table.
    # NOTE(review): this assumes the model was fit on exactly these columns, in
    # this order — confirm against the frame passed to `model.fit`.
    gene_columns = dependency_df.drop(columns=["ModelID"]).columns.tolist()

    # components_ is (component x gene); transpose so each gene is a row.
    weights_df = pd.DataFrame(model.components_, columns=gene_columns).transpose()
    weights_df.columns = [f"{x}" for x in range(0, weights_df.shape[1])]

    # Reset index to rename 'index' to 'genes'
    weights_df = weights_df.reset_index().rename(columns={"index": "genes"})

    return weights_df


def perform_gsea(weights_df, model_name, num_components, seed=None, lfc_cutoff=0.584, fdr_cutoff=0.25):
    """Run blitzGSEA on every component of a weight matrix and keep the significant hits.

    Parameters
    ----------
    weights_df : pandas.DataFrame
        First column ``genes``; every further column holds one component's
        per-gene weights. Those column labels must be convertible with ``int()``.
    model_name : str
        Stored in the ``model`` column of the result.
    num_components : int
        Latent dimensionality of the full model; stored as ``full_model_z``.
    seed : optional
        Forwarded to ``blitz.gsea``. When ``None`` (default) a fresh
        ``random.random()`` value is drawn once per call, as before, so runs
        are not reproducible unless a seed is supplied.
    lfc_cutoff : float
        Minimum absolute enrichment score (``es``) for a row to be kept.
    fdr_cutoff : float
        Upper bound applied to the ``pval`` column.

    Returns
    -------
    pandas.DataFrame
        Rows passing both cutoffs, with columns ``z``, ``full_model_z``,
        ``model``, ``reactome pathway``, ``gsea es score``, ``nes score``,
        ``p value`` and ``shuffled``. Empty (but with those columns) when no
        component produced any result.
    """
    result_columns = [
        "z",
        "full_model_z",
        "model",
        "reactome pathway",
        "gsea es score",
        "nes score",
        "p value",
        "shuffled",
    ]

    library = blitz.enrichr.get_library("Reactome_2022")
    if seed is None:
        seed = random.random()

    gsea_results = []
    # Perform GSEA for each component column in weights_df
    for col in weights_df.columns[1:]:  # Skip 'genes' column
        gene_signature = weights_df[['genes', col]]
        if gene_signature.shape[0] == 0:
            continue

        try:
            # Perform GSEA using the gene signature (weights); reset_index
            # turns the result's index into the 'Term' column read below.
            gsea_result = blitz.gsea(gene_signature, library, seed=seed).reset_index()
        except ZeroDivisionError:
            print(f"Skipping GSEA for {col} due to zero division error.")
            continue

        # Zip the needed columns instead of iterrows(): same records, without
        # building a Series object for every one of the result rows.
        gsea_results.extend(
            {
                "z": int(col),
                "full_model_z": num_components,
                "model": str(model_name),
                "reactome pathway": str(term),
                "gsea es score": es,
                "nes score": nes,
                "p value": pval,
                "shuffled": False,
            }
            for term, es, nes, pval in zip(
                gsea_result["Term"],
                gsea_result["es"],
                gsea_result["nes"],
                gsea_result["pval"],
            )
        )

    # Passing the column list keeps the schema even when nothing was collected;
    # a bare pd.DataFrame([]) has no columns and the filter below would KeyError.
    gsea_results_df = pd.DataFrame(gsea_results, columns=result_columns)

    if gsea_results_df.empty:
        significant_gsea_df = gsea_results_df
    else:
        # Filter for significant results.
        # NOTE(review): `fdr_cutoff` is compared against the raw p-value, not a
        # multiple-testing-adjusted value — confirm this is intended.
        significant_gsea_df = gsea_results_df[
            (gsea_results_df['gsea es score'].abs() > lfc_cutoff) &
            (gsea_results_df['p value'] < fdr_cutoff)
        ]
    print(significant_gsea_df)

    return significant_gsea_df


# Initialize a list to store the final results
# NOTE(review): nothing in this cell appends to it; kept in case other cells read it.
final_gsea_results = []

# GSEA settings
latent_dims = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 150, 200]
model_names = ["pca", "ica", "nmf", "vanillavae", "betavae", "betatcvae"]

# The three VAE flavours share one recipe (tune with Optuna -> retrain with the
# best trial -> extract weights); only the callables differ.
# name -> (Optuna objective, model class, optimizer factory, training fn,
#          weight extractor, whether the model constructor takes `beta`)
VAE_PIPELINES = {
    "vanillavae": (objective_vvae, VanillaVAE, get_optimizer_vvae, train_vvae, vanilla_weights, False),
    "betavae": (objective, BetaVAE, get_optimizer, train_vae, weights, True),
    "betatcvae": (objective_tc, BetaTCVAE, get_optimizer_tc, train_tc_vae, tc_weights, True),
}

# Define the output file path
final_output_file = pathlib.Path(output_dir) / "combined_z_matrix_gsea_results.parquet"

# Try to load the existing combined results DataFrame if it exists
try:
    combined_results_df = pd.read_parquet(final_output_file)
    print(f"Loaded existing combined results from {final_output_file}")
except FileNotFoundError:
    # If the file doesn't exist, initialize an empty DataFrame
    combined_results_df = pd.DataFrame()
    print("No existing file found. Initialized empty DataFrame.")

for num_components in latent_dims:
    for model_name in model_names:
        # Check if this model and latent dimension have already been processed
        already_processed = (
            not combined_results_df.empty
            and (
                (combined_results_df['model'] == model_name)
                & (combined_results_df['full_model_z'] == num_components)
            ).any()
        )
        if already_processed:
            print(f"Skipping {model_name} with {num_components} dimensions as it is already processed.")
            continue  # Skip to the next iteration if this combination is already present

        # Reset per iteration so the guard before GSEA can never pick up the
        # weight matrix left over from the previous model.
        z_matrix_df = None
        weight_matrix_df = None

        if model_name in ["pca", "ica", "nmf"]:
            # Sklearn models (PCA, ICA, NMF). All three are seeded the same way
            # so reruns give the same components.
            if model_name == "pca":
                model = PCA(n_components=num_components, random_state=0)
            elif model_name == "ica":
                model = FastICA(n_components=num_components, random_state=0)
            else:
                model = NMF(n_components=num_components, init='nndsvd', max_iter=1000, random_state=0)

            model.fit(train_data)
            # Only `model.components_` is needed; the former extra NMF
            # `transform` pass produced a matrix that extract_weights ignores.
            weight_matrix_df = extract_weights(model, model_name)

        elif model_name in VAE_PIPELINES:
            objective_fn, model_cls, make_optimizer, train_fn, weights_fn, takes_beta = VAE_PIPELINES[model_name]

            # Optuna hyper-parameter search for this VAE flavour.
            # NOTE(review): `train_tensor` is passed as both the second and the
            # third argument while `val_tensor` is never used — if the third
            # argument is the validation set, tuning scores on training data.
            study = optuna.create_study(direction="minimize")
            study.optimize(
                lambda trial: objective_fn(trial, train_tensor, train_tensor, train_data, latent_dim=num_components),
                n_trials=50,
            )

            # Retrain a fresh model with the best trial's hyper-parameters
            best_trial = study.best_trial
            model_kwargs = {"beta": best_trial.params['beta']} if takes_beta else {}
            model = model_cls(input_dim=train_data.shape[1], latent_dim=num_components, **model_kwargs)
            train_loader = DataLoader(TensorDataset(train_tensor), batch_size=best_trial.params['batch_size'], shuffle=True)
            optimizer = make_optimizer(best_trial.params['optimizer_type'], model.parameters(), best_trial.params['learning_rate'])
            train_fn(model, train_loader, optimizer, best_trial.params['epochs'])

            # Extract weight matrix; column 0 of the helper's output holds the gene labels
            weight_matrix_df = weights_fn(model, weight_data).rename(columns={0: 'genes'})

        # If weight_matrix is generated, proceed to GSEA and append to combined dataframe
        if weight_matrix_df is not None:
            print(f"Running GSEA for {model_name}")
            gsea_results_df = perform_gsea(weight_matrix_df, model_name, num_components)
            combined_results_df = pd.concat([combined_results_df, gsea_results_df], ignore_index=True)
            # Checkpoint after every finished combination: the skip logic above
            # resumes from this file, so an interrupted sweep keeps its progress.
            combined_results_df.to_parquet(final_output_file, index=False)
            

Loaded existing combined results from /home/juliacurd/gene_dependency_representations/4.gene_expression_signatures/results/combined_z_matrix_gsea_results.parquet
Skipping pca with 2 dimensions as it is already processed.
Skipping ica with 2 dimensions as it is already processed.
Skipping nmf with 2 dimensions as it is already processed.
Skipping vanillavae with 2 dimensions as it is already processed.
Skipping betavae with 2 dimensions as it is already processed.
Skipping betatcvae with 2 dimensions as it is already processed.
Skipping pca with 3 dimensions as it is already processed.
Skipping ica with 3 dimensions as it is already processed.
Skipping nmf with 3 dimensions as it is already processed.
Skipping vanillavae with 3 dimensions as it is already processed.
Skipping betavae with 3 dimensions as it is already processed.
Skipping betatcvae with 3 dimensions as it is already processed.
Skipping pca with 4 dimensions as it is already processed.
Skipping ica with 4 dimensions as it 



Running GSEA for ica
          z  full_model_z model  \
10        0           200   ica   
11        0           200   ica   
14        0           200   ica   
15        0           200   ica   
16        0           200   ica   
...     ...           ...   ...   
206756  199           200   ica   
206763  199           200   ica   
206770  199           200   ica   
206781  199           200   ica   
206783  199           200   ica   

                                         reactome pathway  gsea es score  \
10      Resolution Of AP Sites Via Multiple-Nucleotide...      -0.587210   
11      PCNA-Dependent Long Patch Base Excision Repair...      -0.587210   
14      Infection With Mycobacterium Tuberculosis R-HS...       0.717561   
15          Response Of Mtb To Phagocytosis R-HSA-9637690       0.717560   
16      Plasma Lipoprotein Assembly, Remodeling, And C...       0.673192   
...                                                   ...            ...   
206756  Defective Intrinsi



Running GSEA for nmf


[I 2024-09-20 11:24:46,848] A new study created in memory with name: no-name-7a995e52-991e-42d8-b057-e25c79ca6427


          z  full_model_z model  \
0         0           200   nmf   
1         0           200   nmf   
2         0           200   nmf   
4         0           200   nmf   
5         0           200   nmf   
...     ...           ...   ...   
206645  199           200   nmf   
206654  199           200   nmf   
206660  199           200   nmf   
206690  199           200   nmf   
206715  199           200   nmf   

                                         reactome pathway  gsea es score  \
0                 Signaling By CSF3 (G-CSF) R-HSA-9674555       0.682296   
1                      Signaling By SCF-KIT R-HSA-1433557       0.705609   
2       Inactivation Of CSF3 (G-CSF) Signaling R-HSA-9...       0.709191   
4                 FLT3 Signaling In Disease R-HSA-9682385       0.694062   
5             Interleukin-2 Family Signaling R-HSA-451927       0.690910   
...                                                   ...            ...   
206645          RHO GTPases Activate IQGAPs R-H

[I 2024-09-20 11:25:19,301] Trial 0 finished with value: 46.82982305270522 and parameters: {'learning_rate': 0.0017584585349813963, 'batch_size': 78, 'epochs': 776, 'optimizer_type': 'rmsprop'}. Best is trial 0 with value: 46.82982305270522.
[I 2024-09-20 11:25:47,335] Trial 1 finished with value: 46.72542888584422 and parameters: {'learning_rate': 0.004255308785424037, 'batch_size': 97, 'epochs': 621, 'optimizer_type': 'adam'}. Best is trial 1 with value: 46.72542888584422.
[I 2024-09-20 11:26:48,484] Trial 2 finished with value: 46.77761859039762 and parameters: {'learning_rate': 0.0011923529493946341, 'batch_size': 33, 'epochs': 806, 'optimizer_type': 'rmsprop'}. Best is trial 1 with value: 46.72542888584422.
[I 2024-09-20 11:27:14,930] Trial 3 finished with value: 46.86338920023904 and parameters: {'learning_rate': 0.0023892918163823003, 'batch_size': 94, 'epochs': 643, 'optimizer_type': 'rmsprop'}. Best is trial 1 with value: 46.72542888584422.
[I 2024-09-20 11:27:30,459] Trial 4 

Running GSEA for vanillavae


[I 2024-09-20 11:58:53,962] A new study created in memory with name: no-name-cbd38c39-4890-4b10-a787-6563a7aa4b14


          z  full_model_z       model  \
0         1           200  vanillavae   
1         1           200  vanillavae   
3         1           200  vanillavae   
13        1           200  vanillavae   
14        1           200  vanillavae   
...     ...           ...         ...   
206733  200           200  vanillavae   
206738  200           200  vanillavae   
206739  200           200  vanillavae   
206744  200           200  vanillavae   
206759  200           200  vanillavae   

                                         reactome pathway  gsea es score  \
0       Downregulation Of TGF-beta Receptor Signaling ...      -0.731404   
1       Translesion Synthesis By Y Family DNA Polymera...      -0.600198   
3          Eukaryotic Translation Termination R-HSA-72764      -0.880339   
13                               Methylation R-HSA-156581      -0.743081   
14      Estrogen-dependent Nuclear Events Downstream O...      -0.666458   
...                                                

[I 2024-09-20 11:59:12,373] Trial 0 finished with value: 47.58187711345616 and parameters: {'beta': 7.493976013844335, 'learning_rate': 0.003898074223506265, 'batch_size': 62, 'epochs': 417, 'optimizer_type': 'rmsprop'}. Best is trial 0 with value: 47.58187711345616.
[I 2024-09-20 12:00:03,609] Trial 1 finished with value: 47.564102400594685 and parameters: {'beta': 6.724034283118957, 'learning_rate': 0.00047403081387432293, 'batch_size': 46, 'epochs': 958, 'optimizer_type': 'rmsprop'}. Best is trial 1 with value: 47.564102400594685.
[I 2024-09-20 12:00:17,827] Trial 2 finished with value: 47.678801488164645 and parameters: {'beta': 5.943240146082806, 'learning_rate': 0.00413096225697889, 'batch_size': 40, 'epochs': 208, 'optimizer_type': 'adam'}. Best is trial 1 with value: 47.564102400594685.
[I 2024-09-20 12:00:40,027] Trial 3 finished with value: 47.58383574983967 and parameters: {'beta': 3.528588949400702, 'learning_rate': 0.004513367866099909, 'batch_size': 95, 'epochs': 583, 'op

Running GSEA for betavae


[I 2024-09-20 12:24:20,234] A new study created in memory with name: no-name-0bf9fd92-6d45-4375-a289-ff53a8e2b885


          z  full_model_z    model  \
0         1           200  betavae   
2         1           200  betavae   
4         1           200  betavae   
8         1           200  betavae   
16        1           200  betavae   
...     ...           ...      ...   
206683  200           200  betavae   
206690  200           200  betavae   
206696  200           200  betavae   
206707  200           200  betavae   
206753  200           200  betavae   

                                         reactome pathway  gsea es score  \
0                  Visual Phototransduction R-HSA-2187338      -0.753047   
2       VEGFR2 Mediated Vascular Permeability R-HSA-52...      -0.659029   
4       Regulation Of TP53 Activity Thru Acetylation R...      -0.612920   
8                   Cholesterol Biosynthesis R-HSA-191273       0.660623   
16        Downregulation Of ERBB2 Signaling R-HSA-8863795       0.615898   
...                                                   ...            ...   
206683  IRA

[I 2024-09-20 12:24:41,401] Trial 0 finished with value: 37.74495248367537 and parameters: {'beta': 1.9277921012058044, 'learning_rate': 7.776075823740862e-05, 'batch_size': 61, 'epochs': 250, 'optimizer_type': 'rmsprop'}. Best is trial 0 with value: 37.74495248367537.
[I 2024-09-20 12:25:15,570] Trial 1 finished with value: 48.99225673390858 and parameters: {'beta': 1.0462210055019918, 'learning_rate': 3.945288877182508e-05, 'batch_size': 61, 'epochs': 407, 'optimizer_type': 'rmsprop'}. Best is trial 0 with value: 37.74495248367537.
[I 2024-09-20 12:25:25,142] Trial 2 finished with value: 41.39392180940998 and parameters: {'beta': 1.6630818653084516, 'learning_rate': 9.556825880116263e-05, 'batch_size': 55, 'epochs': 111, 'optimizer_type': 'rmsprop'}. Best is trial 0 with value: 37.74495248367537.
[I 2024-09-20 12:27:16,565] Trial 3 finished with value: 39.06326047982743 and parameters: {'beta': 1.7220471187673403, 'learning_rate': 5.462179749812174e-05, 'batch_size': 76, 'epochs': 93

Running GSEA for betatcvae
          z  full_model_z      model  \
0         1           200  betatcvae   
1         1           200  betatcvae   
3         1           200  betatcvae   
8         1           200  betatcvae   
9         1           200  betatcvae   
...     ...           ...        ...   
206768  200           200  betatcvae   
206769  200           200  betatcvae   
206770  200           200  betatcvae   
206773  200           200  betatcvae   
206774  200           200  betatcvae   

                                         reactome pathway  gsea es score  \
0       Regulation Of TP53 Activity Thru Acetylation R...      -0.638290   
1       STAT3 Nuclear Events Downstream Of ALK Signali...      -0.793609   
3       SUMOylation Of Transcription Factors R-HSA-323...       0.674916   
8       Downregulation Of TGF-beta Receptor Signaling ...       0.619637   
9       Regulation Of IGF Transport And Uptake By IGFB...      -0.848614   
...                                 

In [37]:
# Persist the combined GSEA results as Parquet
final_output_file = output_dir.joinpath("combined_z_matrix_gsea_results.parquet")
combined_results_df.to_parquet(final_output_file, index=False)
print(f"Saved final combined z_matrix and GSEA results to {final_output_file}")

# Write a CSV copy as well so the results can be read from R
csv_output_file = output_dir.joinpath("combined_z_matrix_gsea_results.csv")
combined_results_df.to_csv(csv_output_file, index=False)

Saved final combined z_matrix and GSEA results to /home/juliacurd/gene_dependency_representations/4.gene_expression_signatures/results/combined_z_matrix_gsea_results.parquet


In [38]:
# Preview the 50 rows with the largest absolute enrichment score, strongest first
combined_results_df.sort_values(by='gsea es score', key=abs, ascending = False).head(50)

Unnamed: 0,z,full_model_z,model,reactome pathway,gsea es score,nes score,p value,shuffled
94309,21,40,betavae,Regulation Of IGF Transport And Uptake By IGFB...,0.982719,2.749145,0.005975,False
94308,21,40,betavae,Post-translational Protein Phosphorylation R-H...,0.982719,2.749145,0.005975,False
28586,5,14,vanillavae,Post-translational Protein Phosphorylation R-H...,0.976058,2.868438,0.004125,False
28587,5,14,vanillavae,Regulation Of IGF Transport And Uptake By IGFB...,0.976058,2.868438,0.004125,False
174148,26,70,betavae,Post-translational Protein Phosphorylation R-H...,-0.974218,-2.913465,0.003574,False
174149,26,70,betavae,Regulation Of IGF Transport And Uptake By IGFB...,-0.974218,-2.913465,0.003574,False
176386,62,70,betavae,Post-translational Protein Phosphorylation R-H...,-0.97098,-3.084574,0.002038,False
176387,62,70,betavae,Regulation Of IGF Transport And Uptake By IGFB...,-0.97098,-3.084574,0.002038,False
222392,2,90,nmf,Viral mRNA Translation R-HSA-192823,0.968566,3.166789,0.001541,False
56865,6,25,betavae,Regulation Of IGF Transport And Uptake By IGFB...,0.963643,2.771649,0.005577,False
