## Dimensionality Reduction and Variational Autoencoder Optimization Pipeline

This notebook performs dimensionality reduction and optimizes Variational Autoencoder (VAE) models
using Optuna. It supports multiple techniques including PCA, ICA, NMF, VanillaVAE, BetaVAE, and BetaTCVAE.
The pipeline processes gene expression data, fits the models for a range of latent dimensions and initializations,
and saves the trained models for future analysis.

In [1]:
import pathlib 
import optuna
import pandas as pd
import torch
import sys
import random

from sklearn.decomposition import PCA, FastICA, NMF
from torch.utils.data import DataLoader, TensorDataset

script_directory = pathlib.Path("./scripts")
sys.path.insert(0, str(script_directory))
from utils import save_model, set_random_seed

script_directory = pathlib.Path("../2.train-VAE/utils/").resolve()
sys.path.insert(0, str(script_directory))
from betavae import BetaVAE, train_vae, evaluate_vae
from betatcvae import BetaTCVAE, train_tc_vae, evaluate_tc_vae
from vanillavae import VanillaVAE, train_vvae, evaluate_vvae
from optimize_utils import get_optimize_args, objective, get_optimizer
from optimize_utils_tcvae import get_optimize_args_tc, objective_tc, get_optimizer_tc
from optimize_utils_vvae import get_optimize_args_vvae, objective_vvae, get_optimizer_vvae

script_directory = pathlib.Path("../utils/").resolve()
sys.path.insert(0, str(script_directory))
from data_loader import load_train_test_data, load_model_data

In [2]:
# Parse the argument sets exposed by each VAE optimisation helper module
# (BetaVAE, BetaTCVAE and VanillaVAE respectively).
args = get_optimize_args()
tc_args = get_optimize_args_tc()
vvae_args = get_optimize_args_vvae()

# Everything below is read from the data-download step's output directory.
data_directory = pathlib.Path("../0.data-download/data").resolve()

# Request all splits plus gene statistics, with zero-one normalisation enabled.
train_df, test_df, val_df, load_gene_stats = load_train_test_data(
    data_directory,
    train_or_test="all",
    load_gene_stats=True,
    zero_one_normalize=True,
)

# DataFrame view of the training split; this is what the sklearn models are fit on.
train_data = pd.DataFrame(train_df)

# Gene-effect matrix and the gene dictionary that accompanies it.
dependency_file = (data_directory / "CRISPRGeneEffect.parquet").resolve()
gene_dict_file = (data_directory / "CRISPR_gene_dictionary.parquet").resolve()
dependency_df, gene_dict_df = load_model_data(dependency_file, gene_dict_file)
gene_dict_df = pd.DataFrame(gene_dict_df)

# Preview the training data as the cell output.
train_data.head()

(1150, 18444)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2708,2709,2710,2711,2712,2713,2714,2715,2716,2717
0,0.856396,0.52413,0.454598,0.477647,0.290616,0.493821,0.509459,0.470619,0.678614,0.764572,...,0.546061,0.42449,0.630406,0.43784,0.612547,0.100948,0.727655,0.567081,0.550219,0.857448
1,0.602398,0.458392,0.422174,0.66784,0.317417,0.866981,0.645869,0.469371,0.569442,0.691008,...,0.820237,0.297817,0.611111,0.461061,0.726131,0.620933,0.59627,0.571577,0.439725,0.699469
2,0.470072,0.441811,0.333899,0.622735,0.623862,0.423447,0.621539,0.422418,0.666295,0.836754,...,0.356129,0.108649,0.706207,0.530195,0.706685,0.37708,0.541181,0.475121,0.520993,0.766563
3,0.696976,0.05809,0.218041,0.641029,0.416265,0.749956,0.584399,0.235217,0.619033,0.760014,...,0.600717,0.491875,0.557667,0.263071,0.55638,0.458748,0.52511,0.285958,0.020888,0.800589
4,0.636979,0.446745,0.601733,0.748012,0.387747,0.696226,0.635404,0.700367,0.569001,0.812043,...,0.475938,0.485309,0.678319,0.476837,0.473643,0.513316,0.658391,0.644729,0.414735,0.462667


In [3]:
# Build one float32 tensor per data split (train / test / validation).
train_tensor, test_tensor, val_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (train_df, test_df, val_df)
)

In [4]:
# Trained models are written here; create the folder if it is missing.
model_save_dir = pathlib.Path("saved_models")
model_save_dir.mkdir(parents=True, exist_ok=True)

# Latent dimensionalities to sweep: dense at the low end, progressively coarser.
latent_dims = [
    *range(2, 11),        # 2..10 in steps of 1
    *range(12, 21, 2),    # 12..20 in steps of 2
    *range(25, 51, 5),    # 25..50 in steps of 5
    *range(60, 101, 10),  # 60..100 in steps of 10
    150,
    200,
]

# Models fitted for every latent dimensionality.
model_names = ["pca", "ica", "nmf", "vanillavae", "betavae", "betatcvae"]

# Draw five 32-bit seeds, one per VAE initialization, and echo them so the
# run can be traced back to the seeds it used.
max_seed = 2**32 - 1
initialization_seeds = []
for _ in range(5):
    initialization_seeds.append(random.randint(0, max_seed))
print(f"Generated seeds: {initialization_seeds}")

# Number of Optuna trials run for every (VAE model, latent dimension, seed) study.
N_TRIALS = 50

# Seed given to the sklearn decompositions. It is also the value recorded as
# `seed` when those models are saved, so the saved metadata matches the fit.
SKLEARN_SEED = 0

# sklearn model name -> factory taking the number of components.
# PCA and FastICA receive the same random_state as NMF: FastICA's initialisation
# and PCA's randomized SVD solver are stochastic, so without it the saved
# "seed=0" would not actually reproduce the model.
sklearn_factories = {
    "pca": lambda k: PCA(n_components=k, random_state=SKLEARN_SEED),
    "ica": lambda k: FastICA(n_components=k, random_state=SKLEARN_SEED),
    "nmf": lambda k: NMF(
        n_components=k, init='nndsvd', max_iter=2000, random_state=SKLEARN_SEED
    ),
}

# VAE model name -> (model class, Optuna objective, optimizer factory,
#                    training function, whether the constructor takes `beta`).
vae_specs = {
    "betavae": (BetaVAE, objective, get_optimizer, train_vae, True),
    "betatcvae": (BetaTCVAE, objective_tc, get_optimizer_tc, train_tc_vae, True),
    "vanillavae": (VanillaVAE, objective_vvae, get_optimizer_vvae, train_vvae, False),
}

for num_components in latent_dims:
    for model_name in model_names:
        if model_name in sklearn_factories:
            # PCA, ICA and NMF are fit once per latent dimensionality.
            model = sklearn_factories[model_name](num_components)
            model.fit(train_data)

            # Save the trained model (single initialization)
            save_model(
                trial="non-optuna",
                model=model,
                directory=model_save_dir,
                modelname=model_name,
                latent_dims=num_components,
                init=0,             # Single initialization for non-VAE models
                seed=SKLEARN_SEED,  # random_state the model was fit with
            )

        elif model_name in vae_specs:
            model_cls, objective_fn, make_optimizer, train_fn, uses_beta = vae_specs[model_name]

            # VAEs are tuned and retrained once per initialization seed.
            for init_idx, init_seed in enumerate(initialization_seeds):
                set_random_seed(init_seed)

                # Seed the sampler explicitly: Optuna's default sampler draws its
                # own entropy, so the hyperparameter search would otherwise not be
                # reproducible from the seed recorded alongside the model.
                study = optuna.create_study(
                    direction="minimize",
                    sampler=optuna.samplers.TPESampler(seed=init_seed),
                )
                # NOTE(review): train_tensor fills both tensor arguments of the
                # objective, exactly as before. If the second one is meant to be
                # held-out data, val_tensor is probably what belongs here --
                # confirm against the objective's signature before changing.
                study.optimize(
                    lambda trial: objective_fn(
                        trial, train_tensor, train_tensor, train_data,
                        latent_dim=num_components,
                    ),
                    n_trials=N_TRIALS,
                )

                # Rebuild the model from the best trial's hyperparameters and
                # train it on the full training tensor.
                best_trial = study.best_trial
                params = best_trial.params

                model_kwargs = {
                    "input_dim": train_data.shape[1],
                    "latent_dim": num_components,
                }
                if uses_beta:
                    model_kwargs["beta"] = params['beta']
                model = model_cls(**model_kwargs)

                train_loader = DataLoader(
                    TensorDataset(train_tensor),
                    batch_size=params['batch_size'],
                    shuffle=True,
                )
                optimizer = make_optimizer(
                    params['optimizer_type'], model.parameters(), params['learning_rate']
                )
                train_fn(model, train_loader, optimizer, params['epochs'])

                # Save the trained model (multiple initializations)
                save_model(
                    trial=best_trial,
                    model=model,
                    directory=model_save_dir,
                    modelname=model_name,
                    latent_dims=num_components,
                    init=init_idx,
                    seed=init_seed,
                )


Generated seeds: [1435653907, 1599180430, 2627152240, 2893110282, 2091252109]


[I 2024-12-16 13:43:56,733] A new study created in memory with name: no-name-d41facfa-a0d0-474c-a3f3-2ed6940f8422
[W 2024-12-16 13:44:03,564] Trial 0 failed with parameters: {'learning_rate': 0.004736153780857164, 'batch_size': 76, 'epochs': 709, 'optimizer_type': 'rmsprop'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/juliacurd/anaconda3/envs/gene_dependency_representations/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_237274/2395119748.py", line 62, in <lambda>
    lambda trial: objective_vvae(
                  ^^^^^^^^^^^^^^^
  File "/home/juliacurd/gene_dependency_representations/2.train-VAE/utils/optimize_utils_vvae.py", line 154, in objective_vvae
    train_vvae(model, train_loader, optimizer, epochs=epochs)
  File "/home/juliacurd/gene_dependency_representations/2.train-VAE/utils/vanillavae.py", li

KeyboardInterrupt: 