In [1]:

import pathlib 
import optuna
import pandas as pd
import joblib
import torch
import sys

from sklearn.decomposition import PCA, FastICA, NMF
from torch.utils.data import DataLoader, TensorDataset

# Put the VAE utilities directory at the front of sys.path so the modules
# below can be imported by bare name. The relative path is resolved against
# the current working directory, so the notebook must be run from its own folder.
script_directory = pathlib.Path("../2.train-VAE/utils/").resolve()
sys.path.insert(0, str(script_directory))
from betavae import BetaVAE, train_vae
from betatcvae import BetaTCVAE, train_tc_vae
from vanillavae import VanillaVAE, train_vvae
from optimize_utils import get_optimize_args, objective, get_optimizer
from optimize_utils_tcvae import get_optimize_args_tc, objective_tc, get_optimizer_tc
from optimize_utils_vvae import get_optimize_args_vvae, objective_vvae, get_optimizer_vvae

# script_directory is reused here: it now points at the shared utils directory,
# which is likewise pushed to the front of sys.path for the data-loading helpers.
script_directory = pathlib.Path("../utils/").resolve()
sys.path.insert(0, str(script_directory))
from data_loader import load_train_test_data, load_model_data

In [2]:
# Fetch the argument sets exposed by each optimisation helper module
args = get_optimize_args()
tc_args = get_optimize_args_tc()
vvae_args = get_optimize_args_vvae()

# Absolute location of the downloaded data
data_directory = pathlib.Path("../0.data-download/data").resolve()

# Request every split plus the gene statistics, with zero-one normalisation enabled
train_df, test_df, val_df, load_gene_stats = load_train_test_data(
    data_directory,
    train_or_test="all",
    load_gene_stats=True,
    zero_one_normalize=True,
)
train_data = pd.DataFrame(train_df)

# Dependency matrix and gene dictionary live alongside the splits
dependency_file = (data_directory / "CRISPRGeneEffect.parquet").resolve()
gene_dict_file = (data_directory / "CRISPR_gene_dictionary.parquet").resolve()
dependency_df, gene_dict_df = load_model_data(dependency_file, gene_dict_file)
gene_dict_df = pd.DataFrame(gene_dict_df)

# Preview the training frame as the cell output
train_data.head()

(1150, 18444)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2708,2709,2710,2711,2712,2713,2714,2715,2716,2717
0,0.856396,0.52413,0.454598,0.477647,0.290616,0.493821,0.509459,0.470619,0.678614,0.764572,...,0.546061,0.42449,0.630406,0.43784,0.612547,0.100948,0.727655,0.567081,0.550219,0.857448
1,0.602398,0.458392,0.422174,0.66784,0.317417,0.866981,0.645869,0.469371,0.569442,0.691008,...,0.820237,0.297817,0.611111,0.461061,0.726131,0.620933,0.59627,0.571577,0.439725,0.699469
2,0.470072,0.441811,0.333899,0.622735,0.623862,0.423447,0.621539,0.422418,0.666295,0.836754,...,0.356129,0.108649,0.706207,0.530195,0.706685,0.37708,0.541181,0.475121,0.520993,0.766563
3,0.696976,0.05809,0.218041,0.641029,0.416265,0.749956,0.584399,0.235217,0.619033,0.760014,...,0.600717,0.491875,0.557667,0.263071,0.55638,0.458748,0.52511,0.285958,0.020888,0.800589
4,0.636979,0.446745,0.601733,0.748012,0.387747,0.696226,0.635404,0.700367,0.569001,0.812043,...,0.475938,0.485309,0.678319,0.476837,0.473643,0.513316,0.658391,0.644729,0.414735,0.462667


In [3]:
# Build one float32 tensor per split (train, test, validation)
train_tensor, test_tensor, val_tensor = (
    torch.tensor(split, dtype=torch.float32)
    for split in (train_df, test_df, val_df)
)

In [6]:
# Directory where models will be saved
model_save_dir = pathlib.Path("saved_models")
model_save_dir.mkdir(parents=True, exist_ok=True)

# Tunable settings for the sweep
RANDOM_SEED = 0  # same value the NMF model was already seeded with
N_TRIALS = 50  # Optuna trials per (VAE variant, latent dimension) pair
# Set to True to resume an interrupted sweep: combinations whose model file
# already exists are skipped instead of being retrained and overwritten.
SKIP_EXISTING = False

# Define the optimization process for the models
latent_dims = [30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 150, 200]
model_names = ["pca", "ica", "nmf", "vanillavae", "betavae", "betatcvae"]

# Linear decompositions: name -> factory taking the number of components.
# All three share RANDOM_SEED so repeated runs give the same fitted models.
linear_model_factories = {
    "pca": lambda k: PCA(n_components=k, random_state=RANDOM_SEED),
    "ica": lambda k: FastICA(n_components=k, random_state=RANDOM_SEED),
    "nmf": lambda k: NMF(n_components=k, init='nndsvd', max_iter=2000, random_state=RANDOM_SEED),
}

# VAE variants: name -> (model class, Optuna objective, optimizer factory,
# training function, whether the model constructor takes a tuned `beta`).
vae_specs = {
    "vanillavae": (VanillaVAE, objective_vvae, get_optimizer_vvae, train_vvae, False),
    "betavae": (BetaVAE, objective, get_optimizer, train_vae, True),
    "betatcvae": (BetaTCVAE, objective_tc, get_optimizer_tc, train_tc_vae, True),
}

for num_components in latent_dims:
    for model_name in model_names:
        model = None  # Stays None if model_name matches no known model type
        model_filename = model_save_dir / f"{model_name}_{num_components}_components_model.joblib"

        if SKIP_EXISTING and model_filename.exists():
            print(f"Skipping {model_name} with {num_components} components; {model_filename} already exists")
            continue

        if model_name in linear_model_factories:
            # Fit the decomposition directly on the training data
            model = linear_model_factories[model_name](num_components)
            model.fit(train_data)

        elif model_name in vae_specs:
            model_cls, objective_fn, optimizer_fn, train_fn, uses_beta = vae_specs[model_name]

            # Seed torch and the Optuna sampler so the search and the final
            # training run are repeatable for every (variant, dimension) pair.
            torch.manual_seed(RANDOM_SEED)
            study = optuna.create_study(
                direction="minimize",
                sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
            )
            # NOTE(review): train_tensor is passed for both tensor arguments,
            # exactly as before. If the second argument is meant to be a
            # held-out set, val_tensor is probably what belongs there —
            # confirm against the objective's signature.
            study.optimize(
                lambda trial: objective_fn(
                    trial, train_tensor, train_tensor, train_data, latent_dim=num_components
                ),
                n_trials=N_TRIALS,
            )
            best_trial = study.best_trial

            # Rebuild the model with the best hyperparameters and train it once more
            model_kwargs = {"input_dim": train_data.shape[1], "latent_dim": num_components}
            if uses_beta:
                model_kwargs["beta"] = best_trial.params['beta']
            model = model_cls(**model_kwargs)

            train_loader = DataLoader(
                TensorDataset(train_tensor),
                batch_size=best_trial.params['batch_size'],
                shuffle=True,
            )
            optimizer = optimizer_fn(
                best_trial.params['optimizer_type'],
                model.parameters(),
                best_trial.params['learning_rate'],
            )
            train_fn(model, train_loader, optimizer, best_trial.params['epochs'])

        # Save the trained model with joblib; compare against None explicitly
        # rather than relying on the truthiness of the model object.
        if model is not None:
            joblib.dump(model, model_filename)
            print(f"Saved {model_name} with {num_components} components to {model_filename}")

Saved pca with 30 components to saved_models/pca_30_components_model.joblib
Saved ica with 30 components to saved_models/ica_30_components_model.joblib


[I 2024-10-24 12:02:24,859] A new study created in memory with name: no-name-cd113957-9ac7-4b8f-96d0-e70727b2a9d6


Saved nmf with 30 components to saved_models/nmf_30_components_model.joblib


[I 2024-10-24 12:02:32,977] Trial 0 finished with value: 46.76656006770347 and parameters: {'learning_rate': 0.002037115081288579, 'batch_size': 44, 'epochs': 250, 'optimizer_type': 'rmsprop'}. Best is trial 0 with value: 46.76656006770347.
[I 2024-10-24 12:03:26,446] Trial 1 finished with value: 46.60639256719333 and parameters: {'learning_rate': 0.0033940207615709194, 'batch_size': 22, 'epochs': 978, 'optimizer_type': 'adam'}. Best is trial 1 with value: 46.60639256719333.
[I 2024-10-24 12:03:26,660] Trial 2 finished with value: 105.85518015391791 and parameters: {'learning_rate': 0.0021227532729113127, 'batch_size': 102, 'epochs': 11, 'optimizer_type': 'rmsprop'}. Best is trial 1 with value: 46.60639256719333.
[I 2024-10-24 12:03:51,778] Trial 3 finished with value: 46.50505407532649 and parameters: {'learning_rate': 0.004779842644069216, 'batch_size': 46, 'epochs': 828, 'optimizer_type': 'adam'}. Best is trial 3 with value: 46.50505407532649.
[I 2024-10-24 12:04:09,752] Trial 4 fin

Saved vanillavae with 30 components to saved_models/vanillavae_30_components_model.joblib


[I 2024-10-24 12:24:29,399] Trial 0 finished with value: 206.3187937266791 and parameters: {'beta': 8.561880508243696, 'learning_rate': 0.0002155596378475613, 'batch_size': 90, 'epochs': 59, 'optimizer_type': 'adam'}. Best is trial 0 with value: 206.3187937266791.
[I 2024-10-24 12:24:36,746] Trial 1 finished with value: 118.16895770172574 and parameters: {'beta': 7.358399409359441, 'learning_rate': 0.00013275760309964762, 'batch_size': 107, 'epochs': 399, 'optimizer_type': 'adam'}. Best is trial 1 with value: 118.16895770172574.
[I 2024-10-24 12:24:37,114] Trial 2 finished with value: 203.3394922968167 and parameters: {'beta': 6.244974939811752, 'learning_rate': 0.0006161786225859116, 'batch_size': 55, 'epochs': 14, 'optimizer_type': 'rmsprop'}. Best is trial 1 with value: 118.16895770172574.
[I 2024-10-24 12:24:44,498] Trial 3 finished with value: 47.42822119869403 and parameters: {'beta': 2.6746032167830123, 'learning_rate': 0.004598859010745316, 'batch_size': 89, 'epochs': 390, 'opt

KeyboardInterrupt: 