In [1]:
!pip install -q git+https://github.com/un-gcpds/python-gcpds.luker_multiple_annotators.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m392.9/392.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for gcpds_luker_multiple_annotators (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

from AutoTabVAE import TabNetVAE, TabularDataset, train_model, run_optuna

In [3]:
# 1. Generate synthetic data
N, D = 5000, 8         # N samples, D input features
output_dim = 5         # number of output regression targets

X = np.random.rand(N, D).astype(np.float32)
y = (
    np.sin(X[:, 0:1] * 6) +
    X[:, 1:2] * 3 +
    np.random.normal(0, 0.2, size=(N, output_dim))
).astype(np.float32)

# 2. Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# 3. Define a constrained search space for quick optimization
search_space = {
    "n_d": {"low": 8, "high": 16, "step": 8},
    "n_a": {"low": 8, "high": 16, "step": 8},
    "n_steps": {"low": 3, "high": 3},  # Fixed
    "latent_dim": {"low": 2, "high": 2},  # Fixed
    "gamma": {"low": 1.0, "high": 1.0},   # Fixed
    "lr": {"low": 1e-3, "high": 1e-3},    # Fixed
    "recon": {"low": 1.0, "high": 1.0},
    "kl": {"low": 1e-3, "high": 1e-3},
    "reg": {"low": 1.0, "high": 1.0},
    "sparse": {"low": 1e-3, "high": 1e-3},
    "batch_size": {"choices": [32]},
    "max_reg_neurons": {"choices": [32]},
    "num_reg_layers": {"low": 1, "high": 1}
}

# 4. Run a lightweight Optuna hyperparameter search
study = run_optuna(X_train, y_train, n_trials=2, param_config=search_space, train_settings={'epochs': 50})
print("Best hyperparameters:\n", study.best_params)

[I 2025-03-28 13:57:44,608] A new study created in memory with name: no-name-16402011-0023-4cb0-ad80-d50583c370fa
[I 2025-03-28 14:10:17,522] Trial 0 finished with value: 0.08181064829230308 and parameters: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0, 'latent_dim': 2, 'recon': 1.0, 'kl': 0.001, 'reg': 1.0, 'sparse': 0.001, 'lr': 0.001, 'batch_size': 32, 'max_reg_neurons': 32, 'num_reg_layers': 1}. Best is trial 0 with value: 0.08181064829230308.
[I 2025-03-28 14:22:32,331] Trial 1 finished with value: 0.08374994191527367 and parameters: {'n_d': 8, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0, 'latent_dim': 2, 'recon': 1.0, 'kl': 0.001, 'reg': 1.0, 'sparse': 0.001, 'lr': 0.001, 'batch_size': 32, 'max_reg_neurons': 32, 'num_reg_layers': 1}. Best is trial 0 with value: 0.08181064829230308.


Best hyperparameters:
 {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.0, 'latent_dim': 2, 'recon': 1.0, 'kl': 0.001, 'reg': 1.0, 'sparse': 0.001, 'lr': 0.001, 'batch_size': 32, 'max_reg_neurons': 32, 'num_reg_layers': 1}


In [5]:
# ================================
# FINAL MODEL TRAINING CONFIGURATION
# ================================

# 5. Extract best parameters from the study
best = study.best_params

# 6. Define the regression head structure
hidden_sizes = [
    best['max_reg_neurons'] // (2 ** i)
    for i in range(best['num_reg_layers'])
    if best['max_reg_neurons'] // (2 ** i) >= 8
]

# 7. Instantiate the final model
final_model = TabNetVAE(
    input_dim=X_train.shape[1],
    latent_dim=best['latent_dim'],
    output_dim=y_train.shape[1],
    n_d=best['n_d'],
    n_a=best['n_a'],
    n_steps=best['n_steps'],
    gamma=best['gamma'],
    hidden_sizes=hidden_sizes
)

# 8. Create data loaders
train_dataset = TabularDataset(X_train, y_train)
val_dataset = TabularDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=best['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best['batch_size'])

# 9. Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_config = {
    "epochs": 200,
    "lr": best['lr'],
    "patience": 20,
    "loss_weights": {
        "recon": best['recon'],
        "kl": best['kl'],
        "reg": best['reg'],
        "sparse": best['sparse']
    }
}

# 10. Train the final model
final_model, final_val_loss = train_model(final_model, train_loader, val_loader, train_config, device)

# 11. Evaluate the final model on the test set
final_model.eval()
with torch.no_grad():
    y_pred = []
    for x_batch, _ in val_loader:
        x_batch = x_batch.to(device)
        _, regression_output, *_ = final_model(x_batch)
        y_pred.append(regression_output.cpu())

    y_pred = torch.cat(y_pred, dim=0).numpy()

# 12. Final metric (MSE)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f"✅ Test MSE: {mse:.4f}")

✅ Test MSE: 0.0735
