## Import Libraries and Set Up Environment

In [1]:
# Import necessary libraries
import warnings
import torch
import numpy as np
import random
from torch.utils.data import Subset
from sklearn.model_selection import KFold
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, Timer
from pytorch_lightning import Trainer
from fukui_net.utils.utils import MoleculeDataModule, initialize_cuda, evaluate_model_full
from fukui_net.utils.train import MoleculeModel, Model, CrossValDataModule
from lion_pytorch import Lion

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Initialize CUDA settings
initialize_cuda()

# Suppress unnecessary warnings
warnings.filterwarnings("ignore", category=UserWarning, module="pytorch_lightning.trainer.connectors.data_connector")
warnings.filterwarnings("ignore", category=UserWarning, module="lightning_fabric.plugins.environments.slurm")

# Ensure deterministic behavior in CUDA operations
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

cuda True
NVIDIA GeForce RTX 3090


### Load Dataset and Define Model Parameters

In [2]:
# Load dataset
dataset = torch.load(f'../data/processed/QM_137k.pt')
in_features = dataset[0].x.shape[1]
out_features = 1
edge_attr_dim = dataset[0].edge_attr.shape[1]

# Data module settings
batch_size = 1024 
num_workers = 8  

# Define the model architecture parameters
preprocess_hidden_features = [128] * 9
postprocess_hidden_features = [128, 128]
cheb_hidden_features = [128, 128]
K = [10, 16]
cheb_normalization = ['sym', 'sym']

dropout_rates = [0.0] * (len(preprocess_hidden_features) + len(postprocess_hidden_features))
activation_fns = [torch.nn.PReLU] * (len(preprocess_hidden_features) + len(postprocess_hidden_features))
use_batch_norm = [True] * (len(preprocess_hidden_features) + len(postprocess_hidden_features))

# Optimizer settings
optimizer_class = Lion
learning_rate = 2.2e-5
weight_decay = 3e-5
step_size = 80
gamma = 0.2
metric = 'rmse'

## Instantiate Model Backbone

In [3]:
# Instantiate model backbone
backbone = Model(
    atom_in_features=in_features,
    edge_attr_dim=edge_attr_dim,
    preprocess_hidden_features=preprocess_hidden_features,
    cheb_hidden_features=cheb_hidden_features,
    K=K,
    cheb_normalizations=cheb_normalization,
    dropout_rates=dropout_rates,
    activation_fns=activation_fns,
    use_batch_norm=use_batch_norm,
    postprocess_hidden_features=postprocess_hidden_features,
    out_features=out_features
)

In [6]:
print("Model Backbone Architecture:\n", backbone)

Model Backbone Architecture:
 Model(
  (atom_preprocess): ModuleList(
    (0): AtomEdgeInteraction(
      (interaction): KANLinear(
        (base_activation): SiLU()
      )
      (activation): ReLU()
      (batch_norm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout): Dropout(p=0.0, inplace=False)
      (residual): Linear(in_features=133, out_features=128, bias=True)
    )
    (1-8): 8 x Sequential(
      (0): KANLinear(
        (base_activation): SiLU()
      )
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): PReLU(num_parameters=1)
      (3): Dropout(p=0.0, inplace=False)
    )
  )
  (cheb_convolutions): ModuleList(
    (0): ChebConv(128, 128, K=10, normalization=sym)
    (1): ChebConv(128, 128, K=16, normalization=sym)
  )
  (postprocess): ModuleList(
    (0-1): 2 x Sequential(
      (0): KANLinear(
        (base_activation): SiLU()
      )
      (1): BatchNorm1d(128, eps=1e-05, m

## K-Fold cross-validation setup

In [4]:
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
data_indices = list(range(len(dataset)))

fold_results = []
best_val_loss = float('inf')
best_model = None

## Training Loop for Cross-Validation

In [5]:
# Perform K-Fold cross-validation
for fold, (train_index, val_index) in enumerate(kf.split(data_indices)):
    print(f"Fold {fold+1}/{n_splits}")

    # Create data subsets for this fold
    train_subset = Subset(dataset, train_index)
    val_subset = Subset(dataset, val_index)
    data_module = CrossValDataModule(train_subset, val_subset, batch_size=batch_size, num_workers=num_workers)

    # Initialize model for this fold
    model = MoleculeModel(
        model_backbone=backbone,
        optimizer_class=optimizer_class,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        step_size=step_size,
        gamma=gamma,
        batch_size=batch_size,
        metric=metric
    )

    # Callbacks for model checkpointing and early stopping
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1, verbose=True, dirpath='.', filename='best_model')
    early_stop_callback = EarlyStopping(monitor='val_loss', patience=5, verbose=True, mode='min')
    timer = Timer()
    logger = pl.loggers.TensorBoardLogger('../reports/tb_logs', name=f'KAN_fold_{fold+1}')

    # Trainer configuration
    trainer = Trainer(
        max_epochs=100,
        enable_checkpointing=True,
        callbacks=[early_stop_callback, timer, checkpoint_callback],
        enable_progress_bar=False,
        logger=logger,
        accelerator='gpu',
        devices=1
    )

    # Train the model
    trainer.fit(model, data_module)

    # Get validation loss
    val_loss = trainer.callback_metrics["val_loss"].item()
    fold_results.append(val_loss)

    # Save the best model based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model


    seconds = timer.time_elapsed()
    h, m, s = int(seconds // 3600), int((seconds % 3600) // 60), int(seconds % 60)
    print(f"Training time for fold {fold+1}: {h}:{m:02d}:{s:02d}")

Fold 1/5


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: tb_logs/KAN_fold_1
/home/nikolenko/.local/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/nikolenko/work/fukui_index_prediction/notebooks exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type  | Params
-----------------------------------------
0 | model_backbone | Model | 2.3 M 
-----------------------------------------
2.3 M     Trainable params
0         N

KeyError: 'val_loss'

## Cross-Validation Results and Final Model Save

In [None]:
# Print cross-validation results
print("Cross-validation results:")
for i, val_loss in enumerate(fold_results):
    print(f"Fold {i+1}: Validation error (RMSE) = {val_loss:.4f}")

mean_val_loss = sum(fold_results) / len(fold_results)
print(f"Average validation error (RMSE): {mean_val_loss:.4f}")

# Save the final best model

torch.save("../model/final_best_model.ckpt")
print(f"Final best model saved!")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: tb_logs/KAN_fold_1
/home/nikolenko/.local/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /home/nikolenko/work/gat exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type  | Params
-----------------------------------------
0 | model_backbone | Model | 2.3 M 
-----------------------------------------
2.3 M     Trainable params
0         Non-trainable params
2.3 M     Total params
9.096     Total estimated model params size (MB)


Fold 1/5


/home/nikolenko/.local/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
Metric val_loss improved. New best score: 0.205
Epoch 0, global step 1: 'val_loss' reached 0.20457 (best 0.20457), saving model to '/home/nikolenko/work/gat/best_model.ckpt' as top 1
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.204
Epoch 1, global step 2: 'val_loss' reached 0.20401 (best 0.20401), saving model to '/home/nikolenko/work/gat/best_model.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.204
Epoch 2, global step 3: 'val_loss' reached 0.20357 (best 0.20357), saving model to '/home/nikolenko/work/gat/best_model.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.203
Epoch 3, global step 4: 'val_loss' 

Время обучения fold 1: 0:02:34
Fold 2/5


Metric val_loss improved. New best score: 0.175
Epoch 0, global step 1: 'val_loss' reached 0.17494 (best 0.17494), saving model to '/home/nikolenko/work/gat/best_model-v1.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.175
Epoch 1, global step 2: 'val_loss' reached 0.17484 (best 0.17484), saving model to '/home/nikolenko/work/gat/best_model-v1.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.175
Epoch 2, global step 3: 'val_loss' reached 0.17479 (best 0.17479), saving model to '/home/nikolenko/work/gat/best_model-v1.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.175
Epoch 3, global step 4: 'val_loss' reached 0.17468 (best 0.17468), saving model to '/home/nikolenko/work/gat/best_model-v1.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.175
Epoch 4, global step 5: 'val_loss' reached 0.17461 (best 0.17461), saving model to '/home/nikole

Время обучения fold 2: 0:02:42
Fold 3/5


Metric val_loss improved. New best score: 0.180
Epoch 0, global step 1: 'val_loss' reached 0.17987 (best 0.17987), saving model to '/home/nikolenko/work/gat/best_model-v2.ckpt' as top 1
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.179
Epoch 1, global step 2: 'val_loss' reached 0.17925 (best 0.17925), saving model to '/home/nikolenko/work/gat/best_model-v2.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.179
Epoch 2, global step 3: 'val_loss' reached 0.17905 (best 0.17905), saving model to '/home/nikolenko/work/gat/best_model-v2.ckpt' as top 1
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.178
Epoch 3, global step 4: 'val_loss' reached 0.17835 (best 0.17835), saving model to '/home/nikolenko/work/gat/best_model-v2.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.178
Epoch 4, global step 5: 'val_loss' reached 0.17798 (best 0.17798), saving model to '/home/nikole

Время обучения fold 3: 0:02:32
Fold 4/5


Metric val_loss improved. New best score: 0.095
Epoch 0, global step 1: 'val_loss' reached 0.09542 (best 0.09542), saving model to '/home/nikolenko/work/gat/best_model-v3.ckpt' as top 1
Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 0.095
Epoch 1, global step 2: 'val_loss' reached 0.09518 (best 0.09518), saving model to '/home/nikolenko/work/gat/best_model-v3.ckpt' as top 1
Epoch 2, global step 3: 'val_loss' was not in top 1
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.094
Epoch 3, global step 4: 'val_loss' reached 0.09435 (best 0.09435), saving model to '/home/nikolenko/work/gat/best_model-v3.ckpt' as top 1
Epoch 4, global step 5: 'val_loss' was not in top 1
Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.094
Epoch 5, global step 6: 'val_loss' reached 0.09359 (best 0.09359), saving model to '/home/nikolenko/work/gat/best_model-v3.ckpt' as top 1
Epoch 6, global step 7: 'val_loss' was not in top 1
Metric val_loss 

Время обучения fold 4: 0:01:52
Fold 5/5


Metric val_loss improved. New best score: 0.080
Epoch 0, global step 1: 'val_loss' reached 0.07979 (best 0.07979), saving model to '/home/nikolenko/work/gat/best_model-v4.ckpt' as top 1
Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.072
Epoch 1, global step 2: 'val_loss' reached 0.07229 (best 0.07229), saving model to '/home/nikolenko/work/gat/best_model-v4.ckpt' as top 1
Epoch 2, global step 3: 'val_loss' was not in top 1
Epoch 3, global step 4: 'val_loss' was not in top 1
Epoch 4, global step 5: 'val_loss' was not in top 1
Epoch 5, global step 6: 'val_loss' was not in top 1
Monitored metric val_loss did not improve in the last 5 records. Best score: 0.072. Signaling Trainer to stop.
Epoch 6, global step 7: 'val_loss' was not in top 1


Время обучения fold 5: 0:00:08
Результаты кросс-валидации:
Fold 1: Валид. ошибка (RMSE) = 0.1944
Fold 2: Валид. ошибка (RMSE) = 0.1576
Fold 3: Валид. ошибка (RMSE) = 0.1014
Fold 4: Валид. ошибка (RMSE) = 0.0707
Fold 5: Валид. ошибка (RMSE) = 0.0763
Средняя валид. ошибка (RMSE): 0.1201
Лучший чекпоинт модели сохранен по пути: /home/nikolenko/work/gat/best_model-v3.ckpt


## Evaluate the Final Model

In [None]:
# Evaluate the final model on the entire dataset
evaluate_model_full(best_model, dataset, batch_size, num_workers)