# Pretrain & PI Finetuning Suite

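This notebook orchestrates 20 randomized incremental pretrain runs on non-PI polymers followed by PI-property finetuning with a frozen shared encoder. It combines the continual-task recipes from `dynamic_task_finetuning_demo.ipynb` and `dynamic_task_incremental_finetuning.ipynb`. Note that the configuration cell below is currently set to a reduced smoke-test setup (`NUM_PRETRAIN_RUNS = 2`, 10 epochs per stage, three pretrain tasks and three finetune tasks); restore the commented-out values and task names there to run the full experiment described here.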


## Data Overview

- **Descriptors**: `data/amorphous_polymer_FFDescriptor_20250730.parquet`
- **Non-PI properties**: `data/amorphous_polymer_non_PI_properties_20250730.parquet`
- **PI properties**: `data/amorphous_polymer_PI_properties_20250730.parquet`
- Pretrain tasks: 15 properties (density through thermal_diffusivity) sampled in random order per run
- PI finetune tasks: density, Rg, r2, self-diffusion, Cp, Cv, linear_expansion, refractive_index, tg


In [4]:
import json
import math
import random
import re
from pathlib import Path
from typing import Any

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
from loguru import logger as fm_logger

from foundation_model.data.datamodule import CompoundDataModule
from foundation_model.models.flexible_multi_task_model import FlexibleMultiTaskModel
from foundation_model.models.model_config import OptimizerConfig, RegressionTaskConfig, TaskType


[32m2025-10-31 10:10:53.632[0m | [1mINFO    [0m | [36m__init__[0m:[36m<module>[0m:[36m34[0m - [1mLoguru logger initialized for foundation_model package.[0m


In [5]:
# --- Input files -------------------------------------------------------------
# All inputs live in one directory, resolved relative to the notebook location.
DATA_DIR = Path("../data")
DESCRIPTOR_PATH = DATA_DIR.joinpath("amorphous_polymer_FFDescriptor_20250730.parquet")
NON_PI_PATH = DATA_DIR.joinpath("amorphous_polymer_non_PI_properties_20250730.parquet")
PI_PATH = DATA_DIR.joinpath("amorphous_polymer_PI_properties_20250730.parquet")
SCALER_PATH = DATA_DIR.joinpath("amorphous_polymer_properties_scaler_20250730.pkl.z")

# --- Behaviour switches ------------------------------------------------------
USE_NORMALIZED_TARGETS = False  # True -> train on the "<property>(normalized)" columns
FINETUNE_FREEZE_SHARED = True  # forwarded as `freeze_shared_encoder` when finetuning
QUIET_MODEL_LOGGING = True  # True -> disable loguru output of the foundation_model package

# --- Task selection ----------------------------------------------------------
# Entries that are commented out are currently disabled; uncomment to enable them.
PRETRAIN_TASK_NAMES = [
    "density",
    "Rg",
    "r2",
    # "self-diffusion",
    # "Cp",
    # "Cv",
    # "bulk_modulus",
    # "volume_expansion",
    # "linear_expansion",
    # "static_dielectric_const",
    # "dielectric_const_dc",
    # "refractive_index",
    # "tg",
    # "thermal_conductivity",
    # "thermal_diffusivity",
]
FINETUNE_TASK_NAMES = [
    # "density",
    # "Rg",
    # "r2",
    # "self-diffusion",
    # "Cp",
    # "Cv",
    "linear_expansion",
    "refractive_index",
    "tg",
]

# Sorted union of every property used by either phase (duplicates removed).
LOWER_CASE_PROPERTIES = sorted({*PRETRAIN_TASK_NAMES, *FINETUNE_TASK_NAMES})

def target_column(property_name: str) -> str:
    """Return the dataframe column name that stores targets for ``property_name``.

    The bare property name is used unless ``USE_NORMALIZED_TARGETS`` is set, in
    which case the ``(normalized)`` suffix is appended.
    """
    suffix = "(normalized)" if USE_NORMALIZED_TARGETS else ""
    return f"{property_name}{suffix}"

# Property name -> dataframe column, for all properties and for each phase separately.
TARGET_COLUMNS = dict(zip(LOWER_CASE_PROPERTIES, map(target_column, LOWER_CASE_PROPERTIES)))
PRETRAIN_TARGET_COLUMNS = {task: TARGET_COLUMNS[task] for task in PRETRAIN_TASK_NAMES}
FINETUNE_TARGET_COLUMNS = {task: TARGET_COLUMNS[task] for task in FINETUNE_TASK_NAMES}

# Model layout: shared block widths and the hidden width of every task head.
SHARED_BLOCK_DIMS = [190, 256, 128]
HEAD_HIDDEN = 64

# Every run writes its checkpoints, logs and predictions below this directory.
ARTIFACT_ROOT = Path("../artifacts/polymers_pretrain_finetune_runs")
ARTIFACT_ROOT.mkdir(parents=True, exist_ok=True)

# Reduced settings currently in effect. Previous (commented-out) values:
# NUM_PRETRAIN_RUNS = 20
# PRETRAIN_MAX_EPOCHS = 100
# FINETUNE_MAX_EPOCHS = 60
NUM_PRETRAIN_RUNS = 2
PRETRAIN_MAX_EPOCHS = 10
FINETUNE_MAX_EPOCHS = 10

# Data loading / logging / seeding knobs.
BATCH_SIZE = 256
NUM_WORKERS = 0
LOG_EVERY_N_STEPS = 20
RANDOM_SEED_BASE = 1729

PRETRAIN_SAMPLE = None  # Set to an int for smoke tests
PI_SAMPLE = None  # Set to an int for smoke tests

# Filled by the data-loading cell when normalized targets are enabled.
PROPERTY_SCALERS: dict[str, Any] = {}

# Switch the package's loguru output off or on according to the flag.
(fm_logger.disable if QUIET_MODEL_LOGGING else fm_logger.enable)("foundation_model")


In [6]:
# Load the descriptor table and both property tables from parquet.
descriptor_df = pd.read_parquet(DESCRIPTOR_PATH)
non_pi_df = pd.read_parquet(NON_PI_PATH)
pi_df = pd.read_parquet(PI_PATH)

# Scalers are only required when the "(normalized)" target columns are used.
if USE_NORMALIZED_TARGETS:
    if not SCALER_PATH.exists():
        raise FileNotFoundError(f"Missing scaler file: {SCALER_PATH}")
    # NOTE(review): joblib.load unpickles the file; keep SCALER_PATH pointed at a trusted artifact.
    PROPERTY_SCALERS = joblib.load(SCALER_PATH)
    missing_scalers = [name for name in LOWER_CASE_PROPERTIES if name not in PROPERTY_SCALERS]
    if missing_scalers:
        raise KeyError(f"Scaler missing entries for: {missing_scalers}")
else:
    PROPERTY_SCALERS = {}

# A missing pretrain column is a hard error.
missing_pretrain = [PRETRAIN_TARGET_COLUMNS[name] for name in PRETRAIN_TASK_NAMES if PRETRAIN_TARGET_COLUMNS[name] not in non_pi_df.columns]
if missing_pretrain:
    raise KeyError(f"Non-PI table missing columns: {missing_pretrain}")

# A missing finetune column only drops that task, as long as at least one task remains.
missing_finetune = [name for name in FINETUNE_TASK_NAMES if FINETUNE_TARGET_COLUMNS[name] not in pi_df.columns]
if missing_finetune:
    print(f"Warning: PI table missing columns for tasks: {missing_finetune}. They will be skipped.")
available_finetune_tasks = [name for name in FINETUNE_TASK_NAMES if name not in missing_finetune]
if not available_finetune_tasks:
    raise ValueError("No PI finetune tasks remain after filtering missing columns.")
original_finetune_columns = FINETUNE_TARGET_COLUMNS
FINETUNE_TASK_NAMES = available_finetune_tasks
FINETUNE_TARGET_COLUMNS = {name: original_finetune_columns[name] for name in FINETUNE_TASK_NAMES}

# Align descriptors with the non-PI targets on their shared index.
common_non_pi_index = descriptor_df.index.intersection(non_pi_df.index)
if len(common_non_pi_index) == 0:
    # Fail here with a clear message instead of deep inside the first training stage.
    raise ValueError("Descriptor table and non-PI property table share no index entries.")
pretrain_features = descriptor_df.loc[common_non_pi_index]
pretrain_targets = non_pi_df.loc[common_non_pi_index, [PRETRAIN_TARGET_COLUMNS[name] for name in PRETRAIN_TASK_NAMES]]

# Optional down-sampling for smoke tests; targets follow the sampled feature rows.
if PRETRAIN_SAMPLE is not None and PRETRAIN_SAMPLE < len(pretrain_features):
    pretrain_features = pretrain_features.sample(n=PRETRAIN_SAMPLE, random_state=42)
    pretrain_targets = pretrain_targets.loc[pretrain_features.index]

# Same alignment for the PI finetuning data.
common_pi_index = descriptor_df.index.intersection(pi_df.index)
if len(common_pi_index) == 0:
    raise ValueError("Descriptor table and PI property table share no index entries.")
pi_features = descriptor_df.loc[common_pi_index]
pi_targets = pi_df.loc[common_pi_index, [FINETUNE_TARGET_COLUMNS[name] for name in FINETUNE_TASK_NAMES]]

if PI_SAMPLE is not None and PI_SAMPLE < len(pi_features):
    pi_features = pi_features.sample(n=PI_SAMPLE, random_state=13)
    pi_targets = pi_targets.loc[pi_features.index]

print(f"Pretrain feature matrix: {pretrain_features.shape}")
print(f"Pretrain target matrix: {pretrain_targets.shape}")
print(f"PI feature matrix: {pi_features.shape}")
print(f"PI target matrix: {pi_targets.shape}")


Pretrain feature matrix: (71725, 190)
Pretrain target matrix: (71725, 3)
PI feature matrix: (1083, 190)
PI target matrix: (1083, 3)


## Helper Utilities


In [7]:
def safe_slug(name: str) -> str:
    """Turn ``name`` into a lowercase token made of ``a-z``, ``0-9`` and ``_``.

    Each run of other characters becomes a single underscore, and leading and
    trailing underscores are removed. ``"task"`` is returned when nothing is left.
    """
    collapsed = re.sub(r"[^a-z0-9]+", "_", name.lower())
    trimmed = collapsed.strip("_")
    if not trimmed:
        return "task"
    return trimmed

def maybe_inverse_transform(property_name: str, values: np.ndarray) -> np.ndarray:
    """Undo target normalization for ``property_name`` when it is in use.

    With ``USE_NORMALIZED_TARGETS`` off, ``values`` is returned unchanged.
    Otherwise the property's scaler from ``PROPERTY_SCALERS`` is applied to the
    values as a single column and the result is returned as a flat array.

    Raises:
        KeyError: if normalization is on and no scaler exists for the property.
    """
    if USE_NORMALIZED_TARGETS:
        scaler = PROPERTY_SCALERS.get(property_name)
        if scaler is None:
            raise KeyError(f"Scaler not found for property '{property_name}'")
        as_column = values.reshape(-1, 1)
        return np.asarray(scaler.inverse_transform(as_column)).reshape(-1)
    return values

def build_regression_task(name: str, column: str) -> RegressionTaskConfig:
    """Create the regression task config for task ``name`` reading targets from ``column``.

    The head dims are: last shared-block width -> ``HEAD_HIDDEN`` -> 1.
    """
    head_dims = [SHARED_BLOCK_DIMS[-1], HEAD_HIDDEN, 1]
    settings = dict(
        name=name,
        data_column=column,
        dims=head_dims,
        norm=True,
        residual=False,
    )
    return RegressionTaskConfig(**settings)

def make_pretrain_task_configs(task_names: list[str]) -> list[RegressionTaskConfig]:
    """Build one regression task config per pretrain task, in the given order."""
    configs: list[RegressionTaskConfig] = []
    for task in task_names:
        configs.append(build_regression_task(task, PRETRAIN_TARGET_COLUMNS[task]))
    return configs

def make_finetune_task_config(task_name: str) -> RegressionTaskConfig:
    """Build the regression task config for a single PI finetune task."""
    column = FINETUNE_TARGET_COLUMNS[task_name]
    return build_regression_task(task_name, column)

def build_pretrain_datamodule(task_names: list[str], *, batch_size: int = BATCH_SIZE) -> CompoundDataModule:
    """Create the datamodule for a pretrain stage covering ``task_names``.

    Features are the aligned non-PI descriptors; targets are restricted to the
    columns that belong to the requested tasks.
    """
    stage_columns = [PRETRAIN_TARGET_COLUMNS[task] for task in task_names]
    return CompoundDataModule(
        formula_desc_source=pretrain_features,
        attributes_source=pretrain_targets.loc[:, stage_columns],
        task_configs=make_pretrain_task_configs(task_names),
        batch_size=batch_size,
        num_workers=NUM_WORKERS,
    )

def build_finetune_datamodule(task_name: str, *, batch_size: int = BATCH_SIZE) -> CompoundDataModule:
    """Create the datamodule for finetuning the single PI task ``task_name``.

    Features are the aligned PI descriptors; the target frame holds only the
    column of the requested task.
    """
    task_column = FINETUNE_TARGET_COLUMNS[task_name]
    single_target = pi_targets.loc[:, [task_column]]
    return CompoundDataModule(
        formula_desc_source=pi_features,
        attributes_source=single_target,
        task_configs=[make_finetune_task_config(task_name)],
        batch_size=batch_size,
        num_workers=NUM_WORKERS,
    )

def _flatten_on_cpu(value) -> torch.Tensor:
    """Return ``value`` (a tensor or a list of tensors) as one detached, flat CPU tensor."""
    if isinstance(value, list):
        return torch.cat([item.detach().cpu().reshape(-1) for item in value])
    return value.detach().cpu().reshape(-1)


def plot_test_predictions(
    *,
    model: FlexibleMultiTaskModel,
    datamodule: CompoundDataModule,
    phase: str,
    run_id: int,
    stage_num: int | None,
    stage_tasks: list[str],
    new_task_name: str,
    output_dir: Path | str,
) -> None:
    """Evaluate ``model`` on the test split and write plots, predictions and metrics.

    Files written into ``output_dir`` (created if missing):

    - ``tasks.txt``: the stage's task order joined by ``" -> "``.
    - ``<slug>_pred.png``: predicted-vs-actual scatter plot per task in ``stage_tasks``.
    - ``predictions.parquet``: one row per unmasked test sample and task.
    - ``metrics.json``: MAE, MSE, RMSE and R^2 per task plus run metadata.

    Values pass through ``maybe_inverse_transform`` before metrics are computed.
    The model is moved to CUDA, MPS or CPU (first available) and put in eval mode
    for the evaluation; its original device and training mode are restored
    afterwards, even if evaluation raises.

    Args:
        model: Model to evaluate; called as ``model(x, t_sequences)``.
        datamodule: Source of the test dataloader; batches must unpack to
            ``(x, y_dict, mask_dict, t_sequences)``.
        phase: Phase label stored in the outputs; ``"pretrain"`` selects the
            "Pretrain Stage N" plot title.
        run_id: Run number stored in the outputs.
        stage_num: Stage number stored in the outputs, or ``None``.
        stage_tasks: Tasks to compute metrics and plots for.
        new_task_name: Name recorded as ``new_task`` in ``metrics.json``.
        output_dir: Directory that receives all artifacts.

    Raises:
        RuntimeError: if the datamodule provides no test dataloader.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    metrics_path = output_dir / "metrics.json"
    predictions_path = output_dir / "predictions.parquet"
    task_order_path = output_dir / "tasks.txt"
    task_order_path.write_text(" -> ".join(stage_tasks), encoding="utf-8")

    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    datamodule.setup(stage="test")
    test_loader = datamodule.test_dataloader()
    if test_loader is None:
        raise RuntimeError(f"{phase} stage {stage_num} has no test dataloader")

    # Remember the caller's model state so it can be restored on every exit path.
    original_device = next(model.parameters()).device
    was_training = model.training

    try:
        model = model.to(device)
        model.eval()

        aggregated: dict[str, dict[str, list[np.ndarray]]] = {}
        prediction_rows: list[dict[str, float | int | str | None]] = []
        per_task_counts: dict[str, int] = {}

        with torch.no_grad():
            for batch in test_loader:
                x, y_dict, mask_dict, t_sequences = batch
                x = x.to(device)
                batch_preds = model(x, t_sequences)

                for name, pred_tensor in batch_preds.items():
                    if name not in y_dict:
                        continue

                    target_flat = _flatten_on_cpu(y_dict[name])
                    pred_flat = pred_tensor.detach().cpu().reshape(-1)

                    # Keep only the entries the mask marks as valid.
                    mask_tensor = mask_dict.get(name)
                    if mask_tensor is not None:
                        mask_flat = _flatten_on_cpu(mask_tensor).bool()
                        target_flat = target_flat[mask_flat]
                        pred_flat = pred_flat[mask_flat]

                    if target_flat.numel() == 0:
                        continue

                    target_np = maybe_inverse_transform(name, target_flat.numpy())
                    pred_np = maybe_inverse_transform(name, pred_flat.numpy())

                    entry = aggregated.setdefault(name, {"preds": [], "targets": []})
                    entry["preds"].append(pred_np)
                    entry["targets"].append(target_np)

                    # sample_index keeps counting across batches, separately per task.
                    start_idx = per_task_counts.get(name, 0)
                    for offset, (actual_val, pred_val) in enumerate(zip(target_np.tolist(), pred_np.tolist())):
                        prediction_rows.append(
                            {
                                "run": run_id,
                                "phase": phase,
                                "stage": stage_num,
                                "task": name,
                                "sample_index": start_idx + offset,
                                "actual": actual_val,
                                "predicted": pred_val,
                            }
                        )
                    per_task_counts[name] = start_idx + len(target_np)

        if not aggregated:
            print(f"No predictions to log for run {run_id} stage {stage_num} ({phase}).")
            return

        metrics: dict[str, dict[str, float | int | None]] = {}

        for name in stage_tasks:
            if name not in aggregated:
                continue
            task_preds = np.concatenate(aggregated[name]["preds"])
            targets = np.concatenate(aggregated[name]["targets"])
            diff = task_preds - targets
            mean_squared = np.mean(diff ** 2)
            mae = float(np.mean(np.abs(diff)))
            mse = float(mean_squared)
            rmse = float(np.sqrt(mean_squared))
            ss_tot = float(np.sum((targets - np.mean(targets)) ** 2))
            ss_res = float(np.sum(diff ** 2))
            # R^2 is undefined when the targets have zero variance.
            r2_value = 1.0 - ss_res / ss_tot if ss_tot > 0 else None
            metrics[name] = {
                "samples": int(targets.size),
                "mae": mae,
                "mse": mse,
                "rmse": rmse,
                "r2": r2_value,
            }

            # Shared square axis limits with a 5% margin (fixed 0.1 if the range is empty).
            lo = float(min(task_preds.min(), targets.min()))
            hi = float(max(task_preds.max(), targets.max()))
            buffer = 0.05 * (hi - lo) if hi > lo else 0.1
            lo -= buffer
            hi += buffer

            fig, ax = plt.subplots(figsize=(9, 9))
            ax.scatter(targets, task_preds, s=14, alpha=0.6, edgecolors="none")
            ax.plot([lo, hi], [lo, hi], "--", color="tab:red", linewidth=1.5)
            annotation_lines = [
                f"MAE: {mae:.3f}",
                rf"$R^2$: {r2_value:.3f}" if r2_value is not None else r"$R^2$: N/A",
                f"Samples: {int(targets.size):,}",
            ]
            ax.text(
                0.05,
                0.95,
                "\n".join(annotation_lines),
                transform=ax.transAxes,
                fontsize=13,
                verticalalignment="top",
                bbox=dict(boxstyle="round,pad=0.4", facecolor="white", alpha=0.7),
            )
            ax.set_xlim(lo, hi)
            ax.set_ylim(lo, hi)
            ax.set_xlabel("Actual")
            ax.set_ylabel("Predicted")
            if phase == "pretrain" and stage_num is not None:
                title_prefix = f"Pretrain Stage {stage_num}"
            else:
                title_prefix = "Finetune"
            ax.set_title(f"{title_prefix}: {name}")
            ax.grid(alpha=0.25)
            ax.set_aspect("equal", adjustable="box")
            fig.tight_layout()
            fig.savefig(output_dir / f"{safe_slug(name)}_pred.png", dpi=100)
            # Close each figure so repeated stages do not accumulate open figures.
            plt.close(fig)

        metrics_payload = {
            "run_id": run_id,
            "phase": phase,
            "stage": stage_num,
            "new_task": new_task_name,
            "task_sequence": list(stage_tasks),
            "metrics": metrics,
        }

        if prediction_rows:
            pd.DataFrame(prediction_rows).to_parquet(predictions_path, index=False)
            print(f"Saved predictions to {predictions_path}")

        with open(metrics_path, "w", encoding="utf-8") as f:
            json.dump(metrics_payload, f, indent=2)
        print(f"Saved metrics to {metrics_path}")
    finally:
        # Hand the model back on its original device and in its original mode.
        model.to(original_device)
        if was_training:
            model.train()


In [8]:
# The workflow below reloads checkpoints with `torch.load(..., weights_only=True)`.
# Register the project config classes as safe globals for that restricted loader
# (presumably they are stored inside the checkpoints — TODO confirm).
torch.serialization.add_safe_globals([RegressionTaskConfig, TaskType, OptimizerConfig])


## Pretrain & Finetune Workflow


In [9]:
def _build_stage_trainer(stage_dir: Path, slug: str, max_epochs: int) -> tuple[Trainer, ModelCheckpoint]:
    """Create the Trainer, and its checkpoint callback, used by every training stage.

    Checkpoints go to ``stage_dir / "checkpoints"`` and CSV/TensorBoard logs to
    ``stage_dir / "logs"``. Only the checkpoint with the lowest ``val_final_loss``
    is kept; early stopping watches the same metric.
    """
    checkpoint_cb = ModelCheckpoint(
        dirpath=stage_dir / "checkpoints",
        filename=f"{slug}-{{epoch:02d}}-{{val_final_loss:.4f}}",
        monitor="val_final_loss",
        mode="min",
        save_top_k=1,
    )
    early_stopping = EarlyStopping(monitor="val_final_loss", mode="min", patience=10)
    csv_logger = CSVLogger(save_dir=stage_dir / "logs", name="csv")
    tensorboard_logger = TensorBoardLogger(save_dir=stage_dir / "logs", name="tensorboard")
    trainer = Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices="auto",
        callbacks=[checkpoint_cb, early_stopping],
        logger=[csv_logger, tensorboard_logger],
        log_every_n_steps=LOG_EVERY_N_STEPS,
    )
    return trainer, checkpoint_cb


def _restore_checkpoint_weights(model: FlexibleMultiTaskModel, checkpoint_path: str) -> None:
    """Load the weights stored in ``checkpoint_path`` into ``model`` in place."""
    state = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    # Accept both a full checkpoint (with a "state_dict" entry) and a bare state dict.
    model.load_state_dict(state.get("state_dict", state))


experiment_records: list[dict] = []

for run_idx in range(1, NUM_PRETRAIN_RUNS + 1):
    run_seed = RANDOM_SEED_BASE + run_idx
    rng = random.Random(run_seed)
    # Seed the NumPy and PyTorch global RNGs too, so that randomness drawn from them
    # during the run is repeatable and not only the task order.
    np.random.seed(run_seed)
    torch.manual_seed(run_seed)

    task_sequence = rng.sample(PRETRAIN_TASK_NAMES, k=len(PRETRAIN_TASK_NAMES))
    run_label = f"run{run_idx:02d}"
    print(f"""
====================
Starting {run_label}
Task order: {task_sequence}
===================="""
)

    run_root = ARTIFACT_ROOT / run_label
    run_root.mkdir(parents=True, exist_ok=True)

    # ---- Incremental pretraining: stage k trains on the first k tasks of the sequence ----
    previous_checkpoint: str | None = None
    pretrain_stage_records: list[dict] = []

    for stage_idx, task_name in enumerate(task_sequence, start=1):
        stage_tasks = task_sequence[:stage_idx]
        datamodule = build_pretrain_datamodule(stage_tasks)
        task_configs = make_pretrain_task_configs(stage_tasks)

        if previous_checkpoint is None:
            # First stage: start from a freshly initialised model.
            model = FlexibleMultiTaskModel(
                shared_block_dims=SHARED_BLOCK_DIMS,
                task_configs=task_configs,
                enable_learnable_loss_balancer=True,
                shared_block_optimizer=OptimizerConfig(lr=1e-2),
            )
        else:
            # Later stages: continue from the previous best checkpoint and add
            # heads only for tasks the loaded model does not have yet.
            model = FlexibleMultiTaskModel.load_from_checkpoint(
                checkpoint_path=previous_checkpoint,
                strict=False,
                enable_learnable_loss_balancer=True,
            )
            existing = set(model.task_heads.keys())
            new_configs = [cfg for cfg in task_configs if cfg.name not in existing]
            if new_configs:
                model.add_task(*new_configs)

        stage_dir = run_root / f"pretrain_stage{stage_idx:02d}_{safe_slug(task_name)}"
        stage_dir.mkdir(parents=True, exist_ok=True)

        trainer, checkpoint_cb = _build_stage_trainer(stage_dir, safe_slug(task_name), PRETRAIN_MAX_EPOCHS)
        trainer.fit(model, datamodule=datamodule)
        best_model_path = checkpoint_cb.best_model_path
        print(f"Run {run_label} stage {stage_idx}: best checkpoint -> {best_model_path}")

        if best_model_path:
            # Evaluate, and continue from, the best weights rather than the last epoch's.
            _restore_checkpoint_weights(model, best_model_path)
            previous_checkpoint = best_model_path
        else:
            print("Warning: no best checkpoint captured; using current weights.")

        prediction_dir = stage_dir / "prediction"
        plot_test_predictions(
            model=model,
            datamodule=datamodule,
            phase="pretrain",
            run_id=run_idx,
            stage_num=stage_idx,
            stage_tasks=stage_tasks,
            new_task_name=task_name,
            output_dir=prediction_dir,
        )

        pretrain_stage_records.append(
            {
                "stage": stage_idx,
                "task_name": task_name,
                "task_sequence": list(stage_tasks),
                "checkpoint": best_model_path,
                "stage_dir": stage_dir,
            }
        )

    if previous_checkpoint is None:
        raise RuntimeError(f"Run {run_label} produced no pretrain checkpoint; cannot finetune.")

    # ---- Finetuning: one independent model per PI task, each starting from the last pretrain checkpoint ----
    finetune_records: list[dict] = []
    for task_name in FINETUNE_TASK_NAMES:
        finetune_model = FlexibleMultiTaskModel.load_from_checkpoint(
            checkpoint_path=previous_checkpoint,
            strict=False,
            enable_learnable_loss_balancer=True,
            freeze_shared_encoder=FINETUNE_FREEZE_SHARED,
        )
        # Remove every pretrain head so that only the finetune task remains.
        active_tasks = list(finetune_model.task_heads.keys())
        if active_tasks:
            finetune_model.remove_tasks(*active_tasks)

        task_config = make_finetune_task_config(task_name)
        finetune_model.add_task(task_config)

        datamodule = build_finetune_datamodule(task_name)

        stage_dir = run_root / f"finetune_{safe_slug(task_name)}"
        stage_dir.mkdir(parents=True, exist_ok=True)

        trainer, checkpoint_cb = _build_stage_trainer(stage_dir, safe_slug(task_name), FINETUNE_MAX_EPOCHS)
        trainer.fit(finetune_model, datamodule=datamodule)
        best_model_path = checkpoint_cb.best_model_path
        print(f"Run {run_label} finetune {task_name}: best checkpoint -> {best_model_path}")

        if best_model_path:
            _restore_checkpoint_weights(finetune_model, best_model_path)
        else:
            print("Warning: finetune stage missing checkpoint; using current weights.")

        prediction_dir = stage_dir / "prediction"
        plot_test_predictions(
            model=finetune_model,
            datamodule=datamodule,
            phase="finetune",
            run_id=run_idx,
            stage_num=None,
            stage_tasks=[task_name],
            new_task_name=task_name,
            output_dir=prediction_dir,
        )

        finetune_records.append(
            {
                "task_name": task_name,
                "checkpoint": best_model_path,
                "stage_dir": stage_dir,
            }
        )

    experiment_records.append(
        {
            "run": run_label,
            "task_sequence": task_sequence,
            "pretrain": pretrain_stage_records,
            "pretrain_checkpoint": previous_checkpoint,
            "finetune": finetune_records,
        }
    )

print("Completed all pretrain + finetune runs.")


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Starting run01
Task order: ['density', 'Rg', 'r2']


/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
107 K     Trainable params
0         Non-trainable params
107 K     Total params
0.430     Total estimated model params size (MB)
26        Modules in train mode
0

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]



                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 225/225 [00:01<00:00, 119.32it/s, v_num=0, train_final_loss_step=-2.43, val_final_loss=-2.43, train_final_loss_epoch=-2.38] 

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 225/225 [00:01<00:00, 118.09it/s, v_num=0, train_final_loss_step=-2.43, val_final_loss=-2.43, train_final_loss_epoch=-2.38]
Run run01 stage 1: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage01_density/checkpoints/density-epoch=09-val_final_loss=-2.4284.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage01_density/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage01_density/prediction/metrics.json


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 2      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 16.9 K | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
115 K     Trainable params
0         Non-trainab

                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 225/225 [00:02<00:00, 79.25it/s, v_num=0, train_final_loss_step=8.010, val_final_loss=1.330, train_final_loss_epoch=1.340]  

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 225/225 [00:02<00:00, 79.12it/s, v_num=0, train_final_loss_step=8.010, val_final_loss=1.330, train_final_loss_epoch=1.340]
Run run01 stage 2: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage02_rg/checkpoints/rg-epoch=08-val_final_loss=1.3182.ckpt


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage02_rg/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage02_rg/prediction/metrics.json


/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 3      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 25.3 K | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
124 K     Trainable params
0         Non-trainable params
124 K     Total params
0.498     Total estimated model params size (MB)
46        Modules in train mode
0

                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 225/225 [00:03<00:00, 61.00it/s, v_num=0, train_final_loss_step=135.0, val_final_loss=314.0, train_final_loss_epoch=301.0]  

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 225/225 [00:03<00:00, 60.49it/s, v_num=0, train_final_loss_step=135.0, val_final_loss=314.0, train_final_loss_epoch=301.0]
Run run01 stage 3: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage03_r2/checkpoints/r2-epoch=09-val_final_loss=314.0172.ckpt


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
8.4 K     Trainable params
99.1 K    Non-trainab

Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage03_r2/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run01/pretrain_stage03_r2/prediction/metrics.json
                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=20). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 92.74it/s, v_num=0, train_final_loss_step=-0.39, val_final_loss=-0.40, train_final_loss_epoch=-0.372]       

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 69.33it/s, v_num=0, train_final_loss_step=-0.39, val_final_loss=-0.40, train_final_loss_epoch=-0.372]
Run run01 finetune linear_expansion: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run01/finetune_linear_expansion/checkpoints/linear_expansion-epoch=09-val_final_loss=-0.4002.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run01/finetune_linear_expansion/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run01/finetune_linear_expansion/prediction/metrics.json


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
8.4 K     Trainable params
99.1 K    Non-trainab

                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.




/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=20). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 96.01it/s, v_num=0, train_final_loss_step=-0.122, val_final_loss=-0.141, train_final_loss_epoch=-0.109]    

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 72.54it/s, v_num=0, train_final_loss_step=-0.122, val_final_loss=-0.141, train_final_loss_epoch=-0.109]
Run run01 finetune refractive_index: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run01/finetune_refractive_index/checkpoints/refractive_index-epoch=09-val_final_loss=-0.1414.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run01/finetune_refractive_index/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run01/finetune_refractive_index/prediction/metrics.json


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
8.4 K     Trainable params
99.1 K    Non-trainab

                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=20). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 102.72it/s, v_num=0, train_final_loss_step=1.57e+5, val_final_loss=8.84e+5, train_final_loss_epoch=3.44e+6]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 74.72it/s, v_num=0, train_final_loss_step=1.57e+5, val_final_loss=8.84e+5, train_final_loss_epoch=3.44e+6] 
Run run01 finetune tg: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run01/finetune_tg/checkpoints/tg-epoch=09-val_final_loss=883993.6250.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run01/finetune_tg/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run01/finetune_tg/prediction/metrics.json

Starting run02
Task order: ['Rg', 'density', 'r2']


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
107 K     Trainable params
0         Non-trainab

                                                                            

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 225/225 [00:01<00:00, 116.20it/s, v_num=0, train_final_loss_step=1.730, val_final_loss=3.510, train_final_loss_epoch=3.420]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 225/225 [00:01<00:00, 114.99it/s, v_num=0, train_final_loss_step=1.730, val_final_loss=3.510, train_final_loss_epoch=3.420]
Run run02 stage 1: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage01_rg/checkpoints/rg-epoch=09-val_final_loss=3.5054.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage01_rg/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage01_rg/prediction/metrics.json


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 2      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 16.9 K | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
115 K     Trainable params
0         Non-trainab

                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 225/225 [00:02<00:00, 79.41it/s, v_num=0, train_final_loss_step=1.220, val_final_loss=1.860, train_final_loss_epoch=1.820]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 225/225 [00:02<00:00, 78.67it/s, v_num=0, train_final_loss_step=1.220, val_final_loss=1.860, train_final_loss_epoch=1.820]
Run run02 stage 2: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage02_density/checkpoints/density-epoch=09-val_final_loss=1.8555.ckpt


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage02_density/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage02_density/prediction/metrics.json


/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 3      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 25.3 K | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
124 K     Trainable params
0         Non-trainable params
124 K     Total params
0.498     Total estimated model params size (MB)
46        Modules in train mode
0

                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 225/225 [00:03<00:00, 58.78it/s, v_num=0, train_final_loss_step=130.0, val_final_loss=316.0, train_final_loss_epoch=297.0]  

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 225/225 [00:03<00:00, 58.29it/s, v_num=0, train_final_loss_step=130.0, val_final_loss=316.0, train_final_loss_epoch=297.0]
Run run02 stage 3: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage03_r2/checkpoints/r2-epoch=09-val_final_loss=316.0619.ckpt


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
8.4 K     Trainable params
99.1 K    Non-trainab

Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage03_r2/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run02/pretrain_stage03_r2/prediction/metrics.json
                                                                            

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=20). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 102.73it/s, v_num=0, train_final_loss_step=-0.391, val_final_loss=-0.40, train_final_loss_epoch=-0.373]      

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 75.13it/s, v_num=0, train_final_loss_step=-0.391, val_final_loss=-0.40, train_final_loss_epoch=-0.373] 
Run run02 finetune linear_expansion: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run02/finetune_linear_expansion/checkpoints/linear_expansion-epoch=09-val_final_loss=-0.4005.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run02/finetune_linear_expansion/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run02/finetune_linear_expansion/prediction/metrics.json


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
8.4 K     Trainable params
99.1 K    Non-trainab

                                                                           

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=20). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 91.72it/s, v_num=0, train_final_loss_step=-0.271, val_final_loss=-0.244, train_final_loss_epoch=-0.257]     

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 67.19it/s, v_num=0, train_final_loss_step=-0.271, val_final_loss=-0.244, train_final_loss_epoch=-0.257]
Run run02 finetune refractive_index: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run02/finetune_refractive_index/checkpoints/refractive_index-epoch=09-val_final_loss=-0.2442.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run02/finetune_refractive_index/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run02/finetune_refractive_index/prediction/metrics.json


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py:317: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name                | Type              | Params | Mode 
------------------------------------------------------------------
0 | task_log_sigmas     | ParameterDict     | 1      | train
1 | encoder             | FoundationEncoder | 99.1 K | train
2 | shared              | LinearBlock       | 82.6 K | train
3 | deposit             | Sequential        | 16.5 K | train
4 | task_heads          | ModuleDict        | 8.4 K  | train
5 | disabled_task_heads | ModuleDict        | 0      | train
------------------------------------------------------------------
8.4 K     Trainable params
99.1 K    Non-trainab

                                                                            

/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/Users/liuchang/projects/foundation_model/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:310: The number of training batches (4) is smaller than the logging interval Trainer(log_every_n_steps=20). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 97.29it/s, v_num=0, train_final_loss_step=1.98e+5, val_final_loss=9.3e+5, train_final_loss_epoch=3.62e+6]  

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 4/4 [00:00<00:00, 70.24it/s, v_num=0, train_final_loss_step=1.98e+5, val_final_loss=9.3e+5, train_final_loss_epoch=3.62e+6]
Run run02 finetune tg: best checkpoint -> /Users/liuchang/projects/foundation_model/artifacts/polymers_pretrain_finetune_runs/run02/finetune_tg/checkpoints/tg-epoch=09-val_final_loss=930090.2500.ckpt
Saved predictions to ../artifacts/polymers_pretrain_finetune_runs/run02/finetune_tg/prediction/predictions.parquet
Saved metrics to ../artifacts/polymers_pretrain_finetune_runs/run02/finetune_tg/prediction/metrics.json
Completed all pretrain + finetune runs.


## Run Summary


In [None]:
# Summarize the experiment log: the total number of recorded runs first,
# then one line per run reporting how many pretrain and finetune stages
# that run's record contains.
print(f"Recorded {len(experiment_records)} runs.")
for record in experiment_records:
    # Look the fields up in the same order they are printed.
    run_label = record["run"]
    pretrain_stage_count = len(record["pretrain"])
    finetune_stage_count = len(record["finetune"])
    print(
        run_label,
        "pretrain stages:",
        pretrain_stage_count,
        "finetune stages:",
        finetune_stage_count,
    )


Recorded 2 runs.
run01 pretrain stages: 3 finetune stages: 3
run02 pretrain stages: 3 finetune stages: 3
