Merged
Commits
35 commits
43e216e
Making train and trainer modules, needs typing and docstrings
lotif Jul 25, 2025
9d6458a
Adding docstrings to clava_clusteting
lotif Jul 25, 2025
4e58fec
Adding docstring to the train.py module
lotif Jul 28, 2025
08b9a10
fixing test, adding docstrings for the trainer module
lotif Jul 28, 2025
c7b5040
WIP
lotif Jul 31, 2025
0d8c499
Moving clustering code to its own module
lotif Jul 31, 2025
63b93d8
WIP
lotif Jul 31, 2025
a9d24e8
Adding module docstring
lotif Jul 31, 2025
cf74942
Merge branch 'clustering-module' into sampler-module
lotif Jul 31, 2025
53e562e
Moving sampler classes to their own module and some more additional t…
lotif Jul 31, 2025
6e0a3e4
1st round of Fatemeh's CR
lotif Aug 5, 2025
932e1af
Fatemeh's CR
lotif Aug 6, 2025
fdfc6f7
Merge branch 'train-module' into clustering-module
lotif Aug 6, 2025
e093592
removing print
lotif Aug 6, 2025
b60a3f9
Merge branch 'clustering-module' into sampler-module
lotif Aug 6, 2025
9cc7421
WIP David's and Fatemeh's CR
lotif Aug 6, 2025
152ee17
Merge branch 'train-module' into clustering-module
lotif Aug 6, 2025
ce871de
Merge branch 'clustering-module' into sampler-module
lotif Aug 6, 2025
fcc00e9
David's CR: renamings
lotif Aug 7, 2025
f20f482
David's CR: remaining comments
lotif Aug 7, 2025
9408819
Merge branch 'main' into train-module
lotif Sep 8, 2025
3213772
Removing unused ignore
lotif Sep 8, 2025
f4599ac
Merge branch 'train-module' into clustering-module
lotif Sep 8, 2025
98af2c7
Merge branch 'main' into clustering-module
lotif Sep 8, 2025
fa7c27a
Merge branch 'clustering-module' into sampler-module
lotif Sep 8, 2025
9541cce
Little refactorings
lotif Sep 12, 2025
0534b41
Addressing some more comments by David
lotif Sep 15, 2025
ed0a2d3
Merge branch 'main' into clustering-module
lotif Sep 15, 2025
756eaaa
Merge branch 'clustering-module' into sampler-module
lotif Sep 15, 2025
96ab05d
Addressing comments by David
lotif Sep 16, 2025
6b7a7f6
David's CR
lotif Sep 17, 2025
09e5dc8
Rewording docstring, replacing logger
lotif Sep 17, 2025
abaabac
Merge branch 'clustering-module' into sampler-module
lotif Sep 17, 2025
9860dbe
Merge branch 'main' into sampler-module
lotif Sep 17, 2025
bc462dd
David's last comments
lotif Sep 19, 2025
227 changes: 0 additions & 227 deletions src/midst_toolkit/models/clavaddpm/model.py
@@ -2,7 +2,6 @@
import json
import math
import pickle
from abc import ABC, abstractmethod
from collections import Counter
from collections.abc import Callable, Generator
from copy import deepcopy
@@ -38,10 +37,6 @@
from torch import Tensor, nn

from midst_toolkit.common.enumerations import PredictionType, TaskType
from midst_toolkit.core import logger
from midst_toolkit.models.clavaddpm.gaussian_multinomial_diffusion import (
GaussianMultinomialDiffusion,
)


Normalization = Literal["standard", "quantile", "minmax"]
@@ -493,228 +488,6 @@ def get_model(
raise ValueError("Unknown model!")


class ScheduleSampler(ABC):
"""
A distribution over timesteps in the diffusion process, intended to reduce
variance of the objective.

By default, samplers perform unbiased importance sampling, in which the
objective's mean is unchanged.
However, subclasses may override sample() to change how the resampled
terms are reweighted, allowing for actual changes in the objective.
"""

@abstractmethod
def weights(self) -> Tensor:
"""
Get a numpy array of weights, one per diffusion step.

The weights needn't be normalized, but must be positive.
"""

def sample(self, batch_size: int, device: str) -> tuple[Tensor, Tensor]:
"""
Importance-sample timesteps for a batch.

:param batch_size: the number of timesteps.
:param device: the torch device to save to.
:return: a tuple (timesteps, weights):
- timesteps: a tensor of timestep indices.
- weights: a tensor of weights to scale the resulting losses.
"""
w = self.weights().cpu().numpy()
p = w / np.sum(w)
indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
indices = torch.from_numpy(indices_np).long().to(device)
weights_np = 1 / (len(p) * p[indices_np])
weights = torch.from_numpy(weights_np).float().to(device)
return indices, weights
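# Illustrative sketch (added for clarity, not part of the original file): with 4
# timesteps and unnormalized weights w = [1, 1, 2, 4], p = w / w.sum()
# = [0.125, 0.125, 0.25, 0.5], and a sampled index i is reweighted by
# 1 / (len(p) * p[i]) = [2, 2, 1, 0.5][i]. Then
# E_{i~p}[weight_i * loss_i] = sum_i p_i * loss_i / (len(p) * p_i) = mean_i(loss_i),
# so the importance-sampled objective stays unbiased, as the class docstring states.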


class UniformSampler(ScheduleSampler):
def __init__(self, diffusion: GaussianMultinomialDiffusion):
self.diffusion = diffusion
self._weights = torch.from_numpy(np.ones([diffusion.num_timesteps]))

def weights(self) -> Tensor:
return self._weights


class LossAwareSampler(ScheduleSampler):
def update_with_local_losses(self, local_ts: Tensor, local_losses: Tensor) -> None:
"""
Update the reweighting using losses from a model.

Call this method from each rank with a batch of timesteps and the
corresponding losses for each of those timesteps.
This method will perform synchronization to make sure all of the ranks
maintain the exact same reweighting.

:param local_ts: an integer Tensor of timesteps.
:param local_losses: a 1D Tensor of losses.
"""
batch_sizes = [
torch.tensor([0], dtype=torch.int32, device=local_ts.device)
for _ in range(torch.distributed.get_world_size())
]
torch.distributed.all_gather(
batch_sizes,
torch.tensor([len(local_ts)], dtype=torch.int32, device=local_ts.device),
)

# Pad all_gather batches to be the maximum batch size.
max_bs = max([int(x.item()) for x in batch_sizes])

timestep_batches = [torch.zeros(max_bs).to(local_ts) for bs in batch_sizes]
loss_batches = [torch.zeros(max_bs).to(local_losses) for bs in batch_sizes]
torch.distributed.all_gather(timestep_batches, local_ts)
torch.distributed.all_gather(loss_batches, local_losses)
timesteps = [x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]]
losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
self.update_with_all_losses(timesteps, losses)

@abstractmethod
def update_with_all_losses(self, ts: list[int], losses: list[float]) -> None:
"""
Update the reweighting using losses from a model.

Sub-classes should override this method to update the reweighting
using losses from the model.

This method directly updates the reweighting without synchronizing
between workers. It is called by update_with_local_losses from all
ranks with identical arguments. Thus, it should have deterministic
behavior to maintain state across workers.

:param ts: a list of int timesteps.
:param losses: a list of float losses, one per timestep.
"""


class LossSecondMomentResampler(LossAwareSampler):
def __init__(
self,
diffusion: GaussianMultinomialDiffusion,
history_per_term: int = 10,
uniform_prob: float = 0.001,
):
self.diffusion = diffusion
self.history_per_term = history_per_term
self.uniform_prob = uniform_prob
self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64)
self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.uint)

def weights(self):
if not self._warmed_up():
return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
weights = np.sqrt(np.mean(self._loss_history**2, axis=-1))
weights /= np.sum(weights)
weights *= 1 - self.uniform_prob
weights += self.uniform_prob / len(weights)
return weights

def update_with_all_losses(self, ts: list[int], losses: list[float]) -> None:
for t, loss in zip(ts, losses):
if self._loss_counts[t] == self.history_per_term:
# Shift out the oldest loss term.
self._loss_history[t, :-1] = self._loss_history[t, 1:]
self._loss_history[t, -1] = loss
else:
self._loss_history[t, self._loss_counts[t]] = loss
self._loss_counts[t] += 1

def _warmed_up(self) -> bool:
return (self._loss_counts == self.history_per_term).all()


def create_named_schedule_sampler(name: str, diffusion: GaussianMultinomialDiffusion) -> ScheduleSampler:
"""
Create a ScheduleSampler from a library of pre-defined samplers.

:param name: the name of the sampler.
:param diffusion: the diffusion object to sample for.
"""
if name == "uniform":
return UniformSampler(diffusion)
if name == "loss-second-moment":
return LossSecondMomentResampler(diffusion)
raise NotImplementedError(f"unknown schedule sampler: {name}")
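
# Usage sketch (an illustration added for clarity, not part of the original file).
# It shows how the sampler API above is typically wired into a training step;
# `model`, `data_loader`, and `compute_timestep_losses` are hypothetical names,
# and the per-sample losses are assumed to have shape [batch_size]:
#
#     sampler = create_named_schedule_sampler("loss-second-moment", diffusion)
#     for batch, _ in data_loader:
#         t, weights = sampler.sample(batch.shape[0], device="cuda")
#         losses = compute_timestep_losses(model, batch, t)  # hypothetical helper
#         if isinstance(sampler, LossAwareSampler):
#             # Requires torch.distributed to be initialized, since
#             # update_with_local_losses all-gathers timesteps and losses across ranks.
#             sampler.update_with_local_losses(t, losses.detach())
#         (weights * losses).mean().backward()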


def split_microbatches(
microbatch: int,
batch: Tensor,
labels: Tensor,
t: Tensor,
) -> Generator[tuple[Tensor, Tensor, Tensor]]:
bs = len(batch)
if microbatch == -1 or microbatch >= bs:
yield batch, labels, t
else:
for i in range(0, bs, microbatch):
yield batch[i : i + microbatch], labels[i : i + microbatch], t[i : i + microbatch]


def compute_top_k(logits: Tensor, labels: Tensor, k: int, reduction: str = "mean") -> Tensor:
_, top_ks = torch.topk(logits, k, dim=-1)
if reduction == "mean":
return (top_ks == labels[:, None]).float().sum(dim=-1).mean()
if reduction == "none":
return (top_ks == labels[:, None]).float().sum(dim=-1)

raise ValueError(f"reduction should be one of ['mean', 'none']: {reduction}")


def log_loss_dict(diffusion: GaussianMultinomialDiffusion, ts: Tensor, losses: dict[str, Tensor]) -> None:
for key, values in losses.items():
logger.logkv_mean(key, values.mean().item())
# Log the quantiles (four quartiles, in particular).
for sub_t, sub_loss in zip(ts.cpu().numpy(), values.detach().cpu().numpy()):
quartile = int(4 * sub_t / diffusion.num_timesteps)
logger.logkv_mean(f"{key}_q{quartile}", sub_loss)


def numerical_forward_backward_log(
classifier: nn.Module,
optimizer: torch.optim.Optimizer,
data_loader: Generator[tuple[Tensor, ...]],
dataset: Dataset,
schedule_sampler: ScheduleSampler,
diffusion: GaussianMultinomialDiffusion,
prefix: str = "train",
remove_first_col: bool = False,
device: str = "cuda",
) -> None:
batch, labels = next(data_loader)
labels = labels.long().to(device)

if remove_first_col:
# Remove the first column of the batch, which is the label.
batch = batch[:, 1:]

num_batch = batch[:, : dataset.n_num_features].to(device)

t, _ = schedule_sampler.sample(num_batch.shape[0], device)
batch = diffusion.gaussian_q_sample(num_batch, t).to(device)

for i, (sub_batch, sub_labels, sub_t) in enumerate(split_microbatches(-1, batch, labels, t)):
logits = classifier(sub_batch, timesteps=sub_t)
loss = F.cross_entropy(logits, sub_labels, reduction="none")

losses = {}
losses[f"{prefix}_loss"] = loss.detach()
losses[f"{prefix}_acc@1"] = compute_top_k(logits, sub_labels, k=1, reduction="none")
if logits.shape[1] >= 5:
losses[f"{prefix}_acc@5"] = compute_top_k(logits, sub_labels, k=5, reduction="none")
log_loss_dict(diffusion, sub_t, losses)
del losses
loss = loss.mean()
if loss.requires_grad:
if i == 0:
optimizer.zero_grad()
loss.backward(loss * len(sub_batch) / len(batch))


def transform_dataset(
dataset: Dataset,
transformations: Transformations,