## Variational Autoencoder Model

In [1]:
# dependencies
from utils.paths import project_root
from dataloaders.sliding_window import LogsSlidingWindow

import torch
from torch.utils.data import Dataset, DataLoader
import polars as pl
import numpy as np
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

import os

import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.nn.functional as F

import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

from sklearn.metrics import average_precision_score, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# configuration
torch.manual_seed(42)
np.random.seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

assert pyro.__version__.startswith('1.9.1')
pyro.distributions.enable_validation(False)
pyro.set_rng_seed(0)
smoke_test = 'CI' in os.environ

Using device: cuda


In [3]:
# loading and preparing the data
df = pl.read_csv(f"{project_root()}/data/parsed/cleaned_BGL_structured.csv")[['Timestamp', 'EventId', 'Label']]

# filter only logs without anomaly labels "-"
df = df.with_columns(
    (pl.col("Label") != "-").alias("Anomaly")
)

# convert Timestamp to polars Timestamp
df = df.with_columns(
    pl.from_epoch(pl.col("Timestamp"), time_unit="s").alias("Timestamp")
)

df.head()

Timestamp,EventId,Label,Anomaly
datetime[μs],str,str,bool
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False


In [4]:
# train test spit | TODO: ver se tem outra forma melhor de dividir
split_point = int(len(df) * 0.7)
train_df = df[:split_point]
test_df = df[split_point:]

In [5]:
# auxiliary functions to try load all data into vram or use batches
def get_available_vram_gb():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        free_mem = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)
        return free_mem / (1024**3)
    return 0

def estimate_tensor_size_gb(shape, dtype=torch.float32):
    bytes_per_element = torch.tensor([], dtype=dtype).element_size()
    total_elements = 1
    for dim in shape:
        total_elements *= dim
    return (total_elements * bytes_per_element) / (1024**3)

def precompute_windows(dataset):
    print(f"Pre-computing {len(dataset)} windows...")
    
    n_samples = len(dataset)
    input_dim = dataset.n_events
    
    X = torch.zeros((n_samples, input_dim), dtype=torch.float32)
    y = torch.zeros((n_samples,), dtype=torch.long)
    pred_types = torch.zeros((n_samples), dtype=torch.long)
    
    for i in tqdm(range(n_samples)):
        count_vec, label, pred_type = dataset[i]
        X[i] = count_vec
        y[i] = label
        pred_types[i] = pred_type
        
    print("Dataset converted to Tensor")
    return X, y, pred_types

In [6]:
# create datasets
def load_dataset(
        window_size='5m', 
        step_size='1m',
          n_prediction_window=3, 
          filter_strategy='label', 
          filter_params={'contamination': 0.01},
          batch_size = 4096
    ) -> dict:
    
    train_dataset = LogsSlidingWindow(
        df, 
        window_size=window_size,   
        step_size=step_size,
        n_prediction_window=n_prediction_window,
        filter_strategy=filter_strategy,
        filter_params=filter_params
    )

    train_event_ids = train_dataset.event_ids
    INPUT_DIM = train_dataset.n_events

    test_dataset = LogsSlidingWindow(
        test_df,
        window_size=window_size,
        step_size=step_size, 
        n_prediction_window=n_prediction_window,
        event_ids=train_event_ids,  # use same event IDs as training set
        filter_strategy='none' # keep all logs for evaluation 
    )

    X_train, y_train, pred_types_train = precompute_windows(train_dataset)
    X_test, y_test, pred_types_test = precompute_windows(test_dataset)

    available_vram = get_available_vram_gb()
    train_size_gb = estimate_tensor_size_gb(X_train.shape, X_train.dtype)
    test_size_gb = estimate_tensor_size_gb(X_test.shape, X_test.dtype)
    total_size_gb = train_size_gb + test_size_gb

    safety_factor = 1.4
    required_vram = total_size_gb * safety_factor

    print(f"Available VRAM: {available_vram:.2f} GB")
    print(f"Required VRAM: {required_vram:.2f} GB (train: {train_size_gb:.2f} GB, test: {test_size_gb:.2f} GB)")

    USE_VRAM = required_vram < available_vram
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if device.type == 'cuda' and USE_VRAM:
        print("Sufficient VRAM available - loading data to GPU")
        X_train = X_train.to(device)
        X_test = X_test.to(device)
        X_test = X_test.to(device)
        y_test = y_test.to(device)
        print("Data is ready in VRAM.")
        return {
            "mode": "tensor",
            "device": device,
            "input_dim": INPUT_DIM,
            "X_train": X_train,
            "y_train": y_train,
            "X_test": X_test,
            "y_test": y_test,
            "pred_types_train": pred_types_train,
            "pred_types_test": pred_types_test,
        }
    
    print("Insufficient VRAM or unavailable GPU - using DataLoader")
    batch_size = batch_size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=(device.type == "cuda"))
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=(device.type == "cuda"))
    print(f"Train batches: {len(train_loader)}")
    print(f"Test batches: {len(test_loader)}")

    return {
        "mode": "dataloader",
        "device": device,
        "input_dim": INPUT_DIM,
        "train_loader": train_loader,
        "test_loader": test_loader,
        "pred_types_train": pred_types_train,
        "pred_types_test": pred_types_test,
    }


In [7]:
def get_vram_batches(X, batch_size=4096, shuffle=True):
    n_samples = X.shape[0]
    indices = torch.arange(n_samples, device=X.device)
    
    if shuffle:
        indices = torch.randperm(n_samples, device=X.device)
        
    for start_idx in range(0, n_samples, batch_size):
        batch_indices = indices[start_idx : start_idx + batch_size]
        yield X[batch_indices]

## Variational Autoencoder Neural Network

In [8]:
class Encoder(nn.Module):
    def __init__(self, input_dim, z_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_loc = nn.Linear(hidden_dim, z_dim)    # latent mean
        self.fc2_scale = nn.Linear(hidden_dim, z_dim)  # latent scale
        self.activation = nn.LeakyReLU(0.2)

    def forward(self, x):
        x = x.reshape(x.shape[0], -1)

        hidden = self.activation(self.fc1(x))
        z_loc = self.fc2_loc(hidden)
        # we need positive scale, so we exponentiate the output
        z_scale = torch.exp(self.fc2_scale(hidden))

        return z_loc, z_scale

class Decoder(nn.Module):
    def __init__(self, input_dim, z_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(z_dim, hidden_dim)
        
        # predicts the reconstructed value (mean)
        self.fc_loc = nn.Linear(hidden_dim, input_dim)
        
        # predicts the variance/scale
        self.fc_scale = nn.Linear(hidden_dim, input_dim)
        
        self.activation = nn.LeakyReLU(0.2)
        self.softplus = nn.Softplus()

    def forward(self, z):
        hidden = self.activation(self.fc1(z))
        
        # reconstructed Mean
        loc_img = self.softplus(self.fc_loc(hidden))
        
        # reconstructed variance/scale
        log_scale = self.fc_scale(hidden)
        log_scale = torch.clamp(log_scale, min=-6, max=2)  
        scale_img = torch.exp(log_scale)
        
        return loc_img, scale_img

In [9]:
class LogAnomalyVAE(nn.Module):
    def __init__(self, input_dim=390, z_dim=32, hidden_dim=128, use_cuda=False):
        super().__init__()
        self.encoder = Encoder(input_dim, z_dim, hidden_dim)
        self.decoder = Decoder(input_dim, z_dim, hidden_dim)
        self.input_dim = input_dim
        self.z_dim = z_dim
        
        if use_cuda:
            self.cuda()
            
    def model(self, x):
        pyro.module("decoder", self.decoder)        
        with pyro.plate("data", x.shape[0]):
            # Prior: normal N(0, 1)
            z_loc = x.new_zeros(torch.Size((x.shape[0], self.z_dim)))
            z_scale = x.new_ones(torch.Size((x.shape[0], self.z_dim)))
            
            # sample latent Z
            z = pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))
            
            # decode z -> predicted mean and variance for the logs
            recon_loc, recon_scale = self.decoder(z)
            
            # bserve 'x' (the real log-counts)
            pyro.sample("obs", dist.Normal(recon_loc, recon_scale).to_event(1), 
                        obs=x.reshape(-1, self.input_dim))

    def guide(self, x):
        pyro.module("encoder", self.encoder)
        with pyro.plate("data", x.shape[0]):
            # use encoder to guess Z
            z_loc, z_scale = self.encoder(x)
            
            # sample Z
            pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))

    def get_anomaly_score(self, x, n_samples=20):
        x = x.to(next(self.parameters()).device)
        
        z_loc, z_scale = self.encoder(x)
        
        log_probs = []
        
        for _ in range(n_samples):
            # sample z ~ q(z|x)
            z = dist.Normal(z_loc, z_scale).sample()
            
            # decode
            recon_loc, recon_scale = self.decoder(z)
            
            # calculate log probability of X given the reconstruction
            log_prob = dist.Normal(recon_loc, recon_scale).log_prob(x.reshape(-1, self.input_dim)).sum(dim=1)
            log_probs.append(log_prob)
            
        # average the log probabilities
        mean_log_prob = torch.stack(log_probs).mean(dim=0)
        
        return -mean_log_prob

In [10]:
def train_vae(
    vae,
    data,
    num_epochs=150,
    batch_size=4096,
    lr=1e-3
):
    optimizer = Adam({"lr": lr})
    svi = SVI(vae.model, vae.guide, optimizer, loss=Trace_ELBO())

    print("Starting VAE training...")

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_samples = 0

        if data["mode"] == "tensor":
            X_train = data["X_train"]

            for x_batch in get_vram_batches(
                X_train,
                batch_size=batch_size,
                shuffle=True
            ):
                loss = svi.step(x_batch)
                epoch_loss += loss
                num_samples += x_batch.size(0)

        else:
            train_loader = data["train_loader"]
            device = data["device"]

            for x_batch, *_ in train_loader:
                x_batch = x_batch.to(device, non_blocking=True)
                loss = svi.step(x_batch)
                epoch_loss += loss
                num_samples += x_batch.size(0)

        avg_loss = epoch_loss / num_samples
        print(f"Epoch {epoch:03d} | Loss: {avg_loss:.4f}")

In [11]:
def test_vae(
    vae,
    data,
    eval_batch_size=8192
):
    print("Starting evaluation...")
    vae.eval()

    all_scores = []
    all_labels = []

    with torch.no_grad():
        if data["mode"] == "tensor":
            X_test = data["X_test"]
            y_test = data["y_test"]

            num_samples = X_test.shape[0]

            for start_idx in range(0, num_samples, eval_batch_size):
                end_idx = min(start_idx + eval_batch_size, num_samples)

                x_batch = X_test[start_idx:end_idx]
                y_batch = y_test[start_idx:end_idx]

                scores = vae.get_anomaly_score(x_batch)

                all_scores.append(scores)
                all_labels.append(y_batch)

        else:
            test_loader = data["test_loader"]
            device = data["device"]

            for x_batch, y_batch, *_ in test_loader:
                x_batch = x_batch.to(device, non_blocking=True)

                scores = vae.get_anomaly_score(x_batch)

                all_scores.append(scores)
                all_labels.append(y_batch)

    scores = torch.cat(all_scores).cpu().numpy()
    labels = torch.cat(all_labels).cpu().numpy()

    print(f"Evaluation complete. Processed {len(scores)} samples.")

    return scores, labels

In [12]:
data = load_dataset(
    window_size='5m',
    step_size='1m',
    n_prediction_window=3,
    filter_strategy='combined',
    filter_params={'contamination':0.005},
    batch_size=4096
)

vae = LogAnomalyVAE(
    input_dim=data['input_dim'],
    z_dim=32,
    hidden_dim=128,
    use_cuda=(data["device"].type == "cuda")
)

train_vae(
    vae,
    data,
    num_epochs=120,
    batch_size=4096,
    lr=1e-3
)

scores, labels = test_vae(
    vae,
    data,
    eval_batch_size=8192
)

roc = roc_auc_score(labels, scores)
pr_auc = average_precision_score(labels, scores)

print(f"ROC AUC: {roc:.4f}")
print(f"PR AUC: {pr_auc:.4f}")

Sorted 4713493 rows by timestamp
Found 390 unique event types
Prediction Horizon: +0:03:00 beyond current window
Checking for failures in the next 3 sliding windows.
Period -> Start time: 2005-06-03 22:42:50, End time: 2006-01-04 16:00:05
Generated 309193 sliding windows
Building index...
Index built. Shape: (309193, 3)
Isolation Forest: Removed 1477 windows (0.5%)
Remaining windows: 307716
Sorted 1414048 rows by timestamp
Found 390 unique event types
Prediction Horizon: +0:03:00 beyond current window
Checking for failures in the next 3 sliding windows.
Period -> Start time: 2005-08-30 05:28:32, End time: 2006-01-04 16:00:05
Generated 183507 sliding windows
Building index...
Index built. Shape: (183507, 3)
Pre-computing 307716 windows...


100%|██████████| 307716/307716 [00:22<00:00, 13690.06it/s]


Dataset converted to Tensor
Pre-computing 183507 windows...


100%|██████████| 183507/183507 [00:08<00:00, 20525.26it/s]


Dataset converted to Tensor
Available VRAM: 15.47 GB
Required VRAM: 1.00 GB (train: 0.45 GB, test: 0.27 GB)
Sufficient VRAM available - loading data to GPU
Data is ready in VRAM.
Starting VAE training...
Epoch 000 | Loss: -178.3257
Epoch 001 | Loss: -1420.8641
Epoch 002 | Loss: -1650.4610
Epoch 003 | Loss: -1720.9490
Epoch 004 | Loss: -1777.5536
Epoch 005 | Loss: -1808.2236
Epoch 006 | Loss: -1831.9924
Epoch 007 | Loss: -1848.3526
Epoch 008 | Loss: -1856.7496
Epoch 009 | Loss: -1867.5051
Epoch 010 | Loss: -1874.0997
Epoch 011 | Loss: -1878.3438
Epoch 012 | Loss: -1882.3248
Epoch 013 | Loss: -1886.2893
Epoch 014 | Loss: -1891.7738
Epoch 015 | Loss: -1894.6698
Epoch 016 | Loss: -1898.7867
Epoch 017 | Loss: -1900.7940
Epoch 018 | Loss: -1903.4708
Epoch 019 | Loss: -1906.4460
Epoch 020 | Loss: -1906.7296
Epoch 021 | Loss: -1910.4509
Epoch 022 | Loss: -1913.0631
Epoch 023 | Loss: -1916.8701
Epoch 024 | Loss: -1817.7283
Epoch 025 | Loss: -1722.5414
Epoch 026 | Loss: -1729.0503
Epoch 027 | Lo

In [13]:
"""
ROC AUC: 0.8576
PR AUC: 0.7090

ROC AUC: 0.8548
PR AUC: 0.7103
"""

'\nROC AUC: 0.8576\nPR AUC: 0.7090\n\nROC AUC: 0.8548\nPR AUC: 0.7103\n'