## Analysis of the log sliding windows dataloader

In [None]:
# dependencies
from utils.paths import project_root
from dataloaders.sliding_window import LogsSlidingWindow

import torch
from torch.utils.data import Dataset, DataLoader
import polars as pl
import numpy as np
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [None]:
# configuration: pin the NumPy and PyTorch RNG seeds so reruns are repeatable,
# then select the GPU when one is visible and fall back to the CPU otherwise
np.random.seed(42)
torch.manual_seed(42)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the parsed BGL log, keep the three columns used below, then derive:
#   - Anomaly: boolean flag, True where Label is anything other than "-"
#     (no rows are dropped here; this only adds a column)
#   - Timestamp: epoch seconds converted to a polars datetime
df = (
    pl.read_csv(f"{project_root()}/data/parsed/cleaned_BGL_structured.csv")[['Timestamp', 'EventId', 'Label']]
    .with_columns((pl.col("Label") != "-").alias("Anomaly"))
    .with_columns(pl.from_epoch(pl.col("Timestamp"), time_unit="s").alias("Timestamp"))
)

df.head()

Timestamp,EventId,Label,Anomaly
datetime[μs],str,str,bool
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False
2005-06-03 22:42:50,"""3aa50e45""","""-""",False


In [None]:
# train/test split: the first 70% of rows (in file order) go to training,
# the remaining rows to testing
split_point = int(0.7 * len(df))
train_df, test_df = df[:split_point], df[split_point:]

In [None]:
# number of log lines per raw Label value, computed over the full (unsplit) frame
df['Label'].value_counts()

Label,count
str,u32
"""KERNPOW""",192
"""APPREAD""",5983
"""KERNSERV""",94
"""KERNSTOR""",63491
"""KERNSOCK""",209
…,…
"""MASABNORM""",37
"""KERNRTSP""",3983
"""KERNMC""",342
"""KERNREC""",6145


In [None]:
# Build the training windows from the training split ONLY. Passing the full
# frame here would let test-period logs reach the Isolation Forest filter,
# the event-id vocabulary and the model itself (train/test leakage).
train_dataset = LogsSlidingWindow(
    train_df,
    window_size='5m',
    step_size='1m',
    filter_strategy='isolation_forest',
    filter_params={'contamination': 0.02}
)

# Event-id vocabulary observed in the training split; it also fixes the model's input width.
# NOTE(review): event ids that occur only in the test split are now absent from this
# vocabulary; confirm LogsSlidingWindow handles unseen ids the way you expect.
train_event_ids = train_dataset.event_ids

test_dataset = LogsSlidingWindow(
    test_df,
    window_size='5m',
    step_size='1m',
    event_ids=train_event_ids,  # use same event IDs as training set
    filter_strategy='none'  # keep everything for evaluation
)

batch_size = 128
# Shuffle only the training batches; keep test order stable so scores line up with labels.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

Sorted 4713493 rows by timestamp
Found 390 unique event types
Period -> Start time: 2005-06-03 22:42:50, End time: 2006-01-04 16:00:05
Generated 309193 sliding windows
Window size: 5m, Step size: 1m
Building index...
Index built. Shape: (309193, 2)
Isolation Forest: Removed 6184 windows (2.0%)
Remaining windows: 303009
Sorted 1414048 rows by timestamp
Found 390 unique event types
Period -> Start time: 2005-08-30 05:28:32, End time: 2006-01-04 16:00:05
Generated 183507 sliding windows
Window size: 5m, Step size: 1m
Building index...
Index built. Shape: (183507, 2)
Train batches: 2368
Test batches: 1434


### Simple Autoencoder Baseline

In [None]:
class SimpleAutoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder/decoder.

    The encoder stacks ``Linear -> ReLU -> Dropout(0.2)`` once per entry of
    ``hidden_dims``. The decoder walks the same widths in reverse with the
    same three-layer pattern and ends with a plain ``Linear`` back to
    ``input_dim`` (no activation on the output).

    Args:
        input_dim: Width of the input vectors and of the reconstruction.
        hidden_dims: Encoder layer widths, outermost first. Must contain at
            least one entry. Any sequence (list, tuple, ...) is accepted.

    Raises:
        ValueError: If ``hidden_dims`` is empty.
    """

    def __init__(self, input_dim, hidden_dims=(256, 128, 64)):
        super().__init__()

        # Immutable default + local copy: the caller's sequence is never shared or mutated.
        hidden_dims = list(hidden_dims)
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer width")

        # Encoder: input_dim -> hidden_dims[0] -> ... -> hidden_dims[-1]
        encoder_layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            encoder_layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(0.2)
            ])
            prev_dim = hidden_dim
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: hidden_dims[-1] -> ... -> hidden_dims[0] -> input_dim
        decoder_layers = []
        for i in range(len(hidden_dims) - 1, 0, -1):
            decoder_layers.extend([
                nn.Linear(hidden_dims[i], hidden_dims[i - 1]),
                nn.ReLU(),
                nn.Dropout(0.2)
            ])
        # Final projection back to the input width, left linear.
        decoder_layers.append(nn.Linear(hidden_dims[0], input_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

def train_autoencoder(model, train_loader, epochs=20, lr=0.001):
    """Train ``model`` to reconstruct its inputs and return the loss history.

    Uses mean-squared reconstruction error with the Adam optimizer, and moves
    the model to the notebook-level ``device``. Every batch from
    ``train_loader`` must unpack into three items; only the first is used.

    Args:
        model: Module whose output has the same shape as its input.
        train_loader: Iterable of three-item batches that also supports ``len()``.
        epochs: Number of full passes over ``train_loader``.
        lr: Learning rate passed to Adam.

    Returns:
        List with one entry per epoch: the sum of per-batch losses divided by
        the number of batches.
    """
    mse = nn.MSELoss()
    adam = optim.Adam(model.parameters(), lr=lr)

    model.to(device)
    history = []

    print("\nTraining Autoencoder...")
    for epoch in range(epochs):
        model.train()
        running_loss = 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in progress:
            inputs, _, _ = batch
            inputs = inputs.to(device)

            adam.zero_grad()
            batch_loss = mse(model(inputs), inputs)
            batch_loss.backward()
            adam.step()

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(train_loader)
        history.append(mean_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.6f}")

    return history

In [None]:
def evaluate_autoencoder(model, test_loader):
    """Score every sample in ``test_loader`` by its reconstruction error.

    Puts the model in eval mode and disables gradients. Each batch must
    unpack into three items: the inputs, the labels, and a third item that is
    ignored. Inputs are moved to the notebook-level ``device``; the model is
    expected to be there already.

    Args:
        model: Trained module whose output has the same shape as its input.
        test_loader: Iterable of three-item batches.

    Returns:
        Tuple ``(scores, labels)`` of NumPy arrays. ``scores`` holds the
        squared error averaged over dimension 1 of each batch.
    """
    model.eval()
    elementwise_mse = nn.MSELoss(reduction='none')

    scores, targets = [], []

    print("\nEvaluating on test set...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            inputs, labels, _ = batch
            inputs = inputs.to(device)

            # keep the error un-reduced, then average over dim 1 for one score per row
            per_sample = elementwise_mse(model(inputs), inputs).mean(dim=1)

            scores.extend(per_sample.cpu().numpy())
            targets.extend(labels.numpy())

    return np.array(scores), np.array(targets)

In [None]:
# the model's input width is the number of event ids taken from the training dataset
input_dim = len(train_event_ids)
model = SimpleAutoencoder(input_dim=input_dim, hidden_dims=[256, 128, 64])

# train for 20 epochs; the returned list holds one average loss per epoch
train_losses = train_autoencoder(model, train_loader, epochs=20, lr=0.001)


Training Autoencoder...


Epoch 1/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 248.75it/s]


Epoch 1/20, Loss: 0.002993


Epoch 2/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 251.99it/s]


Epoch 2/20, Loss: 0.002043


Epoch 3/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 252.91it/s]


Epoch 3/20, Loss: 0.001772


Epoch 4/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 251.56it/s]


Epoch 4/20, Loss: 0.001644


Epoch 5/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 251.74it/s]


Epoch 5/20, Loss: 0.001555


Epoch 6/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 244.66it/s]


Epoch 6/20, Loss: 0.001510


Epoch 7/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:11<00:00, 211.23it/s]


Epoch 7/20, Loss: 0.001440


Epoch 8/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:10<00:00, 224.82it/s]


Epoch 8/20, Loss: 0.001384


Epoch 9/20: 100%|██████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 248.90it/s]


Epoch 9/20, Loss: 0.001348


Epoch 10/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 249.69it/s]


Epoch 10/20, Loss: 0.001325


Epoch 11/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 249.69it/s]


Epoch 11/20, Loss: 0.001288


Epoch 12/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 243.09it/s]


Epoch 12/20, Loss: 0.001284


Epoch 13/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:10<00:00, 216.29it/s]


Epoch 13/20, Loss: 0.001268


Epoch 14/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:10<00:00, 231.63it/s]


Epoch 14/20, Loss: 0.001237


Epoch 15/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 248.72it/s]


Epoch 15/20, Loss: 0.001237


Epoch 16/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 250.40it/s]


Epoch 16/20, Loss: 0.001229


Epoch 17/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 247.85it/s]


Epoch 17/20, Loss: 0.001209


Epoch 18/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 245.06it/s]


Epoch 18/20, Loss: 0.001212


Epoch 19/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 244.66it/s]


Epoch 19/20, Loss: 0.001195


Epoch 20/20: 100%|█████████████████████████████████████████████████████████████████| 2368/2368 [00:09<00:00, 243.43it/s]

Epoch 20/20, Loss: 0.001193





In [None]:
# per-sample reconstruction errors and the matching labels for the test loader
anomaly_scores, y_test = evaluate_autoencoder(model, test_loader)

# quick look at both arrays (NumPy truncates long arrays when printing)
print(anomaly_scores, '\n')
print(y_test)


Evaluating on test set...


Evaluating: 100%|██████████████████████████████████████████████████████████████████| 1434/1434 [00:05<00:00, 281.20it/s]

[1.0691329e-03 1.0663269e-03 2.3738075e-06 ... 2.6157585e-03 2.3738075e-06
 2.3738075e-06] 

[0 0 0 ... 0 0 0]





In [None]:
# Calibrate the anomaly threshold on the training windows: score each one by
# its reconstruction error and take the 95th percentile as the cut-off.
per_element_mse = nn.MSELoss(reduction='none')  # built once, reused for every batch

train_scores = []
model.eval()
with torch.no_grad():
    for batch in train_loader:
        # The training and evaluation loops in this notebook unpack three items
        # per batch, so a two-name unpack fails here; take the inputs by
        # position instead.
        data = batch[0].to(device)
        reconstructed = model(data)
        reconstruction_error = per_element_mse(reconstructed, data).mean(dim=1)
        train_scores.extend(reconstruction_error.cpu().numpy())

train_scores = np.array(train_scores)
threshold = np.percentile(train_scores, 95)
print(f"Anomaly threshold (95th percentile): {threshold:.6f}")

# A test window is predicted anomalous when its score exceeds the training threshold.
y_pred = (anomaly_scores > threshold).astype(int)

Anomaly threshold (95th percentile): 0.001780


In [None]:
from sklearn.metrics import classification_report

# per-class precision / recall / F1 of the thresholded predictions (y_pred)
# against the labels returned by the evaluation pass (y_test)
print(classification_report(y_test, y_pred, target_names=['Normal', 'Anomaly']))

              precision    recall  f1-score   support

      Normal       1.00      0.94      0.97    180516
     Anomaly       0.21      0.88      0.34      2991

    accuracy                           0.94    183507
   macro avg       0.60      0.91      0.65    183507
weighted avg       0.99      0.94      0.96    183507

