## Analysis of the log sliding windows dataloader

In [1]:
# dependencies
from utils.paths import project_root
from dataloaders.sliding_window import LogsSlidingWindow

import torch
from torch.utils.data import Dataset, DataLoader
import polars as pl
import numpy as np

In [None]:
df = pl.read_csv(f"{project_root()}/data/parsed/cleaned_BGL_structured.csv")[['Timestamp', 'EventId']]

# create anomaly column with constant value False | TODO: definir anomalias
df = df.with_columns(
    pl.lit(False).alias("Label")
)

# convert Timestamp to polars Timestamp
df = df.with_columns(
    pl.from_epoch(pl.col("Timestamp"), time_unit="s").alias("Timestamp")
)

df.head()

Timestamp,EventId,Label
datetime[μs],str,bool
2005-06-03 22:42:50,"""3aa50e45""",False
2005-06-03 22:42:50,"""3aa50e45""",False
2005-06-03 22:42:50,"""3aa50e45""",False
2005-06-03 22:42:50,"""3aa50e45""",False
2005-06-03 22:42:50,"""3aa50e45""",False


In [3]:
# OBS:  não sei se tá funcionando, não consegui terminar de rodar
dataset = LogsSlidingWindow(
    df, 
    window_size='5m',   
    step_size='1m'    
)

print(f"\nDataset contains {len(dataset)} windows")
print(f"Each window has {dataset.n_events} features (event types)")

Sorted 474796 rows by timestamp
Found 125 unique event types
Period -> Start time: 2005-06-03 22:42:50, End time: 2005-06-14 16:39:09
Generated 15472 sliding windows
Window size: 5m, Step size: 1m
Building index...
Index built. Shape: (15472, 2)

Dataset contains 15472 windows
Each window has 125 features (event types)


In [4]:
dataset.__getitem__(0)

(tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         7.3939, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.6931, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0

In [5]:
# testing loading the data
for count_vectors, label in dataset:
    print(f"Batch shape: {count_vectors.shape}, Label: {label}")  

Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: torch.Size([125]), Label: 0
Batch shape: tor

### Test Simple Network

In [6]:
import torch.nn as nn
import torch.optim as optim

class LogClassifier(nn.Module):
    def __init__(self, n_features, n_classes):
        super().__init__()
        self.fc1 = nn.Linear(n_features, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, n_classes)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

In [7]:
n_classes = len(df['Label'].unique())
model = LogClassifier(n_features=dataset.n_events, n_classes=n_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()

for epoch in range(3):  
    epoch_loss = 0
    
    for batch_idx, (count_vectors, labels) in enumerate(dataset):
        outputs = model(count_vectors)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if (batch_idx + 1) % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(dataset)}, Loss: {loss.item():.4f}")
    
    avg_loss = epoch_loss / len(dataset)
    print(f"Epoch {epoch+1} completed. Average loss: {avg_loss:.4f}")

Epoch 1, Batch 100/15472, Loss: 0.0000
Epoch 1, Batch 200/15472, Loss: 0.0000
Epoch 1, Batch 300/15472, Loss: 0.0000
Epoch 1, Batch 400/15472, Loss: 0.0000
Epoch 1, Batch 500/15472, Loss: 0.0000
Epoch 1, Batch 600/15472, Loss: 0.0000
Epoch 1, Batch 700/15472, Loss: 0.0000
Epoch 1, Batch 800/15472, Loss: 0.0000
Epoch 1, Batch 900/15472, Loss: 0.0000
Epoch 1, Batch 1000/15472, Loss: 0.0000
Epoch 1, Batch 1100/15472, Loss: 0.0000
Epoch 1, Batch 1200/15472, Loss: 0.0000
Epoch 1, Batch 1300/15472, Loss: 0.0000
Epoch 1, Batch 1400/15472, Loss: 0.0000
Epoch 1, Batch 1500/15472, Loss: 0.0000
Epoch 1, Batch 1600/15472, Loss: 0.0000
Epoch 1, Batch 1700/15472, Loss: 0.0000
Epoch 1, Batch 1800/15472, Loss: 0.0000
Epoch 1, Batch 1900/15472, Loss: 0.0000
Epoch 1, Batch 2000/15472, Loss: 0.0000
Epoch 1, Batch 2100/15472, Loss: 0.0000
Epoch 1, Batch 2200/15472, Loss: 0.0000
Epoch 1, Batch 2300/15472, Loss: 0.0000
Epoch 1, Batch 2400/15472, Loss: 0.0000
Epoch 1, Batch 2500/15472, Loss: 0.0000
Epoch 1, 