# Sequence Anomaly Detection (LSTM/GRU)
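This notebook generates synthetic, behavior-like session sequences of variable length, pads and masks them to a fixed length, and trains small LSTM and GRU classifiers to flag anomalous sessions. It runs quickly on CPU and needs no external data.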

In [None]:
# Helper: clean DataFrame to avoid missing-column/NaN errors
import pandas as pd

def clean_frame(df, target=None, numeric_expected=None, categorical_expected=None):
    """Return a cleaned copy of ``df`` with no missing columns or NaN values.

    Steps, in order:
      1. Drop columns whose label starts with ``'Unnamed'`` and columns that
         are entirely NaN.
      2. Add any column listed in ``numeric_expected`` that is absent, filled
         with ``0.0``; add any absent ``categorical_expected`` column filled
         with ``'missing'``.
      3. Fill NaNs in numeric columns with that column's median and NaNs in
         all other columns with ``'missing'``.
      4. If ``target`` is given, coerce it to integers; values that cannot be
         parsed as numbers become ``0``.

    Args:
        df: Input DataFrame. It is not modified; a copy is returned.
        target: Optional label of the target column.
        numeric_expected: Optional iterable of numeric column labels that must
            exist in the result.
        categorical_expected: Optional iterable of categorical column labels
            that must exist in the result.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If ``target`` is given but is not a column after cleaning.
    """
    # Compare labels as text: the `.str` accessor on `df.columns` fails when
    # labels are not all strings (e.g. integer labels from a header-less read).
    keep = [not str(col).startswith('Unnamed') for col in df.columns]
    df = df.loc[:, keep]
    df = df.dropna(axis=1, how='all').copy()

    numeric_expected = numeric_expected or []
    categorical_expected = categorical_expected or []
    for col in numeric_expected:
        if col not in df.columns:
            df[col] = 0.0
    for col in categorical_expected:
        if col not in df.columns:
            df[col] = 'missing'

    num_cols = df.select_dtypes(include=['number']).columns
    cat_cols = df.select_dtypes(exclude=['number']).columns
    if len(num_cols):
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    if len(cat_cols):
        df[cat_cols] = df[cat_cols].fillna('missing')

    if target:
        if target not in df.columns:
            raise KeyError(f"Target '{target}' missing. Columns: {df.columns.tolist()}")
        # Unparseable target values are coerced to NaN and then mapped to 0.
        df[target] = pd.to_numeric(df[target], errors='coerce').fillna(0).astype(int)
    return df


In [2]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Make the project's `src` directory importable. The root is resolved relative
# to the current working directory, so this assumes the notebook is run from a
# folder one level below the repository root.
# This must happen before the `uais` imports below.
project_root = Path('..').resolve()
src_path = project_root / 'src'
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from uais.sequence.build_sequences import build_sequences, pad_sequences
from uais.sequence.train_lstm import train_lstm_classifier, predict_lstm
from uais.sequence.train_gru import train_gru_classifier, predict_gru
from uais.sequence.evaluate_sequence import evaluate_sequence_predictions
from uais.explainability.sequence_explainer import sequence_saliency

# Single seed shared by the numpy legacy global RNG and torch.
SEED = 42
np.random.seed(SEED)
_ = torch.manual_seed(SEED)

print('Project root:', project_root)
print('Using torch', torch.__version__)


Project root: /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence
Using torch 2.9.1


In [3]:
# Generate synthetic CERT-like session sequences (variable length)
# Each session draws an event count in [8, 29], a start day within three weeks
# of 2024-01-01, and an anomaly flag with probability 0.2. Events are spaced
# about five minutes apart with 0-3 minutes of jitter.
rng = np.random.default_rng(7)
num_sessions = 320
records = []
for session_id in range(num_sessions):
    n_events = int(rng.integers(8, 30))
    day_offset = int(rng.integers(0, 21))
    base_ts = np.datetime64('2024-01-01') + np.timedelta64(day_offset, 'D')
    is_anomaly = bool(rng.random() < 0.2)
    session_key = f's{session_id}'
    session_label = int(is_anomaly)

    for step in range(n_events):
        jitter_min = rng.integers(0, 4)
        ts = base_ts + np.timedelta64(int(step * 5 + jitter_min), 'm')

        # Baseline values are always drawn, even for anomalous sessions, so the
        # random stream advances identically regardless of the branch below.
        event_code = int(rng.integers(0, 3))
        bytes_out = float(rng.normal(120, 25))
        failed = int(rng.binomial(1, 0.06))

        if is_anomaly:
            # Anomalous sessions: higher event codes, extra outbound bytes,
            # and a higher failed-login rate.
            event_code = int(rng.integers(2, 4))
            bytes_out += float(rng.normal(150, 40))
            failed = int(rng.binomial(1, 0.25))

        records.append(
            dict(
                session_id=session_key,
                timestamp=pd.Timestamp(ts),
                event_code=event_code,
                bytes_out=max(bytes_out, 0.0),  # clip negative draws to zero
                failed_login=failed,
                label=session_label,
            )
        )

seq_df = (
    pd.DataFrame(records)
    .sort_values(['session_id', 'timestamp'])
    .reset_index(drop=True)
)
print(seq_df.head())
print('Events:', len(seq_df), 'Sequences:', seq_df['session_id'].nunique())
print('Positive sequences:', seq_df.groupby('session_id')['label'].max().sum())


  session_id           timestamp  event_code   bytes_out  failed_login  label
0         s0 2024-01-14 00:02:00           2   97.735204             0      0
1         s0 2024-01-14 00:06:00           2  121.503590             0      0
2         s0 2024-01-14 00:10:00           2  104.488128             0      0
3         s0 2024-01-14 00:16:00           0  122.635356             0      0
4         s0 2024-01-14 00:21:00           1  137.382580             1      0
Events: 5807 Sequences: 320
Positive sequences: 66


In [4]:
# Build padded tensors + mask for the sequence models
# Events are grouped by session and ordered by timestamp via the project helper.
# NOTE(review): which columns become features is decided inside
# `build_sequences` - confirm against that module.
sequences, labels = build_sequences(
    seq_df, id_column='session_id', time_column='timestamp', target_column='label'
)
# Sessions have at most 29 events, so max_len=40 never truncates this data.
padded, mask = pad_sequences(sequences, max_len=40)
labels = np.asarray(labels)

for caption, value in (
    ('Padded shape:', padded.shape),
    ('Mask shape:', mask.shape),
    ('Feature dim:', padded.shape[-1]),
    ('Positive ratio:', labels.mean()),
):
    print(caption, value)


Padded shape: (320, 40, 3)
Mask shape: (320, 40)
Feature dim: 3
Positive ratio: 0.20625


In [5]:
# Train/test split and LSTM classifier
# Stratify on the label so both partitions keep the same positive ratio.
X_train, X_test, mask_train, mask_test, y_train, y_test = train_test_split(
    padded,
    mask,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=42,
)

# Shared hyperparameters; the GRU comparison cell reuses this dict.
config = {'sequence': {'hidden_dim': 32, 'batch_size': 32, 'epochs': 6, 'lr': 1e-3}}

lstm_model, lstm_loss = train_lstm_classifier(X_train, mask_train, y_train, config)
lstm_scores = predict_lstm(lstm_model, X_test, mask_test)
lstm_metrics = evaluate_sequence_predictions(y_test, lstm_scores)

# Sanity check: if every test sequence gets (almost) the same score, ROC AUC
# collapses to 0.5 and PR AUC to the positive ratio, so the metrics printed
# below carry no information. Flag it instead of reporting it silently.
lstm_score_values = [float(s) for s in lstm_scores]
if max(lstm_score_values) - min(lstm_score_values) < 1e-8:
    print('WARNING: LSTM scores are (near-)constant across the test set; '
          'the metrics below are not informative.')

print('LSTM train loss:', round(lstm_loss, 4))
print('LSTM metrics:')
for k, v in lstm_metrics.items():
    print(f"  {k}: {v:.4f}")


2025-11-25 18:31:51,549 [INFO] uais.sequence.train_lstm: LSTM epoch 1 loss 0.7120
2025-11-25 18:31:51,570 [INFO] uais.sequence.train_lstm: LSTM epoch 2 loss 0.6943
2025-11-25 18:31:51,591 [INFO] uais.sequence.train_lstm: LSTM epoch 3 loss 0.6763
2025-11-25 18:31:51,612 [INFO] uais.sequence.train_lstm: LSTM epoch 4 loss 0.6544
2025-11-25 18:31:51,633 [INFO] uais.sequence.train_lstm: LSTM epoch 5 loss 0.6217
2025-11-25 18:31:51,653 [INFO] uais.sequence.train_lstm: LSTM epoch 6 loss 0.5645


LSTM train loss: 0.5645
LSTM metrics:
  roc_auc: 0.5000
  pr_auc: 0.2000
  f1: 0.0000
  precision: 0.0000
  recall: 0.0000
  accuracy: 0.8000


In [6]:
# Lightweight GRU comparison (same hyperparams)
# Reuses the train/test split and `config` defined in the LSTM cell.
gru_model, gru_loss = train_gru_classifier(X_train, mask_train, y_train, config)
gru_scores = predict_gru(gru_model, X_test, mask_test)
gru_metrics = evaluate_sequence_predictions(y_test, gru_scores)

# Sanity check: (near-)identical scores for every test sequence make ROC AUC
# collapse to 0.5 and PR AUC to the positive ratio; flag that case explicitly.
gru_score_values = [float(s) for s in gru_scores]
if max(gru_score_values) - min(gru_score_values) < 1e-8:
    print('WARNING: GRU scores are (near-)constant across the test set; '
          'the metrics below are not informative.')

print('GRU train loss:', round(gru_loss, 4))
print('GRU metrics:')
for k, v in gru_metrics.items():
    print(f"  {k}: {v:.4f}")


2025-11-25 18:31:51,684 [INFO] uais.sequence.train_gru: GRU epoch 1 loss 0.6795
2025-11-25 18:31:51,707 [INFO] uais.sequence.train_gru: GRU epoch 2 loss 0.6445
2025-11-25 18:31:51,731 [INFO] uais.sequence.train_gru: GRU epoch 3 loss 0.6111
2025-11-25 18:31:51,771 [INFO] uais.sequence.train_gru: GRU epoch 4 loss 0.5684
2025-11-25 18:31:51,804 [INFO] uais.sequence.train_gru: GRU epoch 5 loss 0.5275
2025-11-25 18:31:51,828 [INFO] uais.sequence.train_gru: GRU epoch 6 loss 0.5071


GRU train loss: 0.5071
GRU metrics:
  roc_auc: 0.5000
  pr_auc: 0.2000
  f1: 0.0000
  precision: 0.0000
  recall: 0.0000
  accuracy: 0.8000


In [7]:
# Simple saliency over time steps for one test sequence
# NOTE(review): `sequence_saliency` is called without a model, so the values
# presumably reflect input magnitudes rather than model gradients - confirm
# against uais.explainability.sequence_explainer.
example_idx = 0
example_window = slice(example_idx, example_idx + 1)  # keeps the batch dimension
saliency = sequence_saliency(X_test[example_window], mask_test[example_window])
example_score = float(lstm_scores[example_idx])
print('Example score:', round(example_score, 4))
print('Saliency (step -> avg magnitude):')
for step, score in saliency.items():
    # Skip padded positions; only unmasked steps are reported.
    if not mask_test[example_idx, step] > 0:
        continue
    print(f"  t={step}: {score:.4f}")


Example score: 0.269
Saliency (step -> avg magnitude):
  t=0: 47.0562
  t=1: 33.2935
  t=2: 49.3933
  t=3: 49.6646
  t=4: 43.1149
  t=5: 40.4860
  t=6: 44.2769
  t=7: 49.4137
  t=8: 46.2818
  t=9: 38.0497
