# Task 4

## Setup

In [None]:
# --- Check Python and pip versions ---
!python --version
!pip install --upgrade pip

Python 3.12.12
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3


In [None]:
# --- Install required libraries ---
!pip install torch
!pip install numpy pandas scikit-learn matplotlib seaborn
!pip install tqdm



In [None]:
# --- Setup & Imports ---
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter, defaultdict
import json
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.cuda.amp import GradScaler
from torch.nn.utils import clip_grad_norm_
from torch.cuda import is_available
from torch import cuda

from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, AutoConfig,
    get_scheduler, DataCollatorForTokenClassification
)
from transformers import AutoTokenizer

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score,
    precision_score, recall_score
)
from itertools import chain
from copy import deepcopy

from tqdm.auto import tqdm

### Colab Pro

In [None]:
# --- Check GPU availability ---
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Dec 27 18:24:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# --- Check RAM availability ---
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


### Paths setup


In [None]:
# --- Mount Google Drive (for Google Colab users) ---
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Define Paths ---
laboratory = 'Laboratory4'

base_path = '/content/drive/MyDrive/'
project_path = base_path + f'Projects/{laboratory}/'
data_path = project_path + 'data/'
results_path = project_path + 'results/'

# Ensure directories exist
os.makedirs(project_path, exist_ok=True)
os.makedirs(data_path, exist_ok=True)
os.makedirs(results_path, exist_ok=True)

print(f"Project path: {project_path}")
print(f"Data path: {data_path}")
print(f"Results path: {results_path}")

Project path: /content/drive/MyDrive/Projects/Laboratory4/
Data path: /content/drive/MyDrive/Projects/Laboratory4/data/
Results path: /content/drive/MyDrive/Projects/Laboratory4/results/


In [None]:
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

## Helper Functions

In [None]:
# ============================================================================
# GLOBAL CONFIGURATION: Plot Saving (Colab/Google Drive only)
# ============================================================================
SAVE_PLOTS = 1
# ============================================================================

import os
import matplotlib.pyplot as plt

BASE_DIR = results_path + 'Task4'
os.makedirs(BASE_DIR, exist_ok=True)

def save_figure_for_report(filename, dpi=300, bbox_inches='tight'):
    """
    Save the current matplotlib figure for use in the report.

    Args:
        filename: Name of the file (e.g., 'class_distribution.png')
        dpi: Resolution (default 300 for high quality)
        bbox_inches: Bounding box setting (default 'tight' to remove whitespace)
    """
    if not SAVE_PLOTS:
        return  # Skip saving if flag is disabled or filename missing

    filepath = os.path.join(BASE_DIR, filename)
    plt.savefig(filepath, dpi=dpi, bbox_inches=bbox_inches)
    print(f"Figure saved to: {filepath}")


In [None]:
def compute_metrics(full_predictions, full_labels):
    """Compute token-level classification metrics"""
    flat_predictions = list(chain(*full_predictions))
    flat_labels = list(chain(*full_labels))

    token_accuracy = accuracy_score(flat_labels, flat_predictions)
    token_precision = precision_score(flat_labels, flat_predictions, average='macro', zero_division=0)
    token_recall = recall_score(flat_labels, flat_predictions, average='macro', zero_division=0)
    token_f1 = f1_score(flat_labels, flat_predictions, average='macro', zero_division=0)

    metrics = {
        "token_accuracy": token_accuracy,
        "token_precision": token_precision,
        "token_recall": token_recall,
        "token_f1": token_f1,
    }
    return metrics

In [None]:
def postprocess(predictions, labels):
    """Convert predictions and labels to original label format"""
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_predictions, true_labels

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Align word-level labels to token-level labels"""
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            new_labels.append(label)
    return new_labels

In [None]:
def tokenize_and_align_labels_unixcoder(samples, tokenizer):
    """Tokenize and align labels for UniXcoder"""
    split_sentences = [s.split(" ") for s in samples["session"]]

    tokenized = tokenizer(
        split_sentences,
        truncation=True,
        is_split_into_words=True
    )

    all_labels = samples["label_id"]
    aligned_all = []

    for i, labels in enumerate(all_labels):
        word_ids = tokenized.word_ids(i)

        aligned = []
        prev_word = None

        for wid in word_ids:
            if wid is None:
                aligned.append(-100)
            else:
                if wid != prev_word:
                    aligned.append(labels[wid])
                    prev_word = wid
                else:
                    aligned.append(-100)
        aligned_all.append(aligned)

    tokenized["labels"] = aligned_all
    return tokenized


In [None]:
def training_loop(model, optimizer, lr_scheduler, train_loader, val_loader, device, num_epochs):
    scaler = torch.amp.GradScaler()
    best_val_loss = np.inf
    best_weights = deepcopy(model.state_dict())

    # Calculate steps dynamically based on the passed loader
    num_training_steps = num_epochs * len(train_loader)
    progress_bar = tqdm(range(num_training_steps))

    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            # Move batch to device
            batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
            optimizer.zero_grad(set_to_none=True)

            # Automatic Mixed Precision
            with torch.amp.autocast(device_type='cuda'):
                outputs = model(**batch)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()

            train_loss += loss.item()
            progress_bar.update(1)

        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Evaluation
        model.eval()
        val_loss = 0
        predictions_list, labels_list = [], []

        for batch in val_loader:
            batch = {key: value.to(device, non_blocking=True) for key, value in batch.items()}
            with torch.no_grad():
                with torch.amp.autocast(device_type='cuda'):  # ← FIXED HERE
                    outputs = model(**batch)

            val_loss += outputs.loss.item()
            predictions = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]

            true_predictions, true_labels = postprocess(predictions, labels)
            predictions_list += true_predictions
            labels_list += true_labels

        # Compute validation metrics
        val_metrics = compute_metrics(predictions_list, labels_list)
        val_accuracy = val_metrics["token_accuracy"]

        avg_val_loss = val_loss / len(val_loader)
        val_losses.append(avg_val_loss)

        # Optional: Print progress
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss <= best_val_loss:
            best_epoch = epoch
            best_val_loss = avg_val_loss
            best_weights = deepcopy(model.state_dict())

    # Load the best weights found during this specific training run
    model.load_state_dict(best_weights)
    return model, best_epoch, best_val_loss, train_losses, val_losses

In [None]:
def evaluate_model(model, dataloader, device):
    """Evaluate model on a dataset"""
    model.eval()
    full_predictions, full_labels = [], []
    for batch in dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        true_predictions, true_labels = postprocess(predictions, labels)
        full_predictions += true_predictions
        full_labels += true_labels

    test_metrics = compute_metrics(full_predictions, full_labels)
    return full_predictions, full_labels, test_metrics

## Re-Train Best Model from Task 3

### Dataset Loading and preparation

In [None]:
# Load training and test data
train_df = pd.read_json(f"{data_path}train.json")
test_df = pd.read_json(f"{data_path}test.json")

print(f"Training dataset: {train_df.shape[0]} sessions")
print(f"Test dataset: {test_df.shape[0]} sessions")

Training dataset: 251 sessions
Test dataset: 108 sessions


In [None]:
# Split training data into train and validation
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
print(f"Split training: {train_df.shape[0]} train, {val_df.shape[0]} validation")

Split training: 200 train, 51 validation


In [None]:
# Create label mappings
unique_labels = list(train_df.label.explode().unique())
print(f"Unique labels: {unique_labels}")

id2label = {it: label for it, label in enumerate(unique_labels)}
label2id = {label: it for it, label in enumerate(unique_labels)}

print(f"Label mappings: {label2id}")

Unique labels: ['Execution', 'Discovery', 'Not Malicious Yet', 'Persistence', 'Other', 'Defense Evasion', 'Impact']
Label mappings: {'Execution': 0, 'Discovery': 1, 'Not Malicious Yet': 2, 'Persistence': 3, 'Other': 4, 'Defense Evasion': 5, 'Impact': 6}


In [None]:
# Create Hugging Face datasets
full_ds = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "valid": Dataset.from_pandas(val_df.reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df.reset_index(drop=True)),
})

In [None]:
# Convert labels to IDs
def convert_labels_to_ids(sample):
    sample['label_id'] = [label2id[el] for el in sample["label"]]
    return sample

encoded_dataset = full_ds.map(convert_labels_to_ids)
print(f"Encoded dataset: {encoded_dataset}")

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Encoded dataset: DatasetDict({
    train: Dataset({
        features: ['session', 'label', 'label_id'],
        num_rows: 200
    })
    valid: Dataset({
        features: ['session', 'label', 'label_id'],
        num_rows: 51
    })
    test: Dataset({
        features: ['session', 'label', 'label_id'],
        num_rows: 108
    })
})


### Tokenization

In [None]:
# Create tokenizer and model checkpoint
unixcoder_model_checkpoint = "microsoft/unixcoder-base"
unixcoder_tokenizer = AutoTokenizer.from_pretrained(
    unixcoder_model_checkpoint,
    add_prefix_space=True,
    use_fast=True,
    model_max_length=512
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Tokenize datasets
original_columns = encoded_dataset["train"].column_names
tokenized_datasets = encoded_dataset.map(
    lambda x: tokenize_and_align_labels_unixcoder(x, unixcoder_tokenizer),
    batched=True,
    remove_columns=original_columns,
)

print(f"Tokenized dataset: {tokenized_datasets}")

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Tokenized dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 51
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 108
    })
})


In [None]:
# Create data collator
data_collator = DataCollatorForTokenClassification(
    tokenizer=unixcoder_tokenizer,
    return_tensors="pt"
)

In [None]:
# Create DataLoaders
tokenized_datasets.set_format("torch")

BATCH_SIZE = 32
NUM_WORKERS = 2

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True
)

eval_dataloader = DataLoader(
    tokenized_datasets["valid"],
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True
)

test_dataloader = DataLoader(
    tokenized_datasets["test"],
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True
)

print("DataLoaders created successfully")

DataLoaders created successfully


### Train the model

In [None]:
print("\n" + "="*80)
print("SECTION 3: TRAIN BEST MODEL FROM TASK 3")
print("UNIXCODER FULL FINE-TUNE WITH LR=1e-05")
print("="*80)

N_TRAIN_EPOCHS = 40
BEST_LR = 1e-5

print(f"\nTraining parameters:")
print(f"  - Model: UniXcoder (microsoft/unixcoder-base)")
print(f"  - Learning Rate: {BEST_LR}")
print(f"  - Epochs: {N_TRAIN_EPOCHS}")
print(f"  - Batch Size: {BATCH_SIZE}")
print(f"  - Device: {device}")


SECTION 3: TRAIN BEST MODEL FROM TASK 3
UNIXCODER FULL FINE-TUNE WITH LR=1e-05

Training parameters:
  - Model: UniXcoder (microsoft/unixcoder-base)
  - Learning Rate: 1e-05
  - Epochs: 40
  - Batch Size: 32
  - Device: cuda


In [None]:
# Initialize model
best_model = AutoModelForTokenClassification.from_pretrained(
    unixcoder_model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(device)

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/504M [00:00<?, ?B/s]

In [None]:
# Setup optimizer and scheduler
optimizer = optim.AdamW(best_model.parameters(), lr=BEST_LR)

num_training_steps = N_TRAIN_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Train
best_model, best_epoch, best_val_loss, train_losses, val_losses = training_loop(
    best_model, optimizer, lr_scheduler,
    train_dataloader, eval_dataloader, device, N_TRAIN_EPOCHS
)

print("\n" + "="*80)
print(f"Training complete! Best epoch: {best_epoch + 1}, Best val loss: {best_val_loss:.4f}")
print("="*80)

  0%|          | 0/280 [00:00<?, ?it/s]

Epoch 1/40 | Train Loss: 1.9318 | Val Loss: 1.2478
Epoch 2/40 | Train Loss: 1.0164 | Val Loss: 0.8637
Epoch 3/40 | Train Loss: 0.7952 | Val Loss: 0.7271
Epoch 4/40 | Train Loss: 0.7297 | Val Loss: 0.5989
Epoch 5/40 | Train Loss: 0.5111 | Val Loss: 0.5384
Epoch 6/40 | Train Loss: 0.4083 | Val Loss: 0.4836
Epoch 7/40 | Train Loss: 0.3318 | Val Loss: 0.4659
Epoch 8/40 | Train Loss: 0.3096 | Val Loss: 0.4566
Epoch 9/40 | Train Loss: 0.2365 | Val Loss: 0.4204
Epoch 10/40 | Train Loss: 0.1952 | Val Loss: 0.3914
Epoch 11/40 | Train Loss: 0.1871 | Val Loss: 0.3838
Epoch 12/40 | Train Loss: 0.1576 | Val Loss: 0.4036
Epoch 13/40 | Train Loss: 0.1538 | Val Loss: 0.3694
Epoch 14/40 | Train Loss: 0.1191 | Val Loss: 0.3726
Epoch 15/40 | Train Loss: 0.1041 | Val Loss: 0.3476
Epoch 16/40 | Train Loss: 0.0841 | Val Loss: 0.3786
Epoch 17/40 | Train Loss: 0.0961 | Val Loss: 0.3757
Epoch 18/40 | Train Loss: 0.0786 | Val Loss: 0.3659
Epoch 19/40 | Train Loss: 0.0794 | Val Loss: 0.3860
Epoch 20/40 | Train L

In [None]:
# Evaluate on test set
print("\nEvaluating on test set...")
test_preds, test_labels, test_metrics = evaluate_model(best_model, test_dataloader, device)

print(f"\nTest Set Performance:")
print(f"  - Token Accuracy: {test_metrics['token_accuracy']:.4f}")
print(f"  - Macro F1-Score: {test_metrics['token_f1']:.4f}")
print(f"  - Macro Precision: {test_metrics['token_precision']:.4f}")
print(f"  - Macro Recall: {test_metrics['token_recall']:.4f}")


Evaluating on test set...

Test Set Performance:
  - Token Accuracy: 0.9275
  - Macro F1-Score: 0.7897
  - Macro Precision: 0.8860
  - Macro Recall: 0.7374


In [None]:
# # Save best model for later use in inference
# model_save_path = os.path.join(results_path, "best_unixcoder_model_task3")
# os.makedirs(model_save_path, exist_ok=True)
# best_model.save_pretrained(model_save_path)
# unixcoder_tokenizer.save_pretrained(model_save_path)

# print(f"\nBest model saved to: {model_save_path}")

## Inference Datasets

In [None]:
# Load cyberlab.csv
cyberlab_df = pd.read_csv(f"{data_path}cyberlab.csv")

print(f"Cyberlab dataset: {len(cyberlab_df)} sessions")
print(f"Columns: {list(cyberlab_df.columns)}")
print(f"Data types:\n{cyberlab_df.dtypes}")
print(f"\nFirst few rows:")
print(cyberlab_df.head(3))

# Convert timestamps_statements to datetime
cyberlab_df['timestamps_statements'] = pd.to_datetime(cyberlab_df['timestamps_statements'])
cyberlab_df['date'] = cyberlab_df['timestamps_statements'].dt.date

print(f"\nDate range: {cyberlab_df['date'].min()} to {cyberlab_df['date'].max()}")

Cyberlab dataset: 174262 sessions
Columns: ['session', 'timestamps_statements', 'country_name']
Data types:
session                  object
timestamps_statements    object
country_name             object
dtype: object

First few rows:
                                             session  \
0  enable ; system ; shell ; sh ; cat /proc/mount...   
1  enable ; system ; shell ; sh ; cat /proc/mount...   
2  enable ; system ; shell ; sh ; cat /proc/mount...   

              timestamps_statements country_name  
0  2019-09-01 00:00:10.493808+00:00       Israel  
1  2019-09-01 00:38:41.134935+00:00       Israel  
2  2019-09-01 00:39:26.263383+00:00       Israel  

Date range: 2019-09-01 to 2019-12-30


## Inference Functions

In [None]:
def truncate_long_words(session, max_length=20):
    """Truncate words longer than max_length (matching Task 3 preprocessing)"""
    words = session.split()
    truncated = []
    for word in words:
        if len(word) > max_length:
            truncated.append(word[:max_length-3] + '...')
        else:
            truncated.append(word)
    return ' '.join(truncated)

## Batch Inference on Cyberlab Dataset

In [34]:
import time
start_time = time.time()

commands_of_interest = ['cat', 'grep', 'echo', 'rm']

# Step 1: Preprocess all sessions (fast - vectorized)
print("Step 1: Preprocessing sessions (vectorizing)...")
t1 = time.time()
cyberlab_df['session_clean'] = cyberlab_df['session'].apply(truncate_long_words)
cyberlab_df['words'] = cyberlab_df['session_clean'].str.split()
t1_elapsed = time.time() - t1
print(f"  ✓ Done in {t1_elapsed:.1f}s")

# Step 2: Tokenize in MINI-BATCHES to avoid memory issues
print(f"Step 2: Tokenizing {len(cyberlab_df)} sessions in mini-batches...")
t2 = time.time()

MINI_BATCH_SIZE = 512  # Tokenize 512 sessions at a time
all_word_ids = []
all_predictions = []

total_batches = (len(cyberlab_df) + MINI_BATCH_SIZE - 1) // MINI_BATCH_SIZE

for batch_idx in range(total_batches):
    batch_start = batch_idx * MINI_BATCH_SIZE
    batch_end = min(batch_start + MINI_BATCH_SIZE, len(cyberlab_df))

    # Get batch sessions
    batch_words = [cyberlab_df.iloc[i]['words'] for i in range(batch_start, batch_end)]

    # Tokenize batch WITHOUT return_tensors first (to keep word_ids method)
    tokenized = unixcoder_tokenizer(
        batch_words,
        truncation=True,
        is_split_into_words=True,
        padding=True
    )

    # Extract word_ids BEFORE converting to tensors
    batch_word_ids = [tokenized.word_ids(i) for i in range(len(batch_words))]

    # Now convert to tensors
    tokenized = {k: torch.tensor(v) for k, v in tokenized.items()}

    # Move to device
    tokenized = {k: v.to(device) for k, v in tokenized.items()}

    # Inference on batch
    best_model.eval()
    with torch.no_grad():
        outputs = best_model(**tokenized)
        logits = outputs.logits

    # Store predictions and word_ids
    preds = logits.argmax(dim=-1).cpu().numpy()

    for session_offset in range(len(batch_words)):
        all_word_ids.append(batch_word_ids[session_offset])
        all_predictions.append(preds[session_offset])

    # Clean up
    del tokenized, logits, preds
    torch.cuda.empty_cache()

    if (batch_idx + 1) % 50 == 0:
        elapsed = time.time() - t2
        rate = (batch_idx + 1) * MINI_BATCH_SIZE / elapsed
        remaining = (total_batches - batch_idx - 1) * MINI_BATCH_SIZE / rate
        print(f"  Batch {batch_idx + 1}/{total_batches} | {rate:.0f} sess/s | ETA: {remaining/60:.1f}min")

t2_elapsed = time.time() - t2
print(f"  ✓ Tokenization & inference done in {t2_elapsed:.1f}s ({len(cyberlab_df)/t2_elapsed:.0f} sessions/sec)")

# Step 3: Align predictions to words (fast - vectorized)
print("Step 3: Aligning predictions to words...")
t3 = time.time()

results = []

for session_idx in range(len(cyberlab_df)):
    try:
        words = cyberlab_df.iloc[session_idx]['words']
        word_ids = all_word_ids[session_idx]
        predictions_session = all_predictions[session_idx]

        # Extract only first token prediction per word
        aligned_preds_ids = []
        seen_words = set()

        for token_idx, word_id in enumerate(word_ids):
            if word_id is None:
                continue
            if word_id not in seen_words:
                aligned_preds_ids.append(predictions_session[token_idx])
                seen_words.add(word_id)

        # Convert to labels
        aligned_preds = [id2label[pred_id] for pred_id in aligned_preds_ids]

        # Extract command tags for this session
        command_tags = defaultdict(list)
        for word_idx, word in enumerate(words):
            if word_idx < len(aligned_preds) and word in commands_of_interest:
                command_tags[word].append(aligned_preds[word_idx])

        # Store result
        results.append({
            'session_id': session_idx,
            'fingerprint': tuple(aligned_preds),
            'timestamp': cyberlab_df.iloc[session_idx]['timestamps_statements'],
            'date': cyberlab_df.iloc[session_idx]['date'],
            'country': cyberlab_df.iloc[session_idx]['country_name'],
            'command_tags': dict(command_tags)
        })

    except Exception as e:
        print(f"  ERROR on session {session_idx}: {e}")

results_df = pd.DataFrame(results)
t3_elapsed = time.time() - t3
print(f"  ✓ Alignment done in {t3_elapsed:.1f}s")

print(f"\n✓ Successfully processed: {len(results_df)}/{len(cyberlab_df)} sessions")

# Clean up
del all_word_ids, all_predictions
torch.cuda.empty_cache()

# Print timing summary
total_time = time.time() - start_time
print("\n" + "="*80)
print("TIMING SUMMARY")
print("="*80)
print(f"Step 1 (Preprocess):        {t1_elapsed:>7.1f}s")
print(f"Step 2 (Tokenize+Infer):    {t2_elapsed:>7.1f}s")
print(f"Step 3 (Alignment):         {t3_elapsed:>7.1f}s")
print(f"{'─'*40}")
print(f"Total:                      {total_time:>7.1f}s ({total_time/60:.1f} min)")
print("="*80)

Step 1: Preprocessing sessions (vectorizing)...
  ✓ Done in 5.2s
Step 2: Tokenizing 174262 sessions in mini-batches...
  Batch 50/341 | 41 sess/s | ETA: 61.1min
  Batch 100/341 | 42 sess/s | ETA: 48.9min


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.77 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.58 GiB is free. Process 8459 has 12.16 GiB memory in use. Of the allocated memory 7.19 GiB is allocated by PyTorch, and 4.84 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
commands_of_interest = ['cat', 'grep', 'echo', 'rm']
command_analysis = defaultdict(lambda: {'tags': [], 'examples': defaultdict(list)})

for _, row in results_df.iterrows():
    words = row['words']
    predictions = row['predictions']
    session_text = row['session_text']

    # Only consider words that have predictions (handle truncation)
    for word_idx in range(min(len(words), len(predictions))):
        word = words[word_idx]

        if word in commands_of_interest:
            tag = predictions[word_idx]
            command_analysis[word]['tags'].append(tag)

            # Store example per (command, tag) pair - keep up to 2 examples
            if len(command_analysis[word]['examples'][tag]) < 2:
                command_analysis[word]['examples'][tag].append({
                    'session': session_text[:300],
                    'words': words[:20],
                    'predictions': predictions[:20]
                })


In [None]:
# Create command-tag frequency table
print("\n" + "-"*80)
print("Command-Tag Frequency Distribution")
print("-"*80)

command_tag_table = []
for cmd in commands_of_interest:
    if cmd in command_analysis:
        tags = command_analysis[cmd]['tags']
        if len(tags) > 0:
            tag_counts = Counter(tags)
            total = len(tags)

            for tag, count in tag_counts.most_common():
                freq = (count / total) * 100
                command_tag_table.append({
                    'Command': cmd,
                    'Tag': tag,
                    'Count': count,
                    'Frequency (%)': f"{freq:.1f}"
                })

if command_tag_table:
    command_tag_df = pd.DataFrame(command_tag_table)
    print(command_tag_df.to_string(index=False))
    command_tag_df.to_csv(f"{results_path}command_tag_frequency.csv", index=False)

    # Determine if commands are uniquely associated with single tag
    print("\n" + "-"*80)
    print("Command Uniqueness Analysis")
    print("-"*80)
    for cmd in commands_of_interest:
        if cmd in command_analysis:
            unique_tags = len(set(command_analysis[cmd]['tags']))
            total_occurrences = len(command_analysis[cmd]['tags'])
            print(f"'{cmd}': {unique_tags} unique tag(s) across {total_occurrences} occurrences", end="")
            if unique_tags == 1:
                print(" ✓ (uniquely associated)")
            else:
                print(" ✗ (NOT uniquely associated)")

    # Plot command-tag distribution
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('MITRE Tag Distribution by Command', fontsize=18, fontweight='bold', y=1.00)

    for idx, cmd in enumerate(commands_of_interest):
        ax = axes[idx // 2, idx % 2]

        if cmd in command_analysis and len(command_analysis[cmd]['tags']) > 0:
            tags = command_analysis[cmd]['tags']
            tag_counts = Counter(tags)

            tags_list = list(tag_counts.keys())
            counts = list(tag_counts.values())

            colors = sns.color_palette("husl", len(tags_list))
            bars = ax.bar(tags_list, counts, color=colors, edgecolor='black', linewidth=1.5)

            # Add count labels on bars
            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                       f'{int(height)}',
                       ha='center', va='bottom', fontweight='bold')

            ax.set_title(f"Command: '{cmd}' (n={len(tags)})", fontsize=14, fontweight='bold')
            ax.set_ylabel("Frequency", fontsize=12, fontweight='bold')
            ax.set_xlabel("MITRE Tactic", fontsize=12, fontweight='bold')
            ax.tick_params(axis='x', rotation=45, labelsize=11)
            ax.tick_params(axis='y', labelsize=11)
            ax.grid(axis='y', alpha=0.3)
        else:
            ax.text(0.5, 0.5, f"No '{cmd}' found", ha='center', va='center', fontsize=14)
            ax.set_xticks([])
            ax.set_yticks([])

    plt.tight_layout()
    plt.savefig(f"{results_path}command_tag_distribution.png", dpi=300, bbox_inches='tight')
    print(f"\n✓ Saved: command_tag_distribution.png")
    plt.show()

In [None]:
# Find unique fingerprints and sort by first appearance date
unique_fingerprints = results_df['fingerprint'].unique()
print(f"\nTotal unique fingerprints: {len(unique_fingerprints)}")

# Create fingerprint table with metadata
fingerprint_data = []
for fp in unique_fingerprints:
    fp_sessions = results_df[results_df['fingerprint'] == fp]
    first_date = fp_sessions['date'].min()
    last_date = fp_sessions['date'].max()
    session_count = len(fp_sessions)
    days_active = (fp_sessions['date'].nunique())

    fingerprint_data.append({
        'fingerprint': fp,
        'first_seen': first_date,
        'last_seen': last_date,
        'session_count': session_count,
        'days_active': days_active,
        'fingerprint_length': len(fp)
    })

In [None]:

fingerprint_df = pd.DataFrame(fingerprint_data)

# Sort by first appearance date
fingerprint_df = fingerprint_df.sort_values('first_seen').reset_index(drop=True)

# Assign fingerprint IDs
fingerprint_df['fp_id'] = range(len(fingerprint_df))

print(f"\nTop 15 Fingerprints by Session Count:")
print("-"*80)
top_fps = fingerprint_df.nlargest(15, 'session_count')
print(top_fps[['fp_id', 'first_seen', 'session_count', 'days_active', 'fingerprint_length']].to_string(index=False))

# Merge fingerprint IDs back to results
fp_to_id = dict(zip(fingerprint_df['fingerprint'], fingerprint_df['fp_id']))
results_df['fp_id'] = results_df['fingerprint'].map(fp_to_id)

# Count sessions per fingerprint per day
daily_fp_counts = results_df.groupby(['date', 'fp_id']).size().reset_index(name='session_count')

print(f"\nDaily fingerprint counts: {len(daily_fp_counts)} date-fingerprint pairs")

# Save fingerprint data
fingerprint_df.to_csv(f"{results_path}fingerprints.csv", index=False)
daily_fp_counts.to_csv(f"{results_path}daily_fingerprint_counts.csv", index=False)
print(f"✓ Saved: fingerprints.csv and daily_fingerprint_counts.csv")

In [None]:
fig, ax = plt.subplots(figsize=(18, 12))

# Prepare data for scatter plot
dates = daily_fp_counts['date']
fp_ids = daily_fp_counts['fp_id']
sizes = daily_fp_counts['session_count'] * 40  # Scale point size
colors = daily_fp_counts['session_count']  # Color by session count

# Create scatter plot
scatter = ax.scatter(
    dates, fp_ids,
    s=sizes,
    c=colors,
    cmap='YlOrRd',
    alpha=0.7,
    edgecolors='black',
    linewidth=0.8
)

ax.set_xlabel('Date', fontsize=16, fontweight='bold')
ax.set_ylabel('Fingerprint ID (sorted by first appearance)', fontsize=16, fontweight='bold')
ax.set_title('MITRE Tactics Fingerprints Over Time in Honeypot', fontsize=18, fontweight='bold', pad=20)
ax.grid(True, alpha=0.3, linestyle='--')

# Add colorbar
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Number of Sessions per Day', fontsize=14, fontweight='bold')
cbar.ax.tick_params(labelsize=12)

# Format x-axis dates
ax.tick_params(axis='x', rotation=45, labelsize=12)
ax.tick_params(axis='y', labelsize=12)

plt.tight_layout()
plt.savefig(f"{results_path}fingerprint_timeline.png", dpi=300, bbox_inches='tight')
print(f"✓ Saved: fingerprint_timeline.png")
plt.show()

In [None]:
# Identify persistent fingerprints
fp_days_count = results_df.groupby('fp_id')['date'].nunique().reset_index(name='days_present')
fp_days_count = fp_days_count.merge(fingerprint_df[['fp_id', 'session_count']], on='fp_id')

print(f"\nFingerprint Persistence Analysis:")
print("-"*80)

# Top persistent fingerprints (present on many days)
top_persistent = fp_days_count.nlargest(10, 'days_present')
print("\nTop 10 Most Persistent Fingerprints (active on most days):")
print(top_persistent[['fp_id', 'days_present', 'session_count']].to_string(index=False))

# Top volume fingerprints (most sessions)
top_volume = fingerprint_df.nlargest(10, 'session_count')
print("\n\nTop 10 High-Volume Fingerprints (most sessions):")
print(top_volume[['fp_id', 'session_count', 'days_active', 'first_seen']].to_string(index=False))

# Identify "always present" fingerprints (present almost every day)
total_days = results_df['date'].nunique()
always_present = fp_days_count[fp_days_count['days_present'] >= (0.9 * total_days)]
print(f"\n\nFingerprints present on ≥90% of days ({int(0.9*total_days)} days):")
if len(always_present) > 0:
    print(always_present[['fp_id', 'days_present', 'session_count']].to_string(index=False))
else:
    print("None found")

# Identify potential attack campaigns (sudden spike in fingerprints)
print(f"\n\nPotential Attack Campaigns (sudden volume spikes):")
print("-"*80)

campaign_threshold = results_df.groupby(['date', 'fp_id']).size().max() * 0.5

suspicious_days = daily_fp_counts[daily_fp_counts['session_count'] > campaign_threshold]
if len(suspicious_days) > 0:
    print(f"\nDates with unusually high session counts (>{campaign_threshold:.0f} sessions):")
    suspicious_dates = suspicious_days.groupby('date').size()
    for date, count in suspicious_dates.head(10).items():
        print(f"  {date}: {count} fingerprint(s) with high activity")
        # Show details
        day_fps = suspicious_days[suspicious_days['date'] == date].sort_values('session_count', ascending=False)
        for _, row in day_fps.head(3).iterrows():
            fp_info = fingerprint_df[fingerprint_df['fp_id'] == int(row['fp_id'])].iloc[0]
            print(f"    - FP_ID {int(row['fp_id'])}: {row['session_count']} sessions, Tactic length: {fp_info['fingerprint_length']}")
else:
    print("No significant volume spikes detected")