# Task 4

## Setup

In [None]:
# --- Check Python and pip versions ---
!python --version
!pip install --upgrade pip

Python 3.12.12
Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3


In [None]:
# --- Install required libraries ---
!pip install torch
!pip install numpy pandas scikit-learn matplotlib seaborn
!pip install tqdm



In [1]:
# --- Setup & Imports ---
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter, defaultdict
import json
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.cuda.amp import GradScaler
from torch.nn.utils import clip_grad_norm_
from torch.cuda import is_available

from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, AutoConfig,
    get_scheduler, DataCollatorForTokenClassification
)
from transformers import AutoTokenizer

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score,
    precision_score, recall_score
)
from itertools import chain
from copy import deepcopy

from tqdm.auto import tqdm

### Colab Pro

In [None]:
# --- Check GPU availability ---
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Dec 17 15:55:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# --- Check RAM availability ---
from psutil import virtual_memory

# Total memory reported by psutil, converted to decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Below 20 GB the runtime is reported as a standard (non high-RAM) one.
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


### Paths setup


In [None]:
# --- Mount Google Drive (for Google Colab users) ---
# `google.colab` is provided by the Colab runtime; this cell is only meant to
# run there.
from google.colab import drive
# Mount point for the user's Drive. The path constants defined in the next
# cell are rooted at /content/drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- Define Paths ---
laboratory = 'Laboratory4'

# Every path keeps its trailing slash: later cells build file names by plain
# string concatenation (path + file name).
base_path = '/content/drive/MyDrive/'
project_path = base_path + f'Projects/{laboratory}/'
data_path = project_path + 'data/'
results_path = project_path + 'results/'

# Ensure directories exist (no error when they are already there)
for _directory in (project_path, data_path, results_path):
    os.makedirs(_directory, exist_ok=True)

print(f"Project path: {project_path}")
print(f"Data path: {data_path}")
print(f"Results path: {results_path}")

Project path: /content/drive/MyDrive/Projects/Laboratory4/
Data path: /content/drive/MyDrive/Projects/Laboratory4/data/
Results path: /content/drive/MyDrive/Projects/Laboratory4/results/


In [None]:
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Select the compute device once, up front. The model-creation, training,
# evaluation and inference cells all reference `device`; without this
# definition they raise NameError on a freshly restarted kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

## Helper Functions 

In [None]:
# ============================================================================
# GLOBAL CONFIGURATION: Plot Saving (Colab/Google Drive only)
# ============================================================================
SAVE_PLOTS = 1  # truthy -> figures are written to disk; falsy (0 / False) -> saving is skipped
# ============================================================================

import os
import matplotlib.pyplot as plt

# Directory that receives every figure of this task. os.path.join keeps the
# result correct whether or not `results_path` ends with a path separator.
BASE_DIR = os.path.join(results_path, 'Task4')
os.makedirs(BASE_DIR, exist_ok=True)

def save_figure_for_report(filename, dpi=300, bbox_inches='tight'):
    """
    Save the current matplotlib figure for use in the report.

    Nothing is written when the global SAVE_PLOTS flag is falsy or when
    `filename` is empty/None.

    Args:
        filename: Name of the file (e.g., 'class_distribution.png'); it is
            joined onto BASE_DIR.
        dpi: Resolution (default 300 for high quality)
        bbox_inches: Bounding box setting (default 'tight' to remove whitespace)
    """
    # Guard both conditions: a missing filename would otherwise make
    # os.path.join raise instead of skipping the save.
    if not SAVE_PLOTS or not filename:
        return

    filepath = os.path.join(BASE_DIR, filename)
    plt.savefig(filepath, dpi=dpi, bbox_inches=bbox_inches)
    print(f"Figure saved to: {filepath}")


In [None]:
def compute_metrics(full_predictions, full_labels):
    """Token-level accuracy plus macro-averaged precision, recall and F1.

    Both arguments are sequences of per-session label sequences; they are
    flattened into one long token list before scoring. Classes without any
    predicted/true sample contribute 0 to the macro averages (zero_division=0).
    """
    y_pred = list(chain.from_iterable(full_predictions))
    y_true = list(chain.from_iterable(full_labels))

    # Options shared by the three macro-averaged scores.
    macro_options = {"average": 'macro', "zero_division": 0}

    return {
        "token_accuracy": accuracy_score(y_true, y_pred),
        "token_precision": precision_score(y_true, y_pred, **macro_options),
        "token_recall": recall_score(y_true, y_pred, **macro_options),
        "token_f1": f1_score(y_true, y_pred, **macro_options),
    }

In [None]:
def postprocess(predictions, labels):
    """Map id tensors back to label names, dropping positions labelled -100.

    Relies on the notebook-level `id2label` mapping. Returns
    (true_predictions, true_labels), each a list with one list of label names
    per row.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    # Gold labels: keep every position that is not the ignore index.
    true_labels = []
    for label_row in label_rows:
        true_labels.append([id2label[gold] for gold in label_row if gold != -100])

    # Predictions: keep exactly the positions whose gold label is kept.
    true_predictions = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        kept = []
        for guess, gold in zip(pred_row, label_row):
            if gold != -100:
                kept.append(id2label[guess])
        true_predictions.append(kept)

    return true_predictions, true_labels

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    A token whose word id is None receives -100; every other token receives
    the label of the word it belongs to, so all sub-tokens of a word share
    that word's label.
    """
    return [-100 if word_id is None else labels[word_id] for word_id in word_ids]

In [None]:
def tokenize_and_align_labels_unixcoder(samples, tokenizer):
    """Tokenize a batch of sessions and attach token-level labels.

    Each session string is split on single spaces and tokenized as a list of
    words (with truncation). For every sample, only the first token of each
    word keeps the word's label from samples["label_id"]; tokens without a
    word id and the remaining tokens of a word are set to -100. The aligned
    labels are stored under "labels" on the tokenizer output, which is returned.
    """
    word_lists = [session.split(" ") for session in samples["session"]]
    tokenized = tokenizer(word_lists, truncation=True, is_split_into_words=True)

    def _first_token_labels(word_labels, word_ids):
        # Label the token that opens a word; mask everything else.
        aligned = []
        last_word = None
        for word_id in word_ids:
            opens_new_word = word_id is not None and word_id != last_word
            if opens_new_word:
                aligned.append(word_labels[word_id])
                last_word = word_id
            else:
                aligned.append(-100)
        return aligned

    tokenized["labels"] = [
        _first_token_labels(word_labels, tokenized.word_ids(sample_idx))
        for sample_idx, word_labels in enumerate(samples["label_id"])
    ]
    return tokenized


In [None]:
def training_loop(model, optimizer, lr_scheduler, train_loader, val_loader, device, num_epochs):
    """Fine-tune `model` and restore the weights of the best validation epoch.

    Each epoch runs one pass over `train_loader` (gradient norm clipped to 1.0,
    scheduler stepped once per batch) followed by one pass over `val_loader`
    to measure the average validation loss. Mixed precision is used only when
    `device` is a CUDA device; on any other device the loop runs in full
    precision.

    Args:
        model: Model whose forward pass accepts the batch as keyword arguments
            and returns an object exposing `.loss`.
        optimizer: Optimizer over `model`'s parameters.
        lr_scheduler: Scheduler stepped after every training batch.
        train_loader: Iterable of dict batches of tensors used for training.
        val_loader: Iterable of dict batches of tensors used for validation.
        device: Device (string or torch.device) the batches are moved to.
        num_epochs: Number of epochs to run.

    Returns:
        (model, best_epoch, best_val_loss, train_losses, val_losses) where
        `best_epoch` is the zero-based epoch with the lowest validation loss,
        or -1 when no epoch produced a comparable loss (no epochs run, or the
        validation loss was NaN every time); in that case the initial weights
        are kept. The two lists hold the per-epoch average losses.
    """
    # Autocast / gradient scaling only make sense on CUDA; everywhere else they
    # are switched off so the same loop also runs on CPU.
    use_amp = torch.device(device).type == 'cuda'
    autocast_device = 'cuda' if use_amp else 'cpu'
    scaler = torch.amp.GradScaler(enabled=use_amp)

    best_val_loss = np.inf
    best_epoch = -1  # stays -1 if no epoch ever improves on +inf
    best_weights = deepcopy(model.state_dict())

    # Calculate steps dynamically based on the passed loader
    num_training_steps = num_epochs * len(train_loader)
    progress_bar = tqdm(range(num_training_steps))

    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        train_loss = 0
        for batch in train_loader:
            # Move batch to device
            batch = {k: v.to(device, non_blocking=True) for k, v in batch.items()}
            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast(device_type=autocast_device, enabled=use_amp):
                outputs = model(**batch)
                loss = outputs.loss

            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping so that max_norm
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()

            train_loss += loss.item()
            progress_bar.update(1)

        avg_train_loss = train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # ---- Validation pass (loss only; it drives model selection) ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = {key: value.to(device, non_blocking=True) for key, value in batch.items()}
                with torch.amp.autocast(device_type=autocast_device, enabled=use_amp):
                    outputs = model(**batch)
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        val_losses.append(avg_val_loss)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # `<=` keeps the latest epoch on ties.
        if avg_val_loss <= best_val_loss:
            best_epoch = epoch
            best_val_loss = avg_val_loss
            best_weights = deepcopy(model.state_dict())

    progress_bar.close()

    # Load the best weights found during this specific training run
    model.load_state_dict(best_weights)
    return model, best_epoch, best_val_loss, train_losses, val_losses

In [None]:
def evaluate_model(model, dataloader, device):
    """Run `model` over `dataloader` and score its token predictions.

    Returns (predictions, labels, metrics): the per-row label-name lists
    produced by `postprocess` for every batch, and the dictionary returned by
    `compute_metrics` on them.
    """
    model.eval()
    all_predictions = []
    all_labels = []

    for raw_batch in dataloader:
        batch = {}
        for name, tensor in raw_batch.items():
            batch[name] = tensor.to(device)

        # Forward pass only; no gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)

        predicted_ids = outputs.logits.argmax(dim=-1)
        batch_predictions, batch_labels = postprocess(predicted_ids, batch["labels"])
        all_predictions.extend(batch_predictions)
        all_labels.extend(batch_labels)

    return all_predictions, all_labels, compute_metrics(all_predictions, all_labels)

## Re-Train Best Model from Task 3

### Dataset Loading and Preparation

In [None]:
# Load training and test data
# Both files are read from `data_path`; the file name is appended directly to
# it, so `data_path` must end with a path separator.
train_df = pd.read_json(f"{data_path}train.json")
test_df = pd.read_json(f"{data_path}test.json")

# The row count of each frame is reported as its number of sessions.
print(f"Training dataset: {train_df.shape[0]} sessions")
print(f"Test dataset: {test_df.shape[0]} sessions")

In [None]:
# Split training data into train and validation
# 80/20 split with a fixed random_state so the partition is the same on every run.
# NOTE(review): `train_df` is overwritten with its own subset, so re-running
# this cell without re-running the loading cell splits the already-reduced
# frame again.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
print(f"Split training: {train_df.shape[0]} train, {val_df.shape[0]} validation")

In [None]:
# Create label mappings
# The label vocabulary is taken from the training split only, in order of
# first appearance after flattening the per-session label lists.
# NOTE(review): a label that occurs only in the validation/test data would be
# missing from these mappings - confirm that cannot happen.
unique_labels = list(train_df.label.explode().unique())
print(f"Unique labels: {unique_labels}")

# id -> label by position, then the inverse lookup label -> id.
id2label = dict(enumerate(unique_labels))
label2id = {label: idx for idx, label in id2label.items()}

print(f"Label mappings: {label2id}")

In [None]:
# Create Hugging Face datasets
# One Dataset per split; the pandas index is reset first so each split is
# built from a clean 0..n-1 index.
full_ds = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (
        ("train", train_df),
        ("valid", val_df),
        ("test", test_df),
    )
})

In [None]:
# Convert labels to IDs
def convert_labels_to_ids(sample):
    """Add a 'label_id' list holding the `label2id` id of each entry of sample['label']."""
    label_ids = []
    for name in sample["label"]:
        label_ids.append(label2id[name])
    sample['label_id'] = label_ids
    return sample

# Applied sample by sample to every split of the DatasetDict.
encoded_dataset = full_ds.map(convert_labels_to_ids)
print(f"Encoded dataset: {encoded_dataset}")

### Tokenization

In [None]:
# Create tokenizer and model checkpoint
# Hub identifier shared by the tokenizer created here and the model created
# in the training section.
unixcoder_model_checkpoint = "microsoft/unixcoder-base"
# The tokenizer is later called with is_split_into_words=True and its
# word_ids() output is used for label alignment, which (per the Hugging Face
# docs) requires a fast tokenizer. model_max_length is the limit that
# truncation=True cuts sequences to.
unixcoder_tokenizer = AutoTokenizer.from_pretrained(
    unixcoder_model_checkpoint,
    add_prefix_space=True,
    use_fast=True,
    model_max_length=512
)

In [None]:
# Tokenize datasets
# The column list is taken from the train split and removed from every split
# after mapping, leaving only what the tokenize function returns.
# NOTE(review): assumes the valid/test splits have the same columns as train.
original_columns = encoded_dataset["train"].column_names
tokenized_datasets = encoded_dataset.map(
    lambda x: tokenize_and_align_labels_unixcoder(x, unixcoder_tokenizer),
    batched=True,
    remove_columns=original_columns,
)

print(f"Tokenized dataset: {tokenized_datasets}")

In [None]:
# Create data collator
# Batches are assembled by Hugging Face's token-classification collator, using
# the UniXcoder tokenizer and returning PyTorch tensors. It is passed as
# `collate_fn` to every DataLoader below.
data_collator = DataCollatorForTokenClassification(
    tokenizer=unixcoder_tokenizer,
    return_tensors="pt"
)

In [None]:
# Create DataLoaders
tokenized_datasets.set_format("torch")

BATCH_SIZE = 32
NUM_WORKERS = 2

def _make_loader(split, shuffle=False):
    """Build a DataLoader over one tokenized split with the shared batching settings."""
    return DataLoader(
        tokenized_datasets[split],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )

# Only the training split is shuffled; validation and test keep dataset order.
train_dataloader = _make_loader("train", shuffle=True)
eval_dataloader = _make_loader("valid")
test_dataloader = _make_loader("test")

print("DataLoaders created successfully")

### Train the model

In [None]:
# Section banner. The learning rate in the second line is hard-coded text;
# keep it in sync with BEST_LR below.
print("\n" + "="*80)
print("SECTION 3: TRAIN BEST MODEL FROM TASK 3")
print("UNIXCODER FULL FINE-TUNE WITH LR=1e-05")
print("="*80)

# Hyper-parameters of this training run.
N_TRAIN_EPOCHS = 40
BEST_LR = 1e-5

# `BATCH_SIZE` comes from the DataLoader cell; `device` is expected to have
# been defined in an earlier cell.
print(f"\nTraining parameters:")
print(f"  - Model: UniXcoder (microsoft/unixcoder-base)")
print(f"  - Learning Rate: {BEST_LR}")
print(f"  - Epochs: {N_TRAIN_EPOCHS}")
print(f"  - Batch Size: {BATCH_SIZE}")
print(f"  - Device: {device}")

In [None]:
# Initialize model
# Token-classification model loaded from the UniXcoder checkpoint, with one
# output class per entry of `id2label`; both label mappings are passed to the
# model. The model is then moved to `device`.
best_model = AutoModelForTokenClassification.from_pretrained(
    unixcoder_model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
# Setup optimizer and scheduler
# AdamW over all model parameters (full fine-tuning) at BEST_LR.
optimizer = optim.AdamW(best_model.parameters(), lr=BEST_LR)

# The scheduler is stepped once per training batch, so its horizon is
# epochs * batches-per-epoch. "linear" schedule with no warm-up steps.
num_training_steps = N_TRAIN_EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Train
# training_loop returns the model with its best-validation weights loaded,
# the zero-based index of that epoch, its validation loss, and the per-epoch
# train/validation loss histories.
best_model, best_epoch, best_val_loss, train_losses, val_losses = training_loop(
    best_model, optimizer, lr_scheduler, 
    train_dataloader, eval_dataloader, device, N_TRAIN_EPOCHS
)

# best_epoch is zero-based, hence the +1 for display.
print("\n" + "="*80)
print(f"Training complete! Best epoch: {best_epoch + 1}, Best val loss: {best_val_loss:.4f}")
print("="*80)

In [None]:
# Evaluate on test set
print("\nEvaluating on test set...")
# Returns the per-session predicted and gold label names plus the metric
# dictionary produced by compute_metrics.
test_preds, test_labels, test_metrics = evaluate_model(best_model, test_dataloader, device)

# Precision, recall and F1 are macro-averaged (see compute_metrics).
print(f"\nTest Set Performance:")
print(f"  - Token Accuracy: {test_metrics['token_accuracy']:.4f}")
print(f"  - Macro F1-Score: {test_metrics['token_f1']:.4f}")
print(f"  - Macro Precision: {test_metrics['token_precision']:.4f}")
print(f"  - Macro Recall: {test_metrics['token_recall']:.4f}")

In [None]:
# Save best model for later use in inference
# Model weights/config and the tokenizer files are written to the same
# directory under `results_path`.
model_save_path = os.path.join(results_path, "best_unixcoder_model_task3")
os.makedirs(model_save_path, exist_ok=True)
best_model.save_pretrained(model_save_path)
unixcoder_tokenizer.save_pretrained(model_save_path)

print(f"\nBest model saved to: {model_save_path}")

## Inference Datasets

In [None]:
# Reload test data for inference
# This is the same test.json file that was loaded as `test_df` above; it is
# read again into a separate frame so the inference section works on an
# untouched copy. Only its 'session' column is used by the inference loop.
inference_test_df = pd.read_json(f"{data_path}test.json")
print(f"Test dataset for inference: {inference_test_df.shape[0]} sessions")

In [None]:
# Load cyberlab.csv for temporal information
cyberlab_df = pd.read_csv(f"{data_path}cyberlab.csv")
# Quick structural overview: row count, column names and the first two rows.
print(f"Cyberlab dataset: {cyberlab_df.shape[0]} records")
print(f"Columns: {list(cyberlab_df.columns)}")
print(cyberlab_df.head(2))

In [None]:
def truncate_long_words(session, max_length=20):
    """Shorten every whitespace-separated word longer than `max_length`.

    An over-long word is cut to its first `max_length - 3` characters followed
    by '...', so it ends up exactly `max_length` characters long. Words are
    re-joined with single spaces (runs of whitespace collapse).
    """
    def _shorten(word):
        if len(word) <= max_length:
            return word
        return word[:max_length-3] + '...'

    return ' '.join(_shorten(word) for word in session.split())

In [None]:
def align_predictions_to_words(predictions_ids, word_ids, id2label):
    """
    Reduce token-level predictions to one prediction per word.

    For every distinct word id (tokens with word id None are skipped) the
    prediction at the word's first token is kept. Returns the corresponding
    label names, ordered by first appearance of each word id.
    """
    # dict preserves insertion order, i.e. the order in which words first appear.
    first_token_prediction = {}
    for position, word_id in enumerate(word_ids):
        if word_id is None or word_id in first_token_prediction:
            continue
        first_token_prediction[word_id] = predictions_ids[position]

    return [id2label[pred_id] for pred_id in first_token_prediction.values()]

In [None]:
def predict_tactics(session_text, model, tokenizer, device, id2label):
    """
    Predict MITRE tactics for a session.

    The session is preprocessed with `truncate_long_words`, split into words,
    tokenized as a pre-split word list (with truncation) and passed through
    `model`. Token predictions are reduced to one per word by keeping the
    prediction of each word's first token.

    Args:
        session_text: Raw session string.
        model: Token-classification model returning an object with `.logits`.
        tokenizer: Fast tokenizer whose output exposes `word_ids()`.
        device: Device the model inputs are moved to.
        id2label: Mapping from predicted class id to tactic name.

    Returns:
    - words: list of words
    - predictions: list of predicted tactics (one per word). When the
      tokenizer truncates a long session, words beyond the cut receive no
      prediction, so this list can be shorter than `words`.
    """
    # Preprocess
    session_clean = truncate_long_words(session_text)
    words = session_clean.split()

    # An empty session has nothing to tokenize or classify.
    if not words:
        return words, []

    # Tokenize
    encoding = tokenizer(
        words,
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )

    # word_ids() is a method of the tokenizer's output object. It has to be
    # read BEFORE the tensors are copied into a plain dict below, because a
    # dict has no such method.
    word_ids = encoding.word_ids(0)

    # Move to device
    model_inputs = {k: v.to(device) for k, v in encoding.items()}

    # Inference
    model.eval()
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Get predictions for the single sequence in the batch
    predictions_ids = logits.argmax(dim=-1).cpu().numpy()[0]

    # Align to words
    aligned_preds = align_predictions_to_words(predictions_ids, word_ids, id2label)

    return words, aligned_preds


## Inference

In [None]:
# Run the model over every inference session. A failure on one session is
# recorded in `errors` and reported, and processing continues with the next.
results = []
errors = []

print(f"Processing {len(inference_test_df)} sessions for inference...\n")

# `idx` is the DataFrame index label of the row; it is stored as session_id.
for idx, row in inference_test_df.iterrows():
    try:
        session_text = row['session']
        
        # Predict tactics
        words, predictions = predict_tactics(session_text, best_model, unixcoder_tokenizer, device, id2label)
        
        # Store result
        # 'fingerprint' is the prediction sequence as a tuple, i.e. a hashable
        # form of the same data.
        results.append({
            'session_id': idx,
            'session_text': session_text,
            'words': words,
            'predictions': predictions,
            'fingerprint': tuple(predictions)
        })
        
        # Progress message every 20 sessions (assumes integer index labels).
        if (idx + 1) % 20 == 0:
            print(f"  Processed {idx + 1}/{len(inference_test_df)} sessions...")
    
    except Exception as e:
        # Best-effort: keep going, but remember which session failed and why.
        errors.append({'session_id': idx, 'error': str(e)})
        print(f"  ERROR on session {idx}: {e}")

# One row per successfully processed session.
results_df = pd.DataFrame(results)
print(f"\n✓ Successfully processed: {len(results_df)}/{len(inference_test_df)} sessions")
if errors:
    print(f"✗ Errors: {len(errors)}")

In [None]:
# Examine cyberlab structure to understand how to match sessions
print("Cyberlab columns and dtypes:")
print(cyberlab_df.dtypes)
print("\nFirst few rows:")
print(cyberlab_df.head(3))

# Add session index to results for matching with cyberlab
results_df['dataset_index'] = range(len(results_df))

# Assuming cyberlab.csv rows correspond to test.json rows in order
# NOTE(review): this positional correspondence is an assumption - confirm it
# before relying on the merged timestamps.
if len(cyberlab_df) >= len(results_df):
    # Take only the first len(results_df) cyberlab rows and index them 0..n-1.
    # Working on a trimmed copy (a) avoids the length-mismatch error raised
    # when a shorter index range is assigned to the full frame and (b) leaves
    # `cyberlab_df` itself unmodified.
    cyberlab_aligned = (
        cyberlab_df
        .iloc[:len(results_df)]
        .reset_index(drop=True)
        .assign(dataset_index=range(len(results_df)))
    )

    # Merge on index
    results_df = results_df.merge(
        cyberlab_aligned,
        on='dataset_index',
        how='left'
    )

    # Try to identify date column (first column whose name mentions date/time)
    date_cols = [col for col in results_df.columns if 'date' in col.lower() or 'time' in col.lower()]
    if date_cols:
        date_col = date_cols[0]
        # Unparseable values become NaT instead of raising.
        results_df[date_col] = pd.to_datetime(results_df[date_col], errors='coerce')
        date_range = results_df[date_col].dropna()
        if len(date_range) > 0:
            print(f"Date range: {date_range.min()} to {date_range.max()}")
        else:
            results_df['date'] = pd.to_datetime('2024-01-01')
            print("Date column exists but couldn't parse dates, using dummy dates")
    else:
        results_df['date'] = pd.to_datetime('2024-01-01')
        print("No date column found in cyberlab.csv, using dummy dates for fingerprint analysis")
else:
    results_df['date'] = pd.to_datetime('2024-01-01')
    print(f"Cyberlab has fewer rows ({len(cyberlab_df)}) than results ({len(results_df)}), using dummy dates")

print(f"Results dataframe shape: {results_df.shape}")


In [2]:
# Load the raw cyberlab export for a quick look at its contents.
# Read from `data_path`, like every other data file in this notebook, instead
# of a path relative to the kernel's working directory.
df = pd.read_csv(f"{data_path}cyberlab.csv")

In [3]:
# Summary statistics (count / unique / top / freq) of the cyberlab columns.
df.describe()

Unnamed: 0,session,timestamps_statements,country_name
count,174262,174262,174191
unique,174262,174262,163
top,cat /proc/cpuinfo | grep name | wc -l ; echo -...,2019-12-30 23:56:54.297736+00:00,China
freq,1,1,53158


In [4]:
# Display the frame; the notebook renders a truncated view of its first and last rows.
df

Unnamed: 0,session,timestamps_statements,country_name
0,enable ; system ; shell ; sh ; cat /proc/mount...,2019-09-01 00:00:10.493808+00:00,Israel
1,enable ; system ; shell ; sh ; cat /proc/mount...,2019-09-01 00:38:41.134935+00:00,Israel
2,enable ; system ; shell ; sh ; cat /proc/mount...,2019-09-01 00:39:26.263383+00:00,Israel
3,enable ; system ; shell ; sh ; cat /proc/mount...,2019-09-01 00:40:45.132152+00:00,Israel
4,enable ; system ; shell ; sh ; cat /proc/mount...,2019-09-01 00:54:51.783935+00:00,Israel
...,...,...,...
174257,cat /proc/cpuinfo | grep name | wc -l ; echo -...,2019-12-30 23:37:10.487881+00:00,Thailand
174258,cat /proc/cpuinfo | grep name | wc -l ; echo -...,2019-12-30 23:37:10.923944+00:00,Thailand
174259,cat /proc/cpuinfo | grep name | wc -l ; echo -...,2019-12-30 23:41:46.601903+00:00,Netherlands
174260,cat /proc/cpuinfo | grep name | wc -l ; echo -...,2019-12-30 23:43:34.981985+00:00,Colombia
