In [None]:
#!pip install transformers datasets scikit-learn umap-learn --quiet

In [1]:
import pandas as pd
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    RobertaTokenizer, RobertaForMaskedLM,
    LongformerTokenizer, LongformerForMaskedLM,
    Trainer, TrainingArguments
)
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve, auc
import matplotlib.pyplot as plt
import os
import shutil
# Choose dataset: Options = 'Thunderbird', 'HDFS', 'BGL'
dataset_name = 'Thunderbird'
def load_dataset(dataset_name):
    if dataset_name == 'Thunderbird':
        !rm -rf loghub
        !git clone https://github.com/logpai/loghub.git
        paragraphs = []
        current_paragraph = []
        lines_per_paragraph = 30
        log_file = "loghub/Thunderbird/Thunderbird_2k.log"

        with open(log_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    current_paragraph.append(line)
                if len(current_paragraph) >= lines_per_paragraph:
                    paragraphs.append(' '.join(current_paragraph))
                    current_paragraph = []
        if current_paragraph:
            paragraphs.append(' '.join(current_paragraph))
        paragraphs = np.array(paragraphs)

        random.seed(42)
        anomaly_indices = np.random.choice(len(paragraphs), size=int(0.2 * len(paragraphs)), replace=False)
        normal_indices = np.array(list(set(range(len(paragraphs))) - set(anomaly_indices)))

        normal_paragraphs = paragraphs[normal_indices]
        anomalous_paragraphs = paragraphs[anomaly_indices]

        return normal_paragraphs, anomalous_paragraphs

    elif dataset_name == 'HDFS':
        !rm -rf loghub
        !git clone https://github.com/logpai/loghub.git
        df = pd.read_csv('loghub/HDFS/HDFS.log_structured.csv')
        block_labels = pd.read_csv('loghub/HDFS/anomaly_label.csv')

        block_to_logs = {}
        for _, row in df.iterrows():
            blk = row['BlockId']
            block_to_logs.setdefault(blk, []).append(row['Content'])

        normal_paragraphs, anomalous_paragraphs = [], []

        for blk, logs in block_to_logs.items():
            paragraph = ' '.join(logs)
            label = block_labels.loc[block_labels['BlockId'] == blk, 'Label'].values[0]
            if label == 'Anomaly':
                anomalous_paragraphs.append(paragraph)
            else:
                normal_paragraphs.append(paragraph)

        return np.array(normal_paragraphs), np.array(anomalous_paragraphs)

    elif dataset_name == 'BGL':
        !rm -rf loghub
        !git clone https://github.com/logpai/loghub.git
        df = pd.read_csv('loghub/BGL/BGL.log_structured.csv')
        paragraphs = []
        current_paragraph = []
        window_size = 30  # 30 seconds

        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df = df.sort_values(by='Timestamp')
        start_time = df['Timestamp'].iloc[0]

        df['window_id'] = ((df['Timestamp'] - start_time).dt.total_seconds() // window_size).astype(int)
        grouped = df.groupby('window_id')

        for window_id, group in grouped:
            content = ' '.join(group['Content'])
            paragraphs.append((content, group['Label'].iloc[0]))

        normal_paragraphs = [p[0] for p in paragraphs if p[1] == '-']
        anomalous_paragraphs = [p[0] for p in paragraphs if p[1] != '-']

        return np.array(normal_paragraphs), np.array(anomalous_paragraphs)

normal_paragraphs, anomalous_paragraphs = load_dataset(dataset_name)
print(f"✅ Loaded {dataset_name}: {len(normal_paragraphs)} normal, {len(anomalous_paragraphs)} anomalies.")


Cloning into 'loghub'...
remote: Enumerating objects: 575, done.[K
remote: Counting objects: 100% (171/171), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 575 (delta 145), reused 134 (delta 133), pack-reused 404 (from 2)[K(478/575), 5.60 MiB | 3.73 MiB/sReceiving objects:  88% (506/575), 7.24 MiB | 3.61 MiB/s
Receiving objects: 100% (575/575), 7.27 MiB | 3.59 MiB/s, done.
Resolving deltas: 100% (266/266), done.
✅ Loaded BGL: 1 normal, 0 anomalies.


In [None]:
class LogFiTDataset(Dataset):
    def __init__(self, paragraphs, tokenizer, max_length=512, mask_prob_sentence=0.5, mask_prob_token=0.8):
        self.paragraphs = paragraphs
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mask_prob_sentence = mask_prob_sentence
        self.mask_prob_token = mask_prob_token

    def __len__(self):
        return len(self.paragraphs)

    def __getitem__(self, idx):
        text = self.paragraphs[idx]
        sentences = text.split('. ')
        masked_sentences = []

        for sent in sentences:
            if random.random() < self.mask_prob_sentence:
                tokenized = self.tokenizer.tokenize(sent)
                masked_tokens = []
                for token in tokenized:
                    if random.random() < self.mask_prob_token:
                        masked_tokens.append(self.tokenizer.mask_token)
                    else:
                        masked_tokens.append(token)
                masked_sent = self.tokenizer.convert_tokens_to_string(masked_tokens)
                masked_sentences.append(masked_sent)
            else:
                masked_sentences.append(sent)

        masked_text = '. '.join(masked_sentences)

        encoding = self.tokenizer(masked_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        labels = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels['input_ids'].squeeze(0)
        }


In [None]:
def top_k_accuracy(logits, labels, k=5):
    top_k_preds = logits.topk(k, dim=-1).indices
    labels = labels.unsqueeze(-1).expand_as(top_k_preds)
    correct = (top_k_preds == labels).any(dim=-1)
    return correct

def detect_anomalies(model, paragraphs, tokenizer, k=5, threshold=0.6, batch_size=4):
    model.eval()
    anomalies = []

    dataset = LogFiTDataset(paragraphs, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size)

    for batch in loader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

        acc = top_k_accuracy(logits, labels, k=k)
        mask = labels != tokenizer.pad_token_id
        acc = acc * mask
        paragraph_accuracy = acc.sum(dim=1) / mask.sum(dim=1)

        anomaly_flags = (paragraph_accuracy < threshold).long().tolist()
        anomalies.extend(anomaly_flags)

    return anomalies


In [None]:
# Decide RoBERTa vs Longformer based on token length
sample_paragraphs = random.sample(list(normal_paragraphs), min(10, len(normal_paragraphs)))
token_counts = []

tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_longformer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

for paragraph in sample_paragraphs:
    token_counts.append(len(tokenizer_roberta.tokenize(paragraph)))

avg_tokens = np.mean(token_counts)
print(f"Average tokens per paragraph: {avg_tokens:.2f}")

if avg_tokens > 512:
    print("✅ Using Longformer for this dataset.")
    tokenizer = tokenizer_longformer
    base_model_name = 'allenai/longformer-base-4096'
    model_class = LongformerForMaskedLM
else:
    print("✅ Using RoBERTa for this dataset.")
    tokenizer = tokenizer_roberta
    base_model_name = 'roberta-base'
    model_class = RobertaForMaskedLM


In [None]:
# Cross-validation settings
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
precision_scores = []
recall_scores = []
f1_scores = []

for fold, (train_idx, test_idx) in enumerate(kfold.split(normal_paragraphs)):
    print(f"\n=== Fold {fold+1} ===")

    train_normals = normal_paragraphs[train_idx][:5000] if len(train_idx) >= 5000 else normal_paragraphs[train_idx]
    val_normals = normal_paragraphs[train_idx][5000:6000] if len(train_idx) >= 6000 else normal_paragraphs[train_idx][-1000:]
    test_normals = normal_paragraphs[test_idx][:5000] if len(test_idx) >= 5000 else normal_paragraphs[test_idx]

    val_anomalies = anomalous_paragraphs[:1000]
    test_anomalies = anomalous_paragraphs[1000:2000]

    # Prepare datasets
    train_dataset = LogFiTDataset(train_normals, tokenizer)

    # Load model
    model = model_class.from_pretrained(base_model_name)
    model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

    # Training settings
    training_args = TrainingArguments(
        output_dir=f"./logfit_model_fold{fold+1}",
        overwrite_output_dir=True,
        num_train_epochs=3 if dataset_name == 'Thunderbird' else 5,
        per_device_train_batch_size=4,  # Lower batch for Colab
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
        logging_dir=f"./logfit_logs_fold{fold+1}",
        learning_rate=5e-5,
        weight_decay=0.01,
        fp16=True,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Validation set
    val_paragraphs = np.concatenate([val_normals, val_anomalies])
    val_labels = [0]*len(val_normals) + [1]*len(val_anomalies)

    # Automatic threshold tuning
    best_f1 = 0
    best_threshold = 0.5

    for threshold in np.arange(0.5, 0.96, 0.05):
        preds_val = detect_anomalies(model, val_paragraphs, tokenizer, threshold=threshold, batch_size=4)
        precision = precision_score(val_labels, preds_val)
        recall = recall_score(val_labels, preds_val)
        f1 = f1_score(val_labels, preds_val)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold

    print(f"✅ Best threshold on validation: {best_threshold:.2f} with F1: {best_f1:.4f}")

    # Final Test
    test_paragraphs = np.concatenate([test_normals, test_anomalies])
    test_labels = [0]*len(test_normals) + [1]*len(test_anomalies)

    preds_test = detect_anomalies(model, test_paragraphs, tokenizer, threshold=best_threshold, batch_size=4)

    precision = precision_score(test_labels, preds_test)
    recall = recall_score(test_labels, preds_test)
    f1 = f1_score(test_labels, preds_test)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"[Test] Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Clean up memory
    del model
    del trainer
    del train_dataset
    torch.cuda.empty_cache()

# Report final 5-Fold Results
print("\n=== Final 5-Fold Results ===")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")


In [None]:
def plot_precision_recall_curve(y_true, y_scores, title='Precision-Recall Curve'):
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recall, precision)

    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, marker='.', label=f'AUC = {pr_auc:.4f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
def plot_confusion_matrix(y_true, y_pred, title='Confusion Matrix'):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 4))
    plt.imshow(cm, cmap='Blues')
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.colorbar()

    classes = ['Normal', 'Anomaly']
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     ha="center", va="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.show()


In [None]:
# After testing on test_paragraphs
# Plot Precision-Recall curve
plot_precision_recall_curve(test_labels, preds_test, title=f'Precision-Recall Fold {fold+1}')

# Plot Confusion Matrix
plot_confusion_matrix(test_labels, preds_test, title=f'Confusion Matrix Fold {fold+1}')
