In [None]:
import os
import re
import json
import hashlib
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import pickle
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split




In [None]:
class DrainParser:
    """Split raw log lines into header fields and group identical messages.

    Despite the name, no Drain parse tree is built here: ``depth`` and ``st``
    are stored for interface compatibility but no method of this class reads
    them. A "template" is simply a message body after the ``rex`` patterns
    have been replaced by ``<*>``.

    Args:
        log_format: Format string such as ``"<Date> <Time> <Level>: <Content>"``.
            It must contain a ``<Content>`` field for ``parse`` to work.
        indir: Directory that contains the raw log file.
        outdir: Directory that receives the generated CSV files.
        depth: Stored as ``depth - 2``; unused by this class.
        st: Stored; unused by this class.
        rex: Regex patterns whose matches are masked with ``<*>`` when the
            event template is derived.
    """

    def __init__(self, log_format, indir, outdir, depth=4, st=0.5, rex=None):
        self.path = indir
        self.savePath = outdir
        self.depth = depth - 2
        self.st = st
        self.rex = rex or []
        self.log_format = log_format

    def preprocess(self, line):
        """Return ``line`` with every match of every ``rex`` pattern replaced by ``<*>``."""
        for regex in self.rex:
            line = re.sub(regex, '<*>', line)
        return line

    def generate_logformat_regex(self, logformat):
        """Turn a ``<Field>`` format string into ``(headers, compiled_regex)``.

        Each ``<Field>`` becomes a non-greedy named group; in the literal text
        between fields every run of spaces becomes ``\\s+``. The literal text
        is inserted unescaped, so regex metacharacters in the format string
        are interpreted as regex syntax.
        """
        headers = []
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for i, part in enumerate(splitters):
            if i % 2 == 0:
                # Even positions are the literal separators between fields.
                regex += re.sub(r' +', r'\\s+', part)
            else:
                header = part.strip('<>').strip()
                headers.append(header)
                regex += f'(?P<{header}>.*?)'
        return headers, re.compile('^' + regex + '$')

    def load_data(self, log_file):
        """Read ``log_file`` into a DataFrame with one column per header field.

        Lines that do not match the log format are skipped. A 1-based
        ``LineId`` column (numbering the kept rows) is inserted first.
        """
        headers, regex = self.generate_logformat_regex(self.log_format)
        rows = []
        with open(log_file, 'r') as f:
            for line in f:
                match = regex.search(line.strip())
                # Only a non-matching line is skipped; a bare ``except`` here
                # would also hide I/O errors and KeyboardInterrupt.
                if match is None:
                    continue
                rows.append([match.group(h) for h in headers])
        df = pd.DataFrame(rows, columns=headers)
        df.insert(0, 'LineId', range(1, len(df) + 1))
        return df

    def parse(self, log_name):
        """Parse ``indir/log_name`` and write two CSV files into ``outdir``.

        ``<log_name>_structured.csv`` holds one row per parsed line with the
        raw ``Content`` plus ``EventTemplate`` and ``EventId`` columns;
        ``<log_name>_templates.csv`` holds one row per distinct template with
        its occurrence count and id (first 8 hex chars of the template's MD5).
        """
        full_path = os.path.join(self.path, log_name)
        print("🔍 Parsing log file:", full_path)

        df_log = self.load_data(full_path)
        # Keep the raw message in 'Content' and mask only the template, so that
        # identifiers (e.g. block ids) stay available to later steps even when
        # ``rex`` masks them. With an empty ``rex`` both columns are identical.
        df_log['EventTemplate'] = df_log['Content'].apply(self.preprocess)

        templates = df_log['EventTemplate'].value_counts().reset_index()
        templates.columns = ['EventTemplate', 'Occurrences']
        templates['EventId'] = templates['EventTemplate'].apply(lambda x: hashlib.md5(x.encode()).hexdigest()[:8])

        id_map = templates.set_index('EventTemplate')['EventId'].to_dict()
        df_log['EventId'] = df_log['EventTemplate'].map(id_map)

        os.makedirs(self.savePath, exist_ok=True)
        df_log.to_csv(os.path.join(self.savePath, log_name + '_structured.csv'), index=False)
        templates.to_csv(os.path.join(self.savePath, log_name + '_templates.csv'), index=False)



In [None]:
def extract_templates_to_json(template_csv, output_json):
    """Write a JSON map from template ``EventId`` to its 1-based frequency rank.

    Args:
        template_csv: CSV with at least ``EventId`` and ``Occurrences`` columns.
        output_json: Destination path for the ``{EventId: rank}`` JSON object;
            rank 1 is the most frequent template.
    """
    # EventIds are hex digests. Without an explicit dtype, an all-digit id such
    # as "00000123" is read as the integer 123 (and "12e45678" as a float), so
    # it would no longer match the ids stored as text elsewhere.
    templates = pd.read_csv(template_csv, dtype={"EventId": str})

    # A stable sort keeps the file order among templates with equal counts.
    ranked = templates.sort_values(by="Occurrences", ascending=False, kind="stable")

    # Rank by position in the sorted frame; the index label yielded by
    # iterrows() is the original row label and does not follow the sort.
    template_map = {event_id: rank for rank, event_id in enumerate(ranked["EventId"], start=1)}

    with open(output_json, "w") as f:
        json.dump(template_map, f)

    print(f"Template map saved to {output_json}")


In [None]:
def extract_sessions(structured_csv, template_map_json, output_csv):
    """Group event ids into per-block sequences and write them to ``output_csv``.

    Every ``blk_<number>`` identifier found in a row's ``Content`` receives that
    row's mapped event id, in file order. A row that mentions the same block
    several times contributes its event once to that block.

    Args:
        structured_csv: CSV with ``Content`` and ``EventId`` columns.
        template_map_json: JSON object mapping ``EventId`` strings to integers.
        output_csv: Destination CSV with ``BlockId`` and ``EventSequence``
            columns; each sequence is written as the text form of a Python list.
    """
    # Read EventId as text: the JSON keys are strings, and an all-digit hex id
    # would otherwise be parsed as a number and fail to match any key.
    df = pd.read_csv(structured_csv, dtype={"EventId": str})

    with open(template_map_json, 'r') as f:
        template_map = json.load(f)

    # Ids missing from the map become NaN, as with the per-row lookup before.
    event_keys = df["EventId"].map(template_map)

    block_pattern = re.compile(r'blk_-?\d+')
    blk_map = defaultdict(list)
    # zip over the two columns avoids building a Series per row (iterrows).
    for content, event_key in zip(df["Content"], event_keys):
        if not isinstance(content, str):
            # An empty Content cell is read as NaN and cannot be searched.
            continue
        for blk in set(block_pattern.findall(content)):
            blk_map[blk].append(event_key)

    session_df = pd.DataFrame(list(blk_map.items()), columns=["BlockId", "EventSequence"])
    session_df.to_csv(output_csv, index=False)

    print(f" Sessions extracted to {output_csv}")


In [None]:
def split_train_test(sequence_csv, label_csv, output_dir, max_train=None, ratio=0.3):
    """Split labelled block sequences into train / test_normal / test_abnormal files.

    Normal sequences are shuffled with a fixed seed (42); the first
    ``max_train`` go to ``train`` and the rest to ``test_normal``. All anomalous
    sequences go to ``test_abnormal``. Blocks without a label are left out.
    Each output line is one sequence as space-separated event ids.

    Args:
        sequence_csv: CSV with ``BlockId`` and ``EventSequence`` columns, where
            ``EventSequence`` is the text form of a Python list.
        label_csv: CSV with ``BlockId`` and ``Label`` columns; the label
            ``"Anomaly"`` marks an anomalous block, anything else is normal.
        output_dir: Directory that receives the three output files.
        max_train: Number of normal sequences used for training. When ``None``
            it is ``int(len(normal) * ratio)``.
        ratio: Fraction of normal sequences used for training when
            ``max_train`` is ``None``.

    Raises:
        ValueError, SyntaxError: If an ``EventSequence`` cell is not a plain
            Python literal.
    """
    import ast  # local import keeps this cell runnable on its own

    df_seq = pd.read_csv(sequence_csv)
    df_label = pd.read_csv(label_csv)

    label_map = dict(zip(df_label["BlockId"], df_label["Label"].map(lambda x: 1 if x == "Anomaly" else 0)))
    df_seq["Label"] = df_seq["BlockId"].map(label_map)

    normal = df_seq.loc[df_seq["Label"] == 0, "EventSequence"].dropna().sample(frac=1, random_state=42)
    anomaly = df_seq.loc[df_seq["Label"] == 1, "EventSequence"].dropna()

    if max_train is None:
        max_train = int(len(normal) * ratio)

    # .iloc makes the positional intent explicit: the shuffled Series still
    # carries its original integer labels.
    datasets = {
        "train": normal.iloc[:max_train],
        "test_normal": normal.iloc[max_train:],
        "test_abnormal": anomaly
    }

    os.makedirs(output_dir, exist_ok=True)
    for name, data in datasets.items():
        with open(os.path.join(output_dir, name), "w") as f:
            for seq in data:
                # SECURITY: the cell text comes from a file on disk. literal_eval
                # only accepts Python literals, whereas eval() would execute any
                # expression stored in the CSV.
                f.write(" ".join(map(str, ast.literal_eval(seq))) + "\n")

    print("Train/test split completed.")


In [None]:
# Pipeline configuration.
# NOTE(review): input_dir is an absolute path on one machine; the notebook only
# runs elsewhere after editing it.
input_dir = "/Users/poojithareddy/Desktop/ucf/Sem2/NLP/nlp_project/dataset/HDFS/"
output_dir = "output_hdfs/"
log_file = "HDFS.log"
log_format = "<Date> <Time> <Pid> <Level> <Component>: <Content>"

# Step 1: Drain Parsing (NO regex to preserve blk IDs)
# NOTE(review): with rex=[] nothing is masked, so every distinct Content string
# (block id included) becomes its own template and gets its own EventId.
parser = DrainParser(log_format, indir=input_dir, outdir=output_dir, rex=[], depth=5)
parser.parse(log_file)

# Step 2: Generate template ID map (EventId -> integer key) from the templates CSV
extract_templates_to_json(
    os.path.join(output_dir, log_file + "_templates.csv"),
    os.path.join(output_dir, "hdfs_log_templates.json")
)

# Step 3: Extract blk-based sessions (one event-key sequence per block id)
extract_sessions(
    os.path.join(output_dir, log_file + "_structured.csv"),
    os.path.join(output_dir, "hdfs_log_templates.json"),
    os.path.join(output_dir, "hdfs_sequence.csv")
)

# Step 4: Train/test split; labels are read from anomaly_label.csv in input_dir
split_train_test(
    sequence_csv=os.path.join(output_dir, "hdfs_sequence.csv"),
    label_csv=os.path.join(input_dir, "anomaly_label.csv"),
    output_dir=output_dir,
    max_train=None  # None -> split_train_test uses int(len(normal) * ratio)
)


In [None]:
# Sanity check: reload the structured CSV written by the parsing step and
# report how many log lines were parsed.
structured_path = "output_hdfs/HDFS.log_structured.csv"
df_structured = pd.read_csv(structured_path)

print(f"Structured log entries: {len(df_structured)}")

In [None]:
class LogVocab:
    """Token <-> index vocabulary built from log-key sequences.

    Indices 0-4 are always the special tokens ``<pad>``, ``<unk>``, ``<eos>``,
    ``<sos>`` and ``<mask>``, in that order; the remaining tokens follow by
    descending frequency.

    Args:
        sequences: Iterable of sequences. A list is used as a token list; any
            other item is converted with ``str`` and split on whitespace.
        min_freq: Tokens seen fewer than ``min_freq`` times are left out.
        max_size: Optional cap on the number of non-special tokens.
    """

    def __init__(self, sequences, min_freq=1, max_size=None):
        self.special_tokens = ["<pad>", "<unk>", "<eos>", "<sos>", "<mask>"]
        self.counter = Counter()

        for seq in tqdm(sequences, desc="Building vocab"):
            tokens = seq if isinstance(seq, list) else str(seq).strip().split()
            self.counter.update(tokens)

        self.itos = list(self.special_tokens)
        if max_size:
            # The cap counts regular tokens only, so make room for the specials.
            max_size += len(self.special_tokens)

        # Set lookup keeps construction linear; testing membership on the
        # growing ``itos`` list made it quadratic in the vocabulary size.
        known = set(self.itos)
        for token, freq in self.counter.most_common():
            if freq < min_freq:
                # most_common() is sorted by descending count, so every
                # remaining token is below the threshold as well.
                break
            if token not in known:
                known.add(token)
                self.itos.append(token)
            if max_size and len(self.itos) >= max_size:
                break

        self.stoi = {tok: idx for idx, tok in enumerate(self.itos)}

        self.pad_idx = self.stoi["<pad>"]
        self.unk_idx = self.stoi["<unk>"]
        self.eos_idx = self.stoi["<eos>"]
        self.sos_idx = self.stoi["<sos>"]
        self.mask_idx = self.stoi["<mask>"]

    def encode(self, sentence, seq_len=None, with_sos=False, with_eos=False):
        """Convert a token list (or whitespace-separated string) to indices.

        Unknown tokens map to ``unk_idx``. When ``seq_len`` is truthy the
        result is truncated or right-padded with ``pad_idx`` to that length;
        truncation happens after the optional ``<sos>``/``<eos>`` are added.
        """
        tokens = sentence if isinstance(sentence, list) else str(sentence).strip().split()
        indices = [self.stoi.get(tok, self.unk_idx) for tok in tokens]

        if with_sos:
            indices.insert(0, self.sos_idx)
        if with_eos:
            indices.append(self.eos_idx)

        if seq_len:
            indices = indices[:seq_len] + [self.pad_idx] * max(0, seq_len - len(indices))

        return indices

    def decode(self, indices, skip_pad=True):
        """Join the tokens for ``indices`` with spaces, dropping padding unless ``skip_pad`` is False."""
        tokens = [self.itos[i] for i in indices if not skip_pad or i != self.pad_idx]
        return " ".join(tokens)

    def save(self, filepath):
        """Pickle this vocabulary object to ``filepath``."""
        with open(filepath, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(filepath):
        """Unpickle and return the object stored at ``filepath``.

        SECURITY: unpickling can execute arbitrary code; only load files that
        this notebook (or another trusted source) wrote.
        """
        with open(filepath, "rb") as f:
            return pickle.load(f)

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.itos)


In [None]:
# Build the vocabulary from the training sessions only (one whitespace-separated
# sequence per line; blank lines are skipped).
train_file_path = "output_hdfs/train"

with open(train_file_path, "r") as f:
    sequences = [line.strip().split() for line in f if line.strip()]

print(f"Loaded {len(sequences)} sequences.")

# min_freq=2: tokens seen only once in the training file are left out of the vocab.
vocab = LogVocab(sequences, min_freq=2)
print(f" Vocab built. Total tokens (including special): {len(vocab)}")


In [None]:
# Persist the vocabulary (pickled LogVocab object) so later cells can reload it.
vocab.save("output_hdfs/vocab.pkl")
print("Vocab saved to output_hdfs/vocab.pkl")

In [None]:
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed all RNGs with the default (42) before any data shuffling or model init.
set_seed()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Reload the pickled vocabulary from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open("output_hdfs/vocab.pkl", "rb") as f:
    vocab = pickle.load(f)
print(f"Vocabulary size: {len(vocab)}")

In [None]:
# Attach the *_index attributes that LogDataset and the model cells read.
# NOTE(review): if this vocab was built by the LogVocab class in this notebook,
# its special tokens are lowercase ("<pad>", "<unk>", "<eos>", "<sos>", "<mask>"
# at indices 0-4), so none of the uppercase keys below exist and every .get()
# returns its fallback: pad=0, mask=1 (the "<unk>" slot), unk=2 (the "<eos>"
# slot), sos=3. Index 4 ("<mask>") is never used. The values are distinct, so
# this works, but the same four lines must be kept identical wherever the
# vocabulary is reloaded, or a saved model will see different indices.
vocab.pad_index = vocab.stoi.get('<PAD>', 0)
vocab.mask_index = vocab.stoi.get('<MASK>', 1)
vocab.unk_index = vocab.stoi.get('<UNK>', 2)
vocab.sos_index = vocab.stoi.get('<CLS>', 3)


In [None]:
class LogDataset(Dataset):
    """Masked-log-key dataset: each item is one randomly masked, padded sequence.

    ``vocab`` must expose ``stoi`` plus ``pad_index``, ``mask_index``,
    ``unk_index`` and ``sos_index``. Every sample is ``seq_len`` long.
    """

    def __init__(self, log_seqs, time_seqs, vocab, seq_len=128, mask_ratio=0.15):
        self.vocab = vocab
        self.seq_len = seq_len
        self.mask_ratio = mask_ratio
        self.log_seqs = log_seqs
        self.time_seqs = time_seqs

    def __len__(self):
        return len(self.log_seqs)

    def __getitem__(self, idx):
        """Return the tensors for sequence ``idx``; masking is re-drawn on every call."""
        masked = self.mask_sequence(self.log_seqs[idx], self.time_seqs[idx])
        input_ids, labels, time_input, time_labels = masked

        sample = {}
        sample["input_ids"] = torch.tensor(input_ids, dtype=torch.long)
        sample["labels"] = torch.tensor(labels, dtype=torch.long)
        # Times get a trailing feature dimension: (seq_len, 1).
        sample["time_input"] = torch.tensor(time_input, dtype=torch.float).unsqueeze(1)
        sample["time_labels"] = torch.tensor(time_labels, dtype=torch.float)
        return sample

    def mask_sequence(self, log_seq, time_seq):
        """Mask tokens with probability ``mask_ratio`` and pad/truncate to ``seq_len``.

        A masked position carries the mask index as input and the true token id
        (and time) as label; an unmasked position carries the token id as input
        and 0 as label. A start token is prepended before padding.
        """
        stoi = self.vocab.stoi
        token_ids, label_ids, times_in, times_out = [], [], [], []

        for token, time in zip(log_seq, time_seq):
            masked = random.random() < self.mask_ratio
            try:
                token_id = stoi[token]
            except KeyError:
                token_id = self.vocab.unk_index

            if masked:
                token_ids.append(self.vocab.mask_index)
                label_ids.append(token_id)
                times_in.append(0)
                times_out.append(time)
            else:
                token_ids.append(token_id)
                label_ids.append(0)
                times_in.append(time)
                times_out.append(0)

        # Slot 0 is the start token; its label is padding so the loss ignores it.
        token_ids.insert(0, self.vocab.sos_index)
        label_ids.insert(0, self.vocab.pad_index)
        times_in.insert(0, 0)
        times_out.insert(0, 0)

        target_len = self.seq_len
        missing = target_len - len(token_ids)  # negative when the sequence is too long

        def fit(values, filler):
            # list * negative is empty, so over-long sequences are only truncated.
            return (values + [filler] * missing)[:target_len]

        return (
            fit(token_ids, self.vocab.pad_index),
            fit(label_ids, self.vocab.pad_index),
            fit(times_in, 0),
            fit(times_out, 0),
        )



In [None]:
def fixed_window_parser(log_lines, window_size=20, min_len=10):
    """Cut each line's items into consecutive windows of at most ``window_size``.

    A line is split on whitespace; lines with fewer than ``min_len`` items are
    skipped. An item of the form ``key,time`` yields ``key`` and ``float(time)``;
    any other item yields the text before its first comma and a time of 0.0.

    Returns:
        ``(logkey_seqs, time_seqs)``: two parallel lists of windows.
    """
    logkey_seqs, time_seqs = [], []
    for raw_line in log_lines:
        items = raw_line.strip().split()
        if len(items) < min_len:
            continue

        fields = [item.split(",") for item in items]
        keys = [parts[0] for parts in fields]
        times = [float(parts[1]) if len(parts) == 2 else 0.0 for parts in fields]

        starts = range(0, len(keys), window_size)
        logkey_seqs.extend(keys[start:start + window_size] for start in starts)
        time_seqs.extend(times[start:start + window_size] for start in starts)
    return logkey_seqs, time_seqs

# Load the training sessions: one whitespace-separated event-key sequence per
# line, which is the format fixed_window_parser expects and the file the
# vocabulary was built from. This cell previously read the structured CSV
# (comma-separated header fields per log line), whose lines are not event-key
# sequences, so the model was trained on tokens unrelated to the test files.
with open("output_hdfs/train", "r") as f:
    raw_lines = f.readlines()

# Generate sequences (windows of at most 20 keys; sessions shorter than 10 are skipped)
logkey_seqs, time_seqs = fixed_window_parser(raw_lines, window_size=20, min_len=10)

# Split into train and validation sets (90% / 10%, fixed seed)
logkey_train, logkey_valid, time_train, time_valid = train_test_split(
    logkey_seqs, time_seqs, test_size=0.1, random_state=42
)

print(f"Train sequences: {len(logkey_train)}, Valid sequences: {len(logkey_valid)}")


In [None]:
# Create dataset instances (each sample is masked on the fly and padded to 128 slots)
train_dataset = LogDataset(logkey_train, time_train, vocab, seq_len=128)
valid_dataset = LogDataset(logkey_valid, time_valid, vocab, seq_len=128)

# Create PyTorch DataLoaders; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64)

# Check sample batch shape
sample_batch = next(iter(train_loader))
print("Sample input_ids shape:", sample_batch["input_ids"].shape)
print("Sample time_input shape:", sample_batch["time_input"].shape)




In [None]:
class SimpleBERT(nn.Module):
    """Token embedding followed by a linear + log-softmax head over the vocabulary.

    The model contains no attention or positional layers: each position's
    prediction depends only on that position's own input token.

    Args:
        vocab_size: Number of rows in the embedding and of output classes.
        hidden_dim: Embedding width.
        max_len: Accepted for interface compatibility; not used.
        pad_idx: Index of the padding token (keyword-only). Its embedding is
            fixed at zero. The default 0 matches the padding index used by the
            rest of this notebook, so the constructor no longer depends on a
            global ``vocab`` being defined.
    """

    def __init__(self, vocab_size, hidden_dim=128, max_len=128, *, pad_idx=0):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_dim, padding_idx=pad_idx)
        self.cls = nn.Sequential(
            nn.Linear(hidden_dim, vocab_size),
            nn.LogSoftmax(dim=-1)  # log-probabilities, as nn.NLLLoss expects
        )

    def forward(self, x):
        """Return ``(log_probs, first_token_embedding)`` for a batch of token ids.

        ``log_probs`` has shape ``[batch_size, seq_len, vocab_size]``;
        ``first_token_embedding`` is the embedding at sequence position 0.
        """
        x = self.embed(x)
        cls_output = x[:, 0]  # embedding of the first (start) token
        logits = self.cls(x)  # shape: [batch_size, seq_len, vocab_size]
        return logits, cls_output




In [None]:
def train_epoch(model, loader, optimizer, criterion, device=None):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module whose forward returns ``(log_probs, extra)``.
        loader: Iterable of batches with ``"input_ids"`` and ``"labels"`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` predictions and ``(N,)`` targets.
        device: Where to move each batch. Defaults to the device of the model's
            first parameter (CPU for a parameter-free model), so the function
            does not rely on a notebook-level ``device`` variable.

    Returns:
        Mean of the per-batch losses, or ``nan`` when ``loader`` yields no batch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits, _ = model(input_ids)
        # Flatten (batch, seq) so every position is one classification example.
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # An empty loader has no loss to average; report nan instead of dividing by zero.
    return total_loss / num_batches if num_batches else float("nan")
def eval_epoch(model, loader, criterion, device=None):
    """Return the mean batch loss over ``loader`` without updating the model.

    Args:
        model: Module whose forward returns ``(log_probs, extra)``.
        loader: Iterable of batches with ``"input_ids"`` and ``"labels"`` tensors.
        criterion: Loss taking ``(N, vocab)`` predictions and ``(N,)`` targets.
        device: Where to move each batch. Defaults to the device of the model's
            first parameter (CPU for a parameter-free model), so the function
            does not rely on a notebook-level ``device`` variable.

    Returns:
        Mean of the per-batch losses, or ``nan`` when ``loader`` yields no batch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            logits, _ = model(input_ids)
            # Flatten (batch, seq) so every position is one classification example.
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            total_loss += loss.item()
            num_batches += 1

    # An empty loader has no loss to average; report nan instead of dividing by zero.
    return total_loss / num_batches if num_batches else float("nan")



In [None]:
# Build the model, optimizer and loss. Positions labelled with the padding
# index are ignored by the loss, so only masked positions contribute.
model = SimpleBERT(vocab_size=len(vocab)).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.NLLLoss(ignore_index=vocab.pad_index)
# Per-epoch mean losses, kept for the plot in the next cell.
train_losses, val_losses = [], []
for epoch in range(10):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss = eval_epoch(model, valid_loader, criterion)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
# Save only the weights; the evaluation cells rebuild the model and load them.
torch.save(model.state_dict(), "logbert_model.pth")


In [None]:
# Plot the per-epoch training and validation losses recorded by the training loop.
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.title("Training vs Validation Loss")
plt.show()


In [None]:
# Evaluation setup: reload the vocabulary saved during training.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open("output_hdfs/vocab.pkl", "rb") as f:
    vocab = pickle.load(f)

# Re-attach the *_index attributes; they are not part of the pickled object.
# NOTE(review): LogVocab's special tokens are lowercase, so these uppercase keys
# are presumably never found and each .get() returns its fallback (0, 1, 2, 3).
# These values must equal the ones used when the model was trained.
vocab.pad_index = vocab.stoi.get('<PAD>', 0)
vocab.mask_index = vocab.stoi.get('<MASK>', 1)
vocab.unk_index = vocab.stoi.get('<UNK>', 2)
vocab.sos_index = vocab.stoi.get('<CLS>', 3)




In [None]:
def fixed_window(log_lines, window_size=20, min_len=10):
    """Split every sufficiently long line into windows of at most ``window_size`` items.

    Lines are tokenised on whitespace and dropped when they hold fewer than
    ``min_len`` items. Items shaped ``key,time`` contribute ``key`` and
    ``float(time)``; all other items contribute the text before the first comma
    and a time of 0.0.

    Returns:
        ``(logkey_seqs, time_seqs)``: parallel lists, one entry per window.
    """
    logkey_seqs, time_seqs = [], []
    for raw in log_lines:
        items = raw.strip().split()
        if len(items) >= min_len:
            keys, times = [], []
            for key, *rest in (item.split(",") for item in items):
                keys.append(key)
                # Exactly one comma means a "key,time" pair.
                times.append(float(rest[0]) if len(rest) == 1 else 0.0)
            for lo in range(0, len(keys), window_size):
                hi = lo + window_size
                logkey_seqs.append(keys[lo:hi])
                time_seqs.append(times[lo:hi])
    return logkey_seqs, time_seqs



In [None]:
# Load the held-out normal and abnormal sessions and cut them into windows of
# at most 20 keys (sessions shorter than the default min_len of 10 are skipped).
with open("output_hdfs/test_normal", "r") as f:
    logkey_normal, time_normal = fixed_window(f.readlines(), window_size=20)

with open("output_hdfs/test_abnormal", "r") as f:
    logkey_abnormal, time_abnormal = fixed_window(f.readlines(), window_size=20)

print("Loaded:", len(logkey_normal), "normal sequences,", len(logkey_abnormal), "abnormal sequences")


In [None]:
class LogDataset(Dataset):
    """Evaluation dataset of randomly masked, fixed-length log-key sequences.

    ``vocab`` must expose ``stoi`` plus ``pad_index``, ``mask_index``,
    ``unk_index`` and ``sos_index``. Masking uses NumPy's global RNG.
    """

    def __init__(self, log_seqs, time_seqs, vocab, seq_len=128, mask_ratio=0.15):
        self.vocab = vocab
        self.seq_len = seq_len
        self.mask_ratio = mask_ratio
        self.log_seqs = log_seqs
        self.time_seqs = time_seqs

    def __len__(self):
        return len(self.log_seqs)

    def __getitem__(self, idx):
        """Return input ids, labels and time inputs for sequence ``idx``.

        The time labels produced by ``mask_sequence`` are not returned.
        """
        masked = self.mask_sequence(self.log_seqs[idx], self.time_seqs[idx])
        tokens, labels, time_input = masked[0], masked[1], masked[2]
        sample = {}
        sample["input_ids"] = torch.tensor(tokens, dtype=torch.long)
        sample["labels"] = torch.tensor(labels, dtype=torch.long)
        # Times get a trailing feature dimension: (seq_len, 1).
        sample["time_input"] = torch.tensor(time_input, dtype=torch.float).unsqueeze(1)
        return sample

    def mask_sequence(self, k, t):
        """Mask tokens with probability ``mask_ratio`` and pad/truncate to ``seq_len``.

        Masked positions hold the mask index as input and the true token id
        (and time) as label; unmasked positions hold the token id as input and
        0 as label. A start token is prepended before padding.
        """
        pad = self.vocab.pad_index
        # Slot 0 is the start token, labelled as padding.
        tokens, labels = [self.vocab.sos_index], [pad]
        time_input, time_labels = [0], [0]

        for token, time in zip(k, t):
            hide = np.random.rand() < self.mask_ratio
            token_id = self.vocab.stoi.get(token, self.vocab.unk_index)
            shown_token, target = (self.vocab.mask_index, token_id) if hide else (token_id, 0)
            shown_time, time_target = (0, time) if hide else (time, 0)
            tokens.append(shown_token)
            labels.append(target)
            time_input.append(shown_time)
            time_labels.append(time_target)

        limit = self.seq_len
        # A negative count yields no filler, so long sequences are only truncated.
        filler_count = limit - len(tokens)
        columns = ((tokens, pad), (labels, pad), (time_input, 0), (time_labels, 0))
        return tuple((values + [fill] * filler_count)[:limit] for values, fill in columns)



In [None]:
class SimpleBERT(nn.Module):
    """Token embedding followed by a linear + log-softmax head over the vocabulary.

    The model contains no attention or positional layers: each position's
    prediction depends only on that position's own input token. The layer
    names (``embed``, ``cls``) determine the keys of the saved state dict.

    Args:
        vocab_size: Number of rows in the embedding and of output classes.
        hidden_dim: Embedding width.
        pad_idx: Index of the padding token (keyword-only). Its embedding is
            fixed at zero. The default 0 matches the padding index used by the
            rest of this notebook, so the constructor no longer depends on a
            global ``vocab`` being defined.
    """

    def __init__(self, vocab_size, hidden_dim=128, *, pad_idx=0):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_dim, padding_idx=pad_idx)
        self.cls = nn.Sequential(
            nn.Linear(hidden_dim, vocab_size),
            nn.LogSoftmax(dim=-1)  # log-probabilities
        )

    def forward(self, x):
        """Return ``(log_probs, first_token_embedding)`` for a batch of token ids.

        ``log_probs`` has shape ``[batch_size, seq_len, vocab_size]``;
        ``first_token_embedding`` is the embedding at sequence position 0.
        """
        x = self.embed(x)
        cls_output = x[:, 0]  # embedding of the first (start) token
        logits = self.cls(x)
        return logits, cls_output




In [None]:
def load_test_data(path, window_size=20):
    """Read the sequence file at ``path`` and return its fixed-size windows.

    Returns whatever ``fixed_window`` returns for the file's lines: a pair of
    parallel lists ``(logkey_seqs, time_seqs)``.
    """
    with open(path, "r") as handle:
        raw_lines = list(handle)
    windows = fixed_window(raw_lines, window_size=window_size)
    return windows



In [None]:
def detect_anomalies(model, loader, num_candidates=30, threshold_ratio=0.5, device=None):
    """Count sequences whose masked tokens are mostly missed by the model.

    A masked position (label > 0) counts as undetected when its true token is
    not among the model's ``num_candidates`` highest-scoring tokens. A sequence
    is flagged when the number of undetected positions exceeds
    ``threshold_ratio`` times its number of masked positions. Sequences without
    any masked position are skipped and not counted.

    Args:
        model: Module whose forward returns ``(scores, extra)`` with scores of
            shape ``(batch, seq_len, vocab)``.
        loader: Iterable of batches with ``"input_ids"`` and ``"labels"``.
        num_candidates: Size of the accepted top-k set; clamped to the vocab
            size, because ``torch.topk`` raises when k exceeds it.
        threshold_ratio: Fraction of masked positions that may be missed.
        device: Where to move each batch. Defaults to the device of the model's
            first parameter (CPU for a parameter-free model), so the function
            does not rely on a notebook-level ``device`` variable.

    Returns:
        ``(total_anomalies, total_sequences)`` over the scored sequences.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_sequences = 0
    total_anomalies = 0

    with torch.no_grad():
        for batch in tqdm(loader):
            inputs = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            logits, _ = model(inputs)
            k = min(num_candidates, logits.size(-1))
            top_preds = torch.topk(logits, k, dim=-1).indices  # (batch, seq, k)

            masked = labels > 0  # (batch, seq)
            # True where the label appears among that position's k candidates.
            hit = (top_preds == labels.unsqueeze(-1)).any(dim=-1)
            masked_counts = masked.sum(dim=1).tolist()
            missed_counts = (masked & ~hit).sum(dim=1).tolist()

            # The threshold test is done in Python so the float arithmetic
            # matches the per-sequence comparison exactly.
            for num_masked, num_undetected in zip(masked_counts, missed_counts):
                if num_masked == 0:
                    continue
                if num_undetected > threshold_ratio * num_masked:
                    total_anomalies += 1
                total_sequences += 1

    return total_anomalies, total_sequences


In [None]:
# Rebuild the model with the same vocabulary size and load the trained weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleBERT(vocab_size=len(vocab)).to(device)
model.load_state_dict(torch.load("logbert_model.pth", map_location=device))
model.eval()

# Wrap the test windows; LogDataset applies random masking each time an item is read.
normal_dataset = LogDataset(logkey_normal, time_normal, vocab)
abnormal_dataset = LogDataset(logkey_abnormal, time_abnormal, vocab)

normal_loader = DataLoader(normal_dataset, batch_size=64)
abnormal_loader = DataLoader(abnormal_dataset, batch_size=64)

# Each call returns (flagged sequences, scored sequences).
print("Evaluating on normal logs...")
normal_anomalies, normal_total = detect_anomalies(model, normal_loader)

print("Evaluating on abnormal logs...")
abnormal_anomalies, abnormal_total = detect_anomalies(model, abnormal_loader)


In [None]:
# NOTE(review): `evaluate_file` is not defined in any visible cell of this
# notebook, so on a fresh kernel this cell raises NameError unless the function
# is provided elsewhere. From its use below it presumably returns one 0/1
# prediction per sequence (1 = flagged as anomalous) -- confirm.
normal_preds = evaluate_file("output_hdfs/test_normal", vocab, model, threshold_ratio=0.5)
abnormal_preds = evaluate_file("output_hdfs/test_abnormal", vocab, model, threshold_ratio=0.5)

# Confusion counts with "anomalous" as the positive class.
TP = sum(abnormal_preds)
FN = len(abnormal_preds) - TP
FP = sum(normal_preds)
TN = len(normal_preds) - FP

# Precision and recall are percentages; each falls back to 0 when its
# denominator is zero. F1 is their harmonic mean, so it is a percentage too.
P = 100 * TP / (TP + FP) if TP + FP > 0 else 0
R = 100 * TP / (TP + FN) if TP + FN > 0 else 0
F1 = 2 * P * R / (P + R) if P + R > 0 else 0

print(f"TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}")
print(f"Precision: {P:.2f}%, Recall: {R:.2f}%, F1-Score: {F1:.2f}%")


In [None]:
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Combine predictions and ground truth (0 = normal file, 1 = abnormal file)
y_true = [0] * len(normal_preds) + [1] * len(abnormal_preds)
y_pred = normal_preds + abnormal_preds

# Compute metrics (fractions in [0, 1], unlike the percentages printed earlier)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
# NOTE(review): y_pred presumably holds hard 0/1 decisions rather than scores,
# so this AUC reflects a single operating point -- confirm.
auc = roc_auc_score(y_true, y_pred)

# Confusion matrix: rows are true labels, columns are predictions
cm = confusion_matrix(y_true, y_pred)
TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]

# Print results
print(f"Confusion Matrix:\n{cm}")
print(f"TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC:       {auc:.4f}")
