In [1]:
import pandas as pd

# === Read the BGL log, keeping only non-blank lines with surrounding whitespace removed ===
with open("BGL_2k.log", "r", encoding='utf-8', errors='ignore') as log_file:
    logs = [stripped for stripped in map(str.strip, log_file) if stripped]

# === Reduce every line to its message text ===
# Each line is split on whitespace, the first 8 tokens are discarded, and the
# remainder is re-joined with single spaces. In the preview rows the kept text
# starts at the severity word (e.g. "INFO").
log_messages = [' '.join(tokens[8:]) for tokens in map(str.split, logs)]

# === One-column DataFrame of messages; preview the first rows ===
df_logs = pd.DataFrame(log_messages, columns=["log_message"])
df_logs.head()

Unnamed: 0,log_message
0,INFO instruction cache parity error corrected
1,INFO instruction cache parity error corrected
2,INFO instruction cache parity error corrected
3,INFO instruction cache parity error corrected
4,INFO 63543 double-hummer alignment exceptions


In [3]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np

# Tokenizer and encoder come from the same pretrained checkpoint.
MODEL_NAME = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
# eval() returns the module itself, so it can be chained; it puts the model in
# inference mode (dropout disabled) before any embeddings are computed.
model = RobertaModel.from_pretrained(MODEL_NAME).eval()

# Embed function
def get_embedding(text):
    """Return a mean-pooled embedding for one text or a batch of texts.

    The input is tokenized (truncated to at most 64 tokens), run through the
    module-level ``model`` without gradient tracking, and the final hidden
    states are averaged over the token axis. The average is weighted by the
    attention mask so padding tokens, which appear when a list of texts of
    different lengths is passed, do not contribute to the result.

    Parameters
    ----------
    text : str or list of str
        Text(s) to embed; forwarded unchanged to the module-level ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        Shape ``(hidden_size,)`` when a single text is given, or
        ``(n_texts, hidden_size)`` when several texts are given.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Broadcast the (batch, tokens) mask over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for a row with no real tokens.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = summed / token_counts
    # Drop only the batch axis (and only when it has length 1).
    return pooled.squeeze(0).numpy()

# Apply to all log messages, one at a time, keeping row order aligned with df_logs.
# The vector width is read from the loaded model rather than hard-coded, so the
# zero-vector fallback always matches the real embeddings.
hidden_size = model.config.hidden_size
embeddings = []
failed_indices = []  # row positions whose embedding is the zero-vector fallback
for i, msg in enumerate(df_logs["log_message"]):
    try:
        emb = get_embedding(msg)
    except Exception as e:
        # Best-effort: keep going so the matrix stays row-aligned with df_logs,
        # but record the failure so the zero row can be identified afterwards.
        print(f"Error at index {i}: {e}")
        # float32 so one failure does not upcast the whole matrix to float64
        # (assumes the model runs in its default float32 precision).
        emb = np.zeros(hidden_size, dtype=np.float32)
        failed_indices.append(i)
    embeddings.append(emb)

    if (i + 1) % 100 == 0:
        print(f"✅ Processed {i + 1} log messages")

# reshape keeps the result 2-D (n_messages, hidden_size) even when there are no
# messages, and raises if any vector has an unexpected length.
embeddings = np.asarray(embeddings, dtype=np.float32).reshape(-1, hidden_size)
if failed_indices:
    print(f"Failed to embed {len(failed_indices)} messages (zero vectors used) at indices: {failed_indices}")
print("✅ Done! Embeddings shape:", embeddings.shape)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embeddings shape: (2000, 768)
