In [6]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

def _cosine_similarities(matrix, vector):
    """Return the cosine similarity between each row of *matrix* and *vector*.

    A row (or a query vector) whose norm is zero has no direction, so a plain
    division would yield NaN for it.  Such entries are reported as 0.0 instead,
    which keeps them from being sorted ahead of real matches.
    """
    dot_products = np.dot(matrix, vector)
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    nonzero = denominators > 0
    # Substitute 1.0 where the denominator is zero so the division itself never
    # emits a warning; those positions are overwritten with 0.0 below.
    safe_denominators = np.where(nonzero, denominators, 1.0)
    return np.where(nonzero, dot_products / safe_denominators, 0.0)


def main(
    csv_file: str = "output.csv",
    query: str = "login authentication failure sudo user",
    top_n: int = 10,
    *,
    model_name: str = "all-MiniLM-L6-v2",
    model=None,
) -> None:
    """Print the messages in a CSV file that are most similar to a query.

    The ``message`` column of *csv_file* is embedded, compared with the
    embedding of *query* by cosine similarity, and the best matches are
    printed in descending order of similarity.

    Args:
        csv_file: Path of the CSV file to read; it must have a ``message``
            column.
        query: Free-text query the messages are ranked against.
        top_n: Maximum number of messages to print (at least 1).
        model_name: Name passed to ``SentenceTransformer`` when *model* is
            not supplied.
        model: Optional already-constructed encoder.  Only its
            ``encode(texts, ...)`` method is used, so a loaded model can be
            reused across calls instead of being re-created each time.

    Raises:
        ValueError: If *top_n* is smaller than 1.
        SystemExit: With code 1 if the CSV cannot be read, has no ``message``
            column, or contains no non-null messages.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # ---------- Step 1: Load the Data ----------
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"❌ Error loading {csv_file}: {e}")
        # Raise SystemExit directly: the ``exit`` builtin is a convenience
        # helper installed for interactive sessions, not for program code.
        raise SystemExit(1)
    print(f"✅ Loaded data from {csv_file}")

    if "message" not in df.columns:
        print("❌ Error: 'message' column not found in the CSV.")
        raise SystemExit(1)

    # Ensure messages are strings and remove missing values.
    messages = df["message"].dropna().astype(str).tolist()
    if not messages:
        # Checked before the model is touched so an empty input fails fast
        # with a clear message rather than deep inside the numeric code.
        print("❌ Error: no non-empty messages found in the CSV.")
        raise SystemExit(1)

    # ---------- Step 2: Initialize the Sentence Transformer Model ----------
    if model is None:
        print("Initializing Sentence Transformer model...")
        model = SentenceTransformer(model_name)

    # ---------- Step 3: Compute Embeddings for Each Message ----------
    print("Computing embeddings for messages (this may take a minute)...")
    message_embeddings = np.asarray(
        model.encode(messages, show_progress_bar=True)
    )

    # ---------- Step 4: Compute the Query Embedding ----------
    print(f"\nComputing embedding for query: '{query}'")
    query_embedding = np.asarray(model.encode([query])[0])

    # ---------- Step 5: Compute Cosine Similarity ----------
    cosine_similarities = _cosine_similarities(
        message_embeddings, query_embedding
    )

    # ---------- Step 6: Retrieve and Display the Most Similar Messages ----------
    top_indices = np.argsort(cosine_similarities)[::-1][:top_n]

    # The header reports how many messages are actually shown, which can be
    # fewer than top_n when the file is small.
    print(f"\n--- Top {len(top_indices)} Messages Related to {query} ---\n")
    for rank, idx in enumerate(top_indices, start=1):
        sim_score = cosine_similarities[idx]
        msg = messages[idx]
        print(f"{rank}. Similarity: {sim_score:.4f} | Message: {msg}\n")

# Run the similarity search only when this file is executed as a script
# (or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


✅ Loaded data from output.csv
Initializing Sentence Transformer model...
Computing embeddings for messages (this may take a minute)...


Batches:   0%|          | 0/851 [00:00<?, ?it/s]


Computing embedding for query: 'login authentication failure sudo user'

--- Top 10 Messages Related to {query} ---

1. Similarity: 0.4465 | Message: 2025-02-24T19:23:30.363751-08:00 elastic-Standard-PC-i440FX-PIIX-1996 gdm-password]: pam_unix(gdm-password:auth): authentication failure; logname= uid=0 euid=0 tty=/dev/tty1 ruser= rhost=  user=elastic

2. Similarity: 0.4426 | Message: 2024-08-27 15:37:31 status half-configured sudo:amd64 1.9.15p5-3ubuntu5

3. Similarity: 0.4304 | Message: 2025-02-04T04:40:26.340731+00:00 elastic-Standard-PC-i440FX-PIIX-1996 gdm-password]: message repeated 2 times: [ pam_unix(gdm-password:auth): authentication failure; logname=elastic uid=0 euid=0 tty=/dev/tty1 ruser= rhost=  user=elastic]

4. Similarity: 0.4232 | Message: Setting up sudo (1.9.15p5-3ubuntu5) ...

5. Similarity: 0.4194 | Message: 2024-08-27 15:37:22 configure adduser:all 3.137ubuntu1 3.137ubuntu1

6. Similarity: 0.4101 | Message: 2024-08-27 15:37:29 status unpacked sudo:amd64 1.9.15p5-3ub