In [1]:
import numpy as np
import pandas as pd

In [None]:
# Read the raw data into a DataFrame and preview its first rows.
# NOTE(review): "dataset/" looks like a directory rather than a CSV file path —
# confirm the intended file name before running.
df = pd.read_csv("dataset/")
df.head()

In [None]:
# Print the distinct values found in the source and target_label columns
for column_name in ("source", "target_label"):
    print(df[column_name].unique())

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

# Pre-trained sentence transformer used for every encoding step in this notebook
model = SentenceTransformer("all-MiniLM-L6-v2")

# Fail fast when the text column is missing
if "log_message" not in df.columns:
    raise KeyError("Column 'log_message' not found in DataFrame.")

# Cast every entry to str so the encoder only ever receives text
df["log_message"] = df["log_message"].astype(str)

# Turn each log message into a dense embedding (returned as a NumPy array)
log_texts = df["log_message"].tolist()
embeddings = model.encode(log_texts, show_progress_bar=True, convert_to_numpy=True)


In [None]:
# Preview the first two entries of the embeddings array.
embeddings[:2]

In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN settings: cosine distance, neighbourhood radius 0.2, and a single
# sample is enough to form a cluster
dbscan_params = {"eps": 0.2, "min_samples": 1, "metric": "cosine"}
dbscan = DBSCAN(**dbscan_params)

# Fit on the embeddings; the per-row labels are read later from clusters.labels_
clusters = dbscan.fit(embeddings)


In [None]:
# Store each row's DBSCAN label in a new "cluster" column
cluster_labels = clusters.labels_
df["cluster"] = cluster_labels

# Preview the labelled frame
df.head(n=5)


In [None]:
from sklearn.metrics import silhouette_score

# Evaluate cluster quality with the silhouette score (cosine distance).
#
# Two conditions must hold before scoring:
#   * DBSCAN's noise label (-1) is not a real cluster, so noise points are
#     dropped from both the cluster count and the scored samples.
#   * silhouette_score requires 2 <= n_labels <= n_samples - 1; it raises
#     ValueError when every point is its own cluster.
cluster_labels = clusters.labels_
clustered_mask = cluster_labels != -1

# Number of real clusters (noise excluded) and number of points that belong to one
n_clusters = len(set(cluster_labels[clustered_mask]))
n_clustered = int(clustered_mask.sum())

if n_clusters < 2:
    print("Silhouette score not applicable: fewer than 2 clusters found.")
elif n_clusters >= n_clustered:
    # One cluster per point: the score is undefined and scikit-learn would raise
    print("Silhouette score not applicable: every clustered point is its own cluster.")
else:
    score = silhouette_score(
        embeddings[clustered_mask],
        cluster_labels[clustered_mask],
        metric="cosine",
    )
    print(f"Silhouette Score: {score:.4f}")


In [None]:
import umap.umap_ as umap
import matplotlib.pyplot as plt
import seaborn as sns

# Project the embeddings down to two dimensions for plotting
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
embedding_2d = reducer.fit_transform(embeddings)

# Collect the 2D coordinates plus the cluster id (as text, so hue is categorical)
x_coords, y_coords = embedding_2d[:, 0], embedding_2d[:, 1]
plot_df = pd.DataFrame({
    "x": x_coords,
    "y": y_coords,
    "cluster": df["cluster"].astype(str),
})

# Draw the scatter plot on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=plot_df,
    x="x",
    y="y",
    hue="cluster",
    palette="tab10",
    s=50,
    alpha=0.8,
    ax=ax,
)
ax.set_title("UMAP Projection of Sentence Embeddings by Cluster", fontsize=14)
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
# Inspect the rows that were assigned to cluster 1.
df[df["cluster"] == 1]

In [None]:
# Size of every cluster, largest first
cluster_counts = df["cluster"].value_counts()

# Ids of the clusters holding more than 10 log messages
is_large = cluster_counts > 10
large_clusters = cluster_counts[is_large].index

# For each large cluster, print its size and its first 5 log messages
for cluster_id in large_clusters:
    size = cluster_counts[cluster_id]
    sample_messages = df.loc[df["cluster"] == cluster_id, "log_message"].head(5)
    print(f"\n Cluster {cluster_id} (Size: {size}):")
    print(sample_messages.to_string(index=False))


In [None]:
import re

# Ordered (compiled pattern, label) pairs. Built once at definition time so that
# applying the classifier to every row does not rebuild the table on each call.
# Order matters: the first matching pattern wins.
_REGEX_PATTERNS = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System Updated to version .*", "System Notification"),
        (r"File .* uploaded successfully.*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """
    Classify a log message by matching it against a fixed list of regex patterns.

    Patterns are tried in order with a case-insensitive search anywhere in the
    message; the label of the first pattern that matches is returned.

    Args:
        log_message (str): The log text to classify.

    Returns:
        str: "User Action" or "System Notification" when a pattern matches,
        otherwise the sentinel string "Unclassified" (never None).

    Raises:
        TypeError: If log_message is not a string (raised by the regex search).
    """
    for compiled_pattern, label in _REGEX_PATTERNS:
        if compiled_pattern.search(log_message):
            return label
    return "Unclassified"


In [None]:
# Apply regex-based classification to each log message
df["regex_label"] = df["log_message"].apply(classify_with_regex)

# classify_with_regex marks misses with the string "Unclassified" (it never
# returns None/NaN), so a null count would always be 0. Count the sentinel
# instead to see how many logs the regex rules did not cover.
(df["regex_label"] == "Unclassified").sum()

In [None]:
# Keep an independent copy of the rows the regex classifier left as "Unclassified"
unmatched_mask = df["regex_label"].eq("Unclassified")
df_non_regex = df.loc[unmatched_mask].copy()

In [None]:
# Frequency of each target label among the regex-unmatched logs
label_counts = df_non_regex["target_label"].value_counts()

# Labels that occur at most 5 times
rare_labels = label_counts[label_counts.le(5)].index.tolist()

print(rare_labels)

In [None]:
# Drop every row whose source is 'LegacyCRM'
not_legacy_mask = df_non_regex["source"].ne("LegacyCRM")
df_non_legacy = df_non_regex[not_legacy_mask]

# Show which sources are left after filtering
remaining_sources = df_non_legacy["source"].unique()
print(remaining_sources)

In [None]:
# Embed the remaining (non-legacy, regex-unmatched) log messages with the same model
non_legacy_messages = df_non_legacy["log_message"].astype(str).tolist()
filter_embeddings = model.encode(non_legacy_messages, show_progress_bar=True, convert_to_numpy=True)

In [None]:
# Preview the first two entries of the filtered embeddings.
filter_embeddings[:2]

In [None]:
# Features are the filtered embeddings; the target is the matching target_label column
X, y = filter_embeddings, df_non_legacy["target_label"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Hold out 20% of the samples, stratified on the label, with a fixed seed
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Build a class-weighted logistic regression and fit it on the training split
clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
clf = clf.fit(X_train, y_train)

# Predict the held-out samples and print the classification report
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import joblib
import os

# Destination of the serialized classifier
model_path = "../models/log_classifier.joblib"

# Create the parent directory if it is not there yet
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the trained classifier
joblib.dump(clf, model_path)