In [1]:
import os
import re

import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC





In [2]:
# Load the log dataset. Later cells read its log_message, source and
# target_label columns.
df = pd.read_csv('dataset/synthetic_logs.csv')

In [5]:
# Ordered (compiled pattern, label) rules, built once when this cell runs instead
# of on every call. Sentence-ending periods are escaped so they only match a
# literal "." (an unescaped "." would accept any character, e.g. "logged into").
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Return the label of the first regex rule found in ``log_message``.

    Rules are tried in declaration order, case-insensitively, and may match
    anywhere in the message (``search``, not ``fullmatch``).

    Args:
        log_message: Raw log text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as "no match".

    Returns:
        ``"User Action"`` or ``"System Notification"`` when a rule matches,
        otherwise ``None``.
    """
    # Missing cells arrive as float NaN / None; the regex engine only accepts text.
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

In [6]:
# Label every message with the regex rules; messages that no rule matches get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

In [7]:
# Rows that a regex rule labelled (treated as high-confidence matches).
df_regex = df[df['regex_label'].notna()]
# Everything the regex rules left unlabelled; copied so it is independent of df.
df_non_regex = df[df['regex_label'].isna()].copy()

In [8]:
# Drop the rows whose source is 'LegacyCRM' from the frame used for embedding.
# The explicit .copy() makes the result an independent frame, so the column
# assignments made on df_non_regex in later cells do not raise
# SettingWithCopyWarning. Re-running this cell is harmless: the filter is idempotent.
df_non_regex = df_non_regex[df_non_regex.source != 'LegacyCRM'].copy()

In [9]:
def normalize_log(log):
    """Replace variable tokens in a log line with placeholders, then lowercase it.

    The substitutions run in the order listed, each on the output of the one
    before it: ``User<digits>`` becomes ``USER_ID``, ``YYYY-MM-DD`` becomes
    ``DATE``, a dotted group of four numbers becomes ``IP_ADDR``, and any digit
    run that still stands alone becomes ``NUM``. Because the result is
    lowercased last, the placeholders come out as ``user_id``, ``date``,
    ``ip_addr`` and ``num``.
    """
    substitutions = (
        (r"\bUser\d+\b", "USER_ID"),
        (r"\b\d{4}-\d{2}-\d{2}\b", "DATE"),
        (r"\b\d+\.\d+\.\d+\.\d+\b", "IP_ADDR"),
        # Must stay last: it would otherwise consume the digits of dates and IPs.
        (r"\b\d+\b", "NUM"),
    )
    normalized = log
    for pattern, placeholder in substitutions:
        normalized = re.sub(pattern, placeholder, normalized)
    return normalized.lower()

In [10]:
# Add a placeholder-normalized, lowercased version of each message; this column
# is what gets embedded in the next cell.
df_non_regex['normalized_message'] = df_non_regex['log_message'].apply(normalize_log)

In [11]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer and embed every normalized message.
# The messages are passed in df_non_regex row order, so row i of `embeddings`
# belongs to row i (by position) of df_non_regex as it exists at this point.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df_non_regex['normalized_message'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

In [12]:
# Group the embeddings with DBSCAN using cosine distance and a 0.2 neighbourhood radius.
# NOTE(review): with min_samples=1 every point should count as a core point, so no
# row is expected to receive the noise label -1 — confirm against the DBSCAN docs.
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)
# Store each row's cluster id next to its message.
df_non_regex['cluster'] = clusters

In [13]:
# Number of messages in each cluster.
cluster_counts = df_non_regex['cluster'].value_counts()
# Ids of the clusters holding more than 10 messages; only these get their own
# classifier in the next cell.
large_clusters = cluster_counts[cluster_counts > 10].index

In [14]:
# Train one logistic-regression classifier per large cluster that contains more
# than one target label, and print a hold-out report for each.
per_cluster_models = {}

# `embeddings` was encoded from df_non_regex['normalized_message'] in row order.
# Fail loudly if the two have drifted apart (e.g. cells re-run out of order),
# because the positional masks below rely on that alignment.
assert len(embeddings) == len(df_non_regex), "embeddings are out of sync with df_non_regex"

for cluster_id in large_clusters:
    in_cluster = (df_non_regex['cluster'] == cluster_id).to_numpy()
    cluster_df = df_non_regex[in_cluster]
    # Reuse the vectors computed earlier instead of running the sentence encoder
    # again for every cluster; the mask picks this cluster's rows.
    X = embeddings[in_cluster]
    y = cluster_df['target_label']

    if len(set(y)) > 1:  # Avoid single-class clusters
        # 70/30 split with a fixed seed so the printed report is repeatable.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f"\nCluster {cluster_id} classification report:")
        print(classification_report(y_test, y_pred))
        # The stored model is the one fitted on the training split only.
        per_cluster_models[cluster_id] = clf


Cluster 1 classification report:
                precision    recall  f1-score   support

Critical Error       1.00      1.00      1.00         6
         Error       1.00      1.00      1.00         6

      accuracy                           1.00        12
     macro avg       1.00      1.00      1.00        12
  weighted avg       1.00      1.00      1.00        12



In [15]:
# Train a single classifier on the embeddings of all non-regex messages,
# regardless of cluster.
X_all = embeddings
y_all = df_non_regex['target_label']
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split
# repeatable. This rebinds X_train/X_test/y_train/y_test from the per-cluster cell.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

# The classifier is fitted on the training split only, then scored on the hold-out rows.
general_clf = LogisticRegression(max_iter=1000)
general_clf.fit(X_train, y_train)
y_pred = general_clf.predict(X_test)
print("\nGeneral classifier report:")
print(classification_report(y_test, y_pred))


General classifier report:
                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [17]:
# Persist the sentence encoder, the general classifier and the per-cluster
# classifiers. `os` and `joblib` come from the import cell at the top.
# Create the target directory first: without it the first dump raises
# FileNotFoundError only after all of the training above has already run.
os.makedirs("../models", exist_ok=True)
joblib.dump(model, "../models/sentence_transformer.pkl")
joblib.dump(general_clf, "../models/general_logistic_regression.pkl")
joblib.dump(per_cluster_models, "../models/per_cluster_models.pkl")

['../models/per_cluster_models.pkl']