In [43]:
import pandas as pd

df = pd.read_csv('dataset/synthetic_logs.csv')
df

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert
...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert


In [44]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [45]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

Let's implement log message clustering using DBSCAN algorithm with sentence transformers for text embedding. We'll:
1. Install required packages
2. Create embeddings using sentence-transformers
3. Apply DBSCAN clustering
4. Analyze the results

In [46]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings
embeddings = model.encode(df['log_message'].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 76/76 [01:11<00:00,  1.07it/s]


In [47]:
# Apply DBSCAN clustering
clustering = DBSCAN(eps=0.5, min_samples=5).fit(embeddings)

# Add cluster labels to dataframe
df['cluster'] = clustering.labels_
df

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0
...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,-1
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,9
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,-1


In [48]:
# Show clusters distribution
print(f"Number of unique clusters: {len(np.unique(clustering.labels_))}")
print("\nCluster distribution:")
print(df['cluster'].value_counts().sort_index())

Number of unique clusters: 29

Cluster distribution:
cluster
-1     548
 0     809
 1      42
 2      53
 3      84
 4      60
 5      31
 6      15
 7      99
 8      86
 9     206
 10     10
 11     48
 12     42
 13     58
 14     14
 15     29
 16     51
 17     21
 18     15
 19      7
 20     20
 21     17
 22      7
 23      8
 24      6
 25     11
 26      6
 27      7
Name: count, dtype: int64


In [49]:
# Display sample messages from each cluster
for cluster in sorted(df['cluster'].unique()):
    print(f"\nCluster {cluster}:")
    sample_messages = df[df['cluster'] == cluster]['log_message'].sample(
        min(3, len(df[df['cluster'] == cluster]))).values
    for msg in sample_messages:
        print(f"- {msg}")


Cluster -1:
- Email provider experienced a mail delivery issue
- Kernel panic terminated system boot sequence
- Account with ID 1724 created by User232.

Cluster 0:
- nova.osapi_compute.wsgi.server [req-7f1448b8-38fe-4dc3-b1d7-ca0058b598bd 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1910 time: 0.2568409
- nova.osapi_compute.wsgi.server [req-9339970e-e303-4833-a6b6-31e4b360624c 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" Status code -  200 len: 1759 time: 0.2833371
- nova.osapi_compute.wsgi.server [req-64997958-5681-4d81-9842-7761830aba34 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2691131

Cluster 1:
- Data replication for 

Note: 
- Cluster -1 represents noise points (outliers)
- You may need to adjust eps and min_samples parameters of DBSCAN based on your specific needs
- The chosen model 'all-MiniLM-L6-v2' is a good balance between speed and performance


Let's create a regex-based log classification system based on the target labels from the synthetic dataset.
First, we'll analyze the labels and create patterns for each category.

In [69]:
import re

# Define regex patterns for each log category
log_patterns = {
    r"User User\d+ logged (in|out).": "User Action",
    r"Backup (started|ended) at .*": "System Notification",
    r"Backup completed successfully.": "System Notification",
    r"System updated to version .*": "System Notification",
    r"File .* uploaded successfully by user .*": "System Notification",
    r"Disk cleanup completed successfully.": "System Notification",
    r"System reboot initiated by user .*": "System Notification",
    r"Account with ID .* created by .*": "User Action"
}

def classify_log(message):
    for pattern, category in log_patterns.items():
        if re.search(pattern, message):
            return category
    return None

In [70]:
# Apply classification to the dataset
df['regex_label'] = df['log_message'].apply(classify_log)

# Calculate accuracy
accuracy = (df['regex_label'] == df['target_label']).mean()
print(f"Classification Accuracy: {accuracy:.2%}")

# Show confusion matrix
confusion = pd.crosstab(df['target_label'], df['regex_label'],
                        margins=True)
confusion

Classification Accuracy: 20.75%


regex_label,System Notification,User Action,All
target_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
System Notification,356,0,356
User Action,0,144,144
All,356,144,500


In [72]:
classify_log("User User123 logged in.")

'User Action'

In [73]:
df[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,2,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,2,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,4,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,5,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,7,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,13,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,16,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,16,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,8,System Notification


In [74]:
df[df['regex_label'].isnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,-1,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,9,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,-1,


we can see that we matched just rows with regex complexity

Stage 2: Classification Using Embeddings

In [75]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1910, 7)

In [76]:
df_legacy = df_non_regex[df_non_regex.source=="LegacyCRM"]
df_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,llm,-1,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,-1,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,llm,-1,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,llm,-1,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,-1,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,llm,-1,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,llm,-1,


In [77]:
df_non_legacy = df_non_regex[df_non_regex.source!="LegacyCRM"]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,-1,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,9,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,-1,


In [78]:
df_non_legacy.shape

(1903, 7)