In [1]:
import pandas as pd
from torch.ao.quantization.backend_config.onednn import embedding_op_dtype_configs

# Load the synthetic log dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../../../../datasets/synthetic_logs.csv")
df.head(n=5)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [2]:
# List the distinct values of the `source` column.
df["source"].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [3]:
# List the distinct values of the `target_label` column.
df["target_label"].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [4]:
# Preview a random sample of rows labelled "System Notification".
# random_state pins the draw so a Restart & Run All shows the same rows every time.
df[df["target_label"] == "System Notification"].sample(n=10, random_state=42)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
367,6/18/2025 12:21,ThirdPartyAPI,Backup ended at 2025-08-18 14:29:26.,System Notification,regex
988,9/11/2025 22:23,BillingSystem,System reboot initiated by user User427.,System Notification,regex
1683,12/13/2025 5:35,ModernCRM,Backup completed successfully.,System Notification,regex
1040,11/25/2025 17:03,AnalyticsEngine,File data_9838.csv uploaded successfully by us...,System Notification,regex
865,2/25/2025 1:40,AnalyticsEngine,System reboot initiated by user User964.,System Notification,regex
670,8/2/2025 5:31,BillingSystem,Backup completed successfully.,System Notification,regex
2036,6/11/2025 2:53,ModernHR,System updated to version 2.4.9.,System Notification,regex
1902,4/21/2025 17:32,AnalyticsEngine,Backup completed successfully.,System Notification,regex
760,1/28/2025 17:31,ThirdPartyAPI,Backup started at 2025-02-03 22:33:16.,System Notification,regex
237,11/10/2025 6:09,BillingSystem,Backup ended at 2025-06-01 06:27:38.,System Notification,regex


In [5]:
# Preview a random sample of messages that begin with the reboot template.
# na=False treats missing log messages as non-matching so the mask stays purely
# boolean; random_state pins the draw so re-runs show the same rows.
df[
    df["log_message"].str.startswith("System reboot initiated by user", na=False)
].sample(n=10, random_state=42)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
896,7/28/2025 11:24,BillingSystem,System reboot initiated by user User765.,System Notification,regex
668,9/5/2025 7:14,ModernHR,System reboot initiated by user User297.,System Notification,regex
2228,9/3/2025 11:24,ModernHR,System reboot initiated by user User179.,System Notification,regex
1524,11/30/2025 2:39,ThirdPartyAPI,System reboot initiated by user User278.,System Notification,regex
1275,7/15/2025 23:37,BillingSystem,System reboot initiated by user User829.,System Notification,regex
1663,10/27/2025 22:04,AnalyticsEngine,System reboot initiated by user User315.,System Notification,regex
2043,9/12/2025 20:20,ThirdPartyAPI,System reboot initiated by user User262.,System Notification,regex
1852,1/26/2025 12:34,AnalyticsEngine,System reboot initiated by user User223.,System Notification,regex
1803,8/22/2025 6:30,AnalyticsEngine,System reboot initiated by user User204.,System Notification,regex
714,9/25/2025 23:35,ThirdPartyAPI,System reboot initiated by user User600.,System Notification,regex


## Clustering

In [6]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the sentence-embedding model by name, then encode the `log_message`
# column (converted to a plain Python list) into an embedding array.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df["log_message"].tolist())

In [10]:
# Show the first five embedding rows as a sanity check.
embeddings[0:5]

array([[-0.10293962,  0.03354593, -0.02202606, ...,  0.00457793,
        -0.04259717,  0.00322622],
       [ 0.00804573, -0.03573924,  0.04938739, ...,  0.01538321,
        -0.06230951, -0.02774664],
       [-0.00908221,  0.13003927, -0.05275569, ...,  0.02014104,
        -0.05117096, -0.02930295],
       [-0.09751041,  0.04911302, -0.03977425, ...,  0.02477501,
        -0.03546083, -0.00018598],
       [-0.10468338,  0.05926034, -0.02488498, ...,  0.02502053,
        -0.03719296, -0.02568912]], shape=(5, 384), dtype=float32)

In [11]:
# Cluster the message embeddings with DBSCAN using cosine distance.
clustering = DBSCAN(metric="cosine", eps=0.2, min_samples=1)
clustering.fit(embeddings)
# Store each row's cluster label on the frame as the `cluster` column.
df["cluster"] = clustering.labels_

In [12]:
# Preview the first five rows, now including the `cluster` column.
df.head(n=5)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0
