In [4]:
import pandas as pd

In [5]:
# Read the synthetic log dataset from disk and preview its first five rows.
df = pd.read_csv('dataset/synthetic_logs.csv')
df.head(n=5)


Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [6]:
# Distinct values of the `source` column.
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [7]:
# Distinct values of the `target_label` column.
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [8]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip




In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

In [15]:
# Turn every log message into a sentence embedding.
model = SentenceTransformer('all-MiniLM-L6-v2')
log_messages = df.log_message.to_list()
embeddings = model.encode(log_messages)

# Cluster the embeddings by cosine distance; with min_samples=1 a single
# message is allowed to form a cluster on its own.
dbscan = DBSCAN(
    eps=0.2,
    min_samples=1,
    metric='cosine',
)
clusters = dbscan.fit_predict(embeddings)

# Store the cluster id next to each log row and display the new column.
df['cluster'] = clusters
df['cluster']

0       0
1       1
2       2
3       0
4       0
       ..
2405    0
2406    7
2407    0
2408    1
2409    7
Name: cluster, Length: 2410, dtype: int64

In [17]:
# Show up to 25 rows that were assigned to cluster 5.
df.loc[df['cluster'] == 5].head(n=25)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5
26,2025-03-03 17:11:11,ModernCRM,nova.compute.claims [req-d6986b54-3735-4a42-90...,Resource Usage,bert,5
40,2025-06-19 21:42:34,ThirdPartyAPI,nova.compute.claims [req-72b4858f-049e-49e1-b3...,Resource Usage,bert,5
58,2025-09-13 14:45:14,AnalyticsEngine,nova.compute.claims [req-5c8f52bd-8e3c-41f0-95...,Resource Usage,bert,5
61,2025-04-27 11:18:18,ThirdPartyAPI,nova.compute.claims [req-d38f479d-9bb9-4276-96...,Resource Usage,bert,5
64,2025-06-20 23:40:51,ModernHR,nova.compute.claims [req-d82fab16-60f8-4c9f-bd...,Resource Usage,bert,5
109,2025-02-03 06:35:20,AnalyticsEngine,nova.compute.claims [req-868a5460-dbb6-416b-b4...,Resource Usage,bert,5
138,2025-06-26 02:46:29,AnalyticsEngine,nova.compute.claims [req-2d658d2c-7eff-414e-a6...,Resource Usage,bert,5
152,2025-03-29 08:14:30,ThirdPartyAPI,nova.compute.claims [req-29a09cdb-3169-4c40-8b...,Resource Usage,bert,5
177,2025-11-03 04:49:47,ModernCRM,nova.compute.claims [req-9118475d-6e72-48fa-9d...,Resource Usage,bert,5


In [18]:
# Size of each cluster, restricted to clusters holding more than 10 logs.
cluster_counts = df.cluster.value_counts()
filtered_clusters = cluster_counts.loc[cluster_counts.gt(10)]
print(filtered_clusters)

cluster
0     1017
5      147
11     100
13      86
7       60
8       60
21      58
3       57
4       53
17      52
32      51
6       51
16      48
20      48
9       44
1       39
10      30
34      25
14      20
53      20
52      20
18      17
25      13
42      13
59      12
26      11
Name: count, dtype: int64


In [36]:
import re
# Ordered (compiled pattern, label) rules, built once instead of on every call.
# Sentence-ending periods are escaped so they match a literal "." only; left
# unescaped they matched any character (e.g. "...successfully, then failed").
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using fixed, case-insensitive regex rules.

    Rules are tried in declaration order and the label of the first rule
    found anywhere in the message is returned.

    Parameters
    ----------
    log_message : str
        Raw log text. Any non-string value (e.g. NaN/None coming from a
        missing cell) is treated as "no match" instead of raising TypeError.

    Returns
    -------
    str or None
        "User Action" or "System Notification" when a rule matches,
        otherwise None.
    """
    # Missing values reach this function when it is applied over a column.
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

In [37]:
# Quick manual check of the regex classifier on a known message.
classify_with_regex(log_message="Disk cleanup completed successfully.")

'System Notification'

In [38]:
# Run the regex classifier over every log message; unmatched rows get None.
df['regex_label'] = df.log_message.apply(classify_with_regex)

In [39]:
# Preview the first 30 rows, including the new regex_label column.
df.head(n=30)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
5,2025-10-09 10:30:31,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,bert,0,
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,bert,3,
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,System Notification
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5,
9,2025-03-30 04:01:45,ModernHR,nova.osapi_compute.wsgi.server [req-2bf7cfee-a...,HTTP Status,bert,0,


In [41]:
# Independent copy of the rows that no regex rule could label.
df_non_regex = df.loc[df.regex_label.isna()].copy()
df_non_regex

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [45]:
# Rows for which a regex rule produced a label.
df.loc[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,System Notification


In [46]:
# Among rows without a regex label, list target labels with 5 or fewer samples.
target_label_counts = df_non_regex.target_label.value_counts()
filtered_target_labels = target_label_counts.loc[target_label_counts.le(5)]
print(filtered_target_labels)

target_label
Workflow Error         4
Name: count, dtype: int64


In [53]:
# Rows without a regex label whose target label is 'Deprecation Warning'.
df_non_regex.loc[df_non_regex.target_label.eq('Deprecation Warning')]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,48,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,llm,122,


In [54]:
# Drop every row whose source is LegacyCRM, then list the remaining sources.
df_non_legacy = df_non_regex.loc[df_non_regex['source'].ne('LegacyCRM')]
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [56]:
# Embed only the messages that remain after the regex and LegacyCRM filters.
filtered_embeddings = model.encode(df_non_legacy.log_message.to_list())


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Features are the sentence embeddings; targets are the dataset's labels.
X = filtered_embeddings
y = df_non_legacy.target_label

# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit a logistic-regression classifier on the training embeddings.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Score the held-out rows and print per-class precision/recall/F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.92      1.00      0.96        54
         Error       0.98      0.91      0.94        55
   HTTP Status       1.00      1.00      1.00       333
Resource Usage       1.00      1.00      1.00        55
Security Alert       1.00      0.99      1.00       131

      accuracy                           0.99       628
     macro avg       0.98      0.98      0.98       628
  weighted avg       0.99      0.99      0.99       628



In [61]:
import os

import joblib

# Create the output directory first: joblib.dump does not create missing
# parent directories and would fail on a checkout without `models/`.
os.makedirs('models', exist_ok=True)

# Persist the trained classifier; the last expression displays the list of
# written file names.
joblib.dump(clf, 'models/log_classifier.joblib')

['models/log_classifier.joblib']