In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('dataset/synthetic_logs.csv')

In [3]:
df.head(7)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert
5,2025-10-09 10:30:31,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,bert
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,bert


In [4]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [5]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
from sklearn.cluster import DBSCAN

In [8]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['log_message'].tolist())

In [9]:
dbscan = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
clusters = dbscan.fit_predict(embeddings)

In [10]:
embeddings[0]

array([-1.02939621e-01,  3.35459411e-02, -2.20260732e-02,  1.55101740e-03,
       -9.86917876e-03, -1.78956270e-01, -6.34409785e-02, -6.01761639e-02,
        2.81109158e-02,  5.99620491e-02, -1.72618348e-02,  1.43363548e-03,
       -1.49560034e-01,  3.15287686e-03, -5.66030927e-02,  2.71685235e-02,
       -1.49891041e-02, -3.54037657e-02, -3.62936445e-02, -1.45410765e-02,
       -5.61491773e-03,  8.75539035e-02,  4.55120578e-02,  2.50963885e-02,
        1.00187510e-02,  1.24267349e-02, -1.39923573e-01,  7.68696293e-02,
        3.14095505e-02, -4.15247958e-03,  4.36902344e-02,  1.71250012e-02,
       -8.00951198e-02,  5.74006326e-02,  1.89091656e-02,  8.55262503e-02,
        3.96398641e-02, -1.34371817e-01, -1.44360063e-03,  3.06704035e-03,
        1.76854044e-01,  4.44885530e-03, -1.69274509e-02,  2.24266481e-02,
       -4.35049310e-02,  6.09034160e-03, -9.98169929e-03, -6.23972900e-02,
        1.07372422e-02, -6.04895083e-03, -7.14660808e-02, -8.45799781e-03,
       -3.18019874e-02, -

In [11]:
df['cluster'] = clusters

In [12]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,1


In [13]:
df[df.cluster == 5]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
20,2025-12-27 12:52:25,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-033d97b9-6...,HTTP Status,bert,5
63,2025-01-19 16:01:21,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-d6e9cfb8-d...,HTTP Status,bert,5
65,2025-11-21 01:36:24,BillingSystem,nova.osapi_compute.wsgi.server [req-8c36e637-8...,HTTP Status,bert,5
70,2025-09-30 20:20:15,BillingSystem,nova.osapi_compute.wsgi.server [req-0b7fefce-6...,HTTP Status,bert,5
157,2025-01-25 14:48:02,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-60f50a9d-8...,HTTP Status,bert,5
257,2025-12-18 00:05:43,AnalyticsEngine,nova.osapi_compute.wsgi.server [req-4beefba4-a...,HTTP Status,bert,5
272,2025-10-11 14:16:33,ModernCRM,nova.osapi_compute.wsgi.server [req-f1165387-3...,HTTP Status,bert,5
375,2025-10-07 17:58:42,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-dedb4b73-1...,HTTP Status,bert,5
376,2025-04-07 01:54:43,ModernCRM,nova.osapi_compute.wsgi.server [req-d0d1ead8-1...,HTTP Status,bert,5
566,2025-08-23 01:09:26,ModernCRM,nova.osapi_compute.wsgi.server [req-f34ca27d-6...,HTTP Status,bert,5


In [14]:
cluster_count = df['cluster'].value_counts()
cluster_count

cluster
-1     1111
 0      406
 2      159
 1      143
 4       60
 13      57
 15      52
 14      51
 12      44
 5       40
 8       31
 10      21
 11      21
 21      21
 22      20
 7       19
 18      19
 16      19
 3       19
 6       15
 9       15
 17      15
 19      15
 26      11
 20       8
 23       7
 25       6
 24       5
Name: count, dtype: int64

In [15]:
large_clusters = cluster_count[cluster_count > 10].index

for cluster in large_clusters:
    print(f"Cluster {cluster} :")
    print(df[df['cluster'] == cluster]['log_message'].head(5).to_string(index=False))
    print()

Cluster -1 :
    Email service experiencing issues with sending
         Unauthorized access to data was attempted
         Shard 6 replication task ended in failure
File data_6169.csv uploaded successfully by use...
          Email server encountered a sending fault

Cluster 0 :
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-d4f8d0c2-4f...
nova.osapi_compute.wsgi.server [req-6fe0e366-f2...
nova.osapi_compute.wsgi.server [req-945d1f31-a2...

Cluster 2 :
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-5e6e042b-f9...
nova.osapi_compute.wsgi.server [req-4d05bae9-8a...
nova.osapi_compute.wsgi.server [req-9174a757-01...
nova.osapi_compute.wsgi.server [req-b2ffcdcc-26...

Cluster 1 :
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...
nova.osapi_compute.wsgi.server [req-5f1c2027-e1...
nova.osapi_compute.wsgi.server

In [16]:
import re

def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out).*": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.*": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.*": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }

    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    return "None"



In [17]:
classify_with_regex("User User536 logged OUT.")

'User Action'

In [18]:
classify_with_regex("Hey, I am Shagun")

'None'

In [19]:
df['regex_label'] = df['log_message'].apply(classify_with_regex)

In [20]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,1,


In [21]:
df.tail()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,2,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,-1,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,-1,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,-1,
2409,12/25/2025 13:21,AnalyticsEngine,Repeated failed login attempts occurred for us...,Security Alert,bert,-1,


In [22]:
df[df['regex_label'] == 'User Action']

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,-1,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,-1,User Action
57,9/14/2025 3:03,AnalyticsEngine,User User395 logged in.,User Action,regex,-1,User Action
85,3/13/2025 2:11,ModernHR,User User225 logged in.,User Action,regex,-1,User Action
88,3/8/2025 19:04,AnalyticsEngine,User User494 logged out.,User Action,regex,-1,User Action
...,...,...,...,...,...,...,...
2207,10/4/2025 8:06,ModernCRM,User User495 logged in.,User Action,regex,-1,User Action
2263,2/27/2025 14:40,AnalyticsEngine,User User429 logged out.,User Action,regex,-1,User Action
2275,3/13/2025 17:17,AnalyticsEngine,User User755 logged out.,User Action,regex,-1,User Action
2323,12/1/2025 18:17,ThirdPartyAPI,User User882 logged out.,User Action,regex,-1,User Action


In [23]:
df[df.regex_label.notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,1,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,2,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,-1,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,-1,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,-1,


In [24]:
df_non_regex = df[df['regex_label'] == 'None'].copy()


In [25]:
df_non_regex

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,1,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,2,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,-1,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,-1,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,-1,


In [26]:
print(df_non_regex['target_label'].value_counts()[df_non_regex['target_label'].value_counts() <= 5].index.to_list())



In [None]:
df_non_legacy = df_non_regex[df_non_regex.source != 'LegacyCRM']
df_non_legacy.source.unique() 

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [28]:
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist())
filtered_embeddings[:2]

array([[-1.02939621e-01,  3.35459411e-02, -2.20260732e-02,
         1.55101740e-03, -9.86917876e-03, -1.78956270e-01,
        -6.34409785e-02, -6.01761639e-02,  2.81109158e-02,
         5.99620491e-02, -1.72618348e-02,  1.43363548e-03,
        -1.49560034e-01,  3.15287686e-03, -5.66030927e-02,
         2.71685235e-02, -1.49891041e-02, -3.54037657e-02,
        -3.62936445e-02, -1.45410765e-02, -5.61491773e-03,
         8.75539035e-02,  4.55120578e-02,  2.50963885e-02,
         1.00187510e-02,  1.24267349e-02, -1.39923573e-01,
         7.68696293e-02,  3.14095505e-02, -4.15247958e-03,
         4.36902344e-02,  1.71250012e-02, -8.00951198e-02,
         5.74006326e-02,  1.89091656e-02,  8.55262503e-02,
         3.96398641e-02, -1.34371817e-01, -1.44360063e-03,
         3.06704035e-03,  1.76854044e-01,  4.44885530e-03,
        -1.69274509e-02,  2.24266481e-02, -4.35049310e-02,
         6.09034160e-03, -9.98169929e-03, -6.23972900e-02,
         1.07372422e-02, -6.04895083e-03, -7.14660808e-0

In [29]:
X = filtered_embeddings
y = df_non_legacy['target_label']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [30]:
import joblib 

In [32]:
joblib.dump(clf, 'models/log_classifier_model.joblib')

['models/log_classifier_model.joblib']