In [37]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split    
from sklearn.metrics import classification_report
import joblib
import os


In [2]:
# Load the labelled log dataset from a CSV path relative to the working directory.
df = pd.read_csv('dataset/synthetic_logs.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [4]:
# List the distinct values of the `source` column.
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [5]:
# List the distinct target labels (used later as the classification target).
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message; row i of `embeddings` corresponds to row i of `df`
# because the messages are passed in DataFrame row order.
embeddings = model.encode(df['log_message'].tolist())   

# Peek at the first three embedding vectors.
embeddings[:3]

array([[-0.10293972,  0.03354599, -0.02202604, ...,  0.00457789,
        -0.04259715,  0.00322623],
       [ 0.00804573, -0.03573925,  0.04938737, ...,  0.01538317,
        -0.06230951, -0.02774668],
       [-0.00908224,  0.13003926, -0.05275577, ...,  0.02014109,
        -0.05117095, -0.02930295]], shape=(3, 384), dtype=float32)

In [7]:
# Group similar log messages by cosine distance between their embeddings.
# min_samples=1 lets a single message form its own cluster.
dbscan = DBSCAN(eps=0.2, min_samples=1,metric='cosine')
# One cluster id per row of `embeddings`, in the same order.
clusters = dbscan.fit_predict(embeddings)

In [8]:
# Attach each row's cluster id to `df` (adds/overwrites the `cluster` column in place).
df['cluster'] = clusters

In [9]:
# Preview the rows again, now including the `cluster` column.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [10]:
# List the distinct cluster ids assigned by DBSCAN.
df.cluster.unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135])

In [11]:
# Inspect the rows assigned to cluster 0.
df[df.cluster == 0]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
5,2025-10-09 10:30:31,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,0
9,2025-03-30 04:01:45,ModernHR,nova.osapi_compute.wsgi.server [req-2bf7cfee-a...,HTTP Status,0
...,...,...,...,...,...
2399,2025-03-08 06:23:00,ThirdPartyAPI,nova.metadata.wsgi.server [req-ba29717b-249a-4...,HTTP Status,0
2401,2025-12-05 15:51:51,ModernCRM,nova.osapi_compute.wsgi.server [req-4bdf00b0-3...,HTTP Status,0
2404,2025-09-18 02:18:30,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-2c9c783f-3...,HTTP Status,0
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0


In [12]:
# Inspect the rows assigned to cluster 5.
df[df.cluster == 5]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,5
26,2025-03-03 17:11:11,ModernCRM,nova.compute.claims [req-d6986b54-3735-4a42-90...,Resource Usage,5
40,2025-06-19 21:42:34,ThirdPartyAPI,nova.compute.claims [req-72b4858f-049e-49e1-b3...,Resource Usage,5
58,2025-09-13 14:45:14,AnalyticsEngine,nova.compute.claims [req-5c8f52bd-8e3c-41f0-95...,Resource Usage,5
61,2025-04-27 11:18:18,ThirdPartyAPI,nova.compute.claims [req-d38f479d-9bb9-4276-96...,Resource Usage,5
...,...,...,...,...,...
2336,2025-12-10 11:53:33,AnalyticsEngine,nova.compute.claims [req-97fcea79-42f7-4241-9b...,Resource Usage,5
2345,2025-12-22 01:38:48,BillingSystem,nova.compute.claims [req-caeb3818-dab6-4e8d-9e...,Resource Usage,5
2352,2025-02-18 00:16:44,ModernCRM,nova.compute.claims [req-98474cd9-61e1-4afe-bd...,Resource Usage,5
2355,2025-11-28 18:03:55,BillingSystem,nova.compute.claims [req-6f9ecdfe-481c-4535-9b...,Resource Usage,5


In [13]:
# Count rows per cluster and keep only the clusters with more than 10 members.
cluster_counts = df.cluster.value_counts()
large_clusters = cluster_counts[cluster_counts > 10].index

# Print up to five sample messages from each large cluster so recurring
# message templates can be spotted by eye.
for cluster in large_clusters:
    sample_messages = df.loc[df.cluster == cluster, "log_message"].head(5)
    print(f'Cluster {cluster}')
    print(sample_messages.to_string(index=False))
    print()

Cluster 0
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 5
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...

Cluster 11
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 13
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

Cluster 7
Multiple bad login attempts detected on user 85...
Multiple login failures occurred on user 9052 a...
  User 7153 

In [14]:
# Ordered (compiled pattern, label) rules. They are compiled once at definition
# time instead of being rebuilt on every call. The sentence-ending periods are
# escaped so they match a literal "." rather than any character.
_REGEX_RULES = [
    (re.compile(r"User User\d+ logged (in|out)\.", re.IGNORECASE), "User Action"),
    (re.compile(r"Backup (started|ended) at .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Backup completed successfully\.", re.IGNORECASE), "System Notification"),
    (re.compile(r"System updated to version .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"File .* uploaded successfully by user .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Disk cleanup completed successfully\.", re.IGNORECASE), "System Notification"),
    (re.compile(r"System reboot initiated by user .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Account with ID .* created by .*", re.IGNORECASE), "User Action"),
]


def classify_with_regex(log_message):
    """Label a log message using fixed, case-insensitive regex rules.

    The rules are tried in order and the label of the first rule that matches
    anywhere in the message is returned.

    Parameters
    ----------
    log_message : str
        Raw log text. Non-string values (for example ``None`` or the NaN that
        pandas uses for a missing cell) are treated as unclassifiable.

    Returns
    -------
    str or None
        ``"User Action"`` or ``"System Notification"`` when a rule matches,
        otherwise ``None``.
    """
    # Guard against missing values: re would raise TypeError on a float NaN.
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

In [15]:
# Sanity-check the regex classifier on a sample login message.
classify_with_regex("User User123 logged in.")

'User Action'

In [17]:
# Apply the regex rules to every message; rows no rule matches get None.
df["regex_label"] = df.log_message.apply(classify_with_regex)

In [21]:
# Total number of rows and columns.
df.shape

(2410, 6)

In [22]:
# Number of rows the regex rules did NOT classify.
df[df.regex_label.isna()].shape

(1910, 6)

In [23]:
# Number of rows the regex rules classified.
df[df.regex_label.notnull()].shape

(500, 6)

In [19]:
# Preview rows that received a regex label, next to their true target_label.
df[df.regex_label.notnull()].head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,11,User Action


In [20]:
# Preview rows that the regex rules left unlabelled.
df[df.regex_label.isna()].head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,


In [24]:
# Rows the regex rules could not label; copied so later edits do not touch `df`.
regex_missed = df["regex_label"].isna()
df_non_regex = df.loc[regex_missed].copy()

In [25]:
# Records per target label among the rows the regex rules did not classify;
# some labels have very few examples.
df_non_regex.target_label.value_counts()

target_label
HTTP Status            1017
Security Alert          371
Error                   177
Resource Usage          177
Critical Error          161
Workflow Error            4
Name: count, dtype: int64

In [26]:
# Inspect the rows labelled "Workflow Error".
df[df.target_label == "Workflow Error"]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,24,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,62,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,105,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,133,


In [27]:
# Inspect the rows labelled "Deprecation Warning".
df[df.target_label == "Deprecation Warning"]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,48,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,122,


In [28]:
# Keep only the unlabelled rows whose source is not LegacyCRM.
is_not_legacy = df_non_regex["source"] != "LegacyCRM"
df_non_legacy = df_non_regex.loc[is_not_legacy]

In [29]:
# Confirm LegacyCRM no longer appears among the remaining sources.
df_non_legacy.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [32]:
# `embeddings` is a NumPy array ordered like the rows of `df`, so it has to be
# indexed by row *position*. Translate the surviving index labels into
# positions explicitly instead of assuming that labels equal positions, which
# only holds while `df` keeps its default 0..n-1 index.
row_positions = df.index.get_indexer(df_non_legacy.index)
# get_indexer returns -1 for labels it cannot find; fail loudly in that case.
assert (row_positions >= 0).all(), "df_non_legacy contains rows that are not in df"
filtered_embeddings = embeddings[row_positions]
filtered_embeddings[:3]

array([[-0.10293972,  0.03354599, -0.02202604, ...,  0.00457789,
        -0.04259715,  0.00322623],
       [ 0.00804573, -0.03573925,  0.04938737, ...,  0.01538317,
        -0.06230951, -0.02774668],
       [-0.00908224,  0.13003926, -0.05275577, ...,  0.02014109,
        -0.05117095, -0.02930295]], shape=(3, 384), dtype=float32)

In [35]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    filtered_embeddings,
    df_non_legacy.target_label,
    test_size=0.3,
    random_state=101,
)

# Fit a logistic-regression classifier on the training embeddings.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows and print per-class precision, recall and F1.
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

Critical Error       0.96      1.00      0.98        51
         Error       1.00      0.96      0.98        48
   HTTP Status       1.00      1.00      1.00       296
Resource Usage       1.00      1.00      1.00        54
Security Alert       1.00      1.00      1.00       122

      accuracy                           1.00       571
     macro avg       0.99      0.99      0.99       571
  weighted avg       1.00      1.00      1.00       571



In [48]:
# Resolve the project root as the parent of the nearest directory *named*
# "training" on the current working directory's path. Matching a whole path
# component (rather than splitting the path string on the substring
# "training") avoids truncating the path when an unrelated folder merely
# contains that word. If no such directory exists, fall back to the working
# directory itself.
project_root = os.path.abspath(os.getcwd())
search_dir = project_root
while True:
    parent_dir, dir_name = os.path.split(search_dir)
    if dir_name == "training":
        project_root = parent_dir
        break
    if parent_dir == search_dir:  # reached the filesystem root
        break
    search_dir = parent_dir

# Save the trained classifier under <project_root>/models, creating it if needed.
models_folder_path = os.path.join(project_root, "models")
os.makedirs(models_folder_path, exist_ok=True)
model_file_path = os.path.join(models_folder_path, "log_classifier_LR_model.joblib")
joblib.dump(lr, model_file_path)

['c:\\Python\\LogClassificationSystem\\models\\log_classifier_LR_model.joblib']

In [42]:
# Show the current working directory (IPython automagic for %pwd).
pwd

'c:\\Python\\LogClassificationSystem\\training'

'c:\\Python\\LogClassificationSystem\\models'