In [1]:
import pandas as pd

# Read the synthetic log dataset from the CSV file and display the resulting DataFrame.
df = pd.read_csv(filepath_or_buffer="synthetic_logs.csv")
df

Unnamed: 0,timestamp,source,log_message,target_label
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status
...,...,...,...,...
2405,13-08-2025 07:29,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status
2406,01-11-2025 05:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert
2407,03-08-2025 03:07,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status
2408,11-11-2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error


In [2]:
# Distinct source systems that the logs in this dataset come from:
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [3]:
# Distinct class labels that appear in the target_label column:
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
# Cluster the log messages with DBSCAN, using sentence-transformer embeddings as features:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np

# Pre-trained sentence-transformer model used to embed every log message:
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per log message, in the same order as the DataFrame rows:
embeddings = model.encode(
    df['log_message'].tolist(),
    show_progress_bar=True,
)

# DBSCAN with cosine distance (eps=0.2, min_samples=1); fit_predict returns one cluster id per row:
dbscan = DBSCAN(metric='cosine', min_samples=1, eps=0.2)
clusters = dbscan.fit_predict(embeddings)

# Store each row's cluster id next to its log message:
df['cluster'] = clusters

# Preview the first rows, now including the cluster column:
df.head()

Batches:   0%|          | 0/76 [00:00<?, ?it/s]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [8]:
# Number of rows assigned to each cluster
cluster_sizes = df['cluster'].value_counts()

# Largest clusters first
sorted_clusters = cluster_sizes.sort_values(ascending=False)

# Walk the clusters from largest to smallest and print a sample of each
for cluster_id, size in sorted_clusters.items():
  # Sizes are in descending order, so the first cluster with 10 or fewer records ends the report
  if size <= 10:
    break
  print(f"Cluster {cluster_id} has {size} records.")
  # Every log message that belongs to this cluster
  cluster_logs = df.loc[df['cluster'] == cluster_id, 'log_message'].tolist()
  # Show at most the first 5 messages of the cluster
  for log in cluster_logs[:5]:
    print(f"- {log}")

Cluster 0 has 1017 records.
- nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
- nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
- nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
- nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54

In [14]:
import re
# Ordered (compiled pattern, label) rules for classify_with_regex.
# - Compiled once here instead of being rebuilt and re-resolved on every call.
# - Literal full stops are escaped ("\.") so that they no longer match an arbitrary character.
# - Every pattern is case-insensitive; the first rule that matches wins.
_REGEX_RULES = tuple(
  (re.compile(pattern, re.IGNORECASE), label)
  for pattern, label in (
      (r"User User\d+ logged (in|out)\.", "User Action"),
      (r"Backup (started|ended) at .*", "System Notification"),
      (r"Backup completed successfully\.", "System Notification"),
      (r"System updated to version .*", "System Notification"),
      (r"File .* uploaded successfully by user .*", "System Notification"),
      (r"Disk cleanup completed successfully\.", "System Notification"),
      (r"System reboot initiated by user .*", "System Notification"),
      (r"Account with ID .* created by .*", "User Action"),
  )
)


def classify_with_regex(log_message):
  """Label a log message using a fixed list of regular-expression rules.

  Args:
    log_message: The raw log text. Anything that is not a ``str`` (for
      example ``None`` or a NaN coming from a missing cell) is treated as
      unclassifiable instead of raising ``TypeError`` inside the regex engine.

  Returns:
    The label ("User Action" or "System Notification") of the first rule
    whose pattern is found anywhere in ``log_message`` (case-insensitive),
    or ``None`` when no rule matches.
  """
  # Missing / non-text values cannot be searched; report them as unmatched.
  if not isinstance(log_message, str):
    return None

  # Rules are tried in declaration order; the first hit decides the label.
  for compiled_pattern, label in _REGEX_RULES:
    if compiled_pattern.search(log_message):
      return label
  return None

In [15]:
# Run the regex classifier over every log message and keep the result in a new regex_label column:
df['regex_label'] = (
    df['log_message']
    .apply(classify_with_regex)
)
df

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,
...,...,...,...,...,...,...
2405,13-08-2025 07:29,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,
2406,01-11-2025 05:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7,
2407,03-08-2025 03:07,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0,
2408,11-11-2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1,


In [17]:
# Rows that the regex classifier could not label (regex_label is missing), copied into a separate DataFrame:
df_non_regex = df.loc[df['regex_label'].isna()].copy()
df_non_regex.shape

(1910, 6)

In [22]:
# How many rows each target_label has among the logs the regex rules did not label
target_label_counts = df_non_regex['target_label'].value_counts()

# Labels that occur in 5 rows or fewer
target_labels_to_print = (
    target_label_counts
    .loc[lambda counts: counts <= 5]
    .index
    .tolist()
)

# Print a heading, then let the notebook display the list itself
print("Target labels with 5 or fewer rows in df_non_regex:")
target_labels_to_print

Target labels with 5 or fewer rows in df_non_regex:




In [24]:
# Both the workflow-error and the deprecation-warning classes were identified as coming from source == LegacyCRM.
# Keep every log whose source is not LegacyCRM:
df_non_legacy = df_non_regex.loc[df_non_regex['source'] != 'LegacyCRM']
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [25]:
# Embed the log messages that do not come from LegacyCRM. These classes/clusters have many examples each (>= 5), so a BERT-based model can be used to classify them.
filtered_embeddings = model.encode(
    df_non_legacy['log_message'].tolist(),
    show_progress_bar=True,
)

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Hold out 30% of the embedded logs for evaluation; the seed is fixed so the split is repeatable:
X_train, X_test, y_train, y_test = train_test_split(
    filtered_embeddings,
    df_non_legacy['target_label'],
    random_state=42,
    test_size=0.3,
)

# Fit a logistic-regression classifier on the training embeddings (fit returns the estimator itself):
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out set and print the per-class precision / recall / F1 report:
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [27]:
import joblib

# Persist the fitted classifier to log_clf.joblib:
joblib.dump(value=clf, filename='log_clf.joblib')

['log_clf.joblib']