In [9]:
import pandas as pd
import numpy as np
n = 10000
timestamps = pd.date_range(start="2025-09-26", periods=n, freq='min')

services = ['frontend', 'auth-service', 'database-service', 'cache-service']
event_types = ['SERVICE_RESTART', 'CONFIG_CHANGE', 'USER_LOGIN', 'DEPLOYMENT']
log_levels = ['INFO', 'WARN', 'ERROR']

In [15]:
import numpy as np
import pandas as pd

cpu_base = 18
mem_base = 1507

cpu_usage = cpu_base + np.random.randint(-5, 5, size=n)
mem_usage = mem_base + np.random.randint(-250, 250, size=n)
response_time = np.random.randint(100, 110, size=n)
services_arr = np.random.choice(services, size=n)

# Random spiles
is_anomaly = np.random.rand(n) < 0.05
cpu_usage[is_anomaly] += np.random.randint(50, 70, size=is_anomaly.sum())
response_time[is_anomaly] += np.random.randint(300, 500, size=is_anomaly.sum())
is_anomaly = is_anomaly.astype(np.int8)

metrics_df = pd.DataFrame({
    'timestamp': timestamps,
    'cpu_usage': cpu_usage.astype(np.int16),
    'mem_base': mem_usage.astype(np.int32),
    'response_time': response_time.astype(np.int16),
    'service': services_arr,
    'is_anomaly': is_anomaly
})
metrics_df

Unnamed: 0,timestamp,cpu_usage,mem_base,response_time,service,is_anomaly
0,2025-09-26 00:00:00,20,1375,106,cache-service,0
1,2025-09-26 00:01:00,18,1293,103,frontend,0
2,2025-09-26 00:02:00,14,1690,104,database-service,0
3,2025-09-26 00:03:00,19,1685,109,auth-service,0
4,2025-09-26 00:04:00,21,1535,108,cache-service,0
...,...,...,...,...,...,...
9995,2025-10-02 22:35:00,17,1422,106,frontend,0
9996,2025-10-02 22:36:00,77,1734,422,database-service,1
9997,2025-10-02 22:37:00,20,1502,105,cache-service,0
9998,2025-10-02 22:38:00,15,1623,105,database-service,0


In [17]:
levels = np.random.choice(log_levels, size=n, p=[0.7, 0.2, 0.1])
services_arr = np.random.choice(services, size=n)

# services = ['frontend', 'auth-service', 'database-service', 'cache-service']
# log_levels = ['INFO', 'WARN', 'ERROR']

messages = np.array([
    f"{s} operation successful" if l=="INFO" else 
    f"{s} high memory usage" if l=="WARN" else 
    f"{s} error occurred"
    for s, l in zip(services_arr, levels)
])

is_anomaly = (levels == 'ERROR').astype(np.int8)

log_df = pd.DataFrame({
    'timestamp': timestamps,
    'level': levels,
    'message': messages,
    'service': services_arr,
    'is_anomaly': is_anomaly
})
log_df

Unnamed: 0,timestamp,level,message,service,is_anomaly
0,2025-09-26 00:00:00,INFO,cache-service operation successful,cache-service,0
1,2025-09-26 00:01:00,INFO,auth-service operation successful,auth-service,0
2,2025-09-26 00:02:00,INFO,database-service operation successful,database-service,0
3,2025-09-26 00:03:00,ERROR,cache-service error occurred,cache-service,1
4,2025-09-26 00:04:00,INFO,frontend operation successful,frontend,0
...,...,...,...,...,...
9995,2025-10-02 22:35:00,INFO,database-service operation successful,database-service,0
9996,2025-10-02 22:36:00,ERROR,auth-service error occurred,auth-service,1
9997,2025-10-02 22:37:00,ERROR,cache-service error occurred,cache-service,1
9998,2025-10-02 22:38:00,INFO,cache-service operation successful,cache-service,0


In [18]:
event_arr = np.random.choice(event_types, size=n)
services_arr = np.random.choice(services, size=n)

# services = ['frontend', 'auth-service', 'database-service', 'cache-service']
# event_types = ['SERVICE_RESTART', 'CONFIG_CHANGE', 'USER_LOGIN', 'DEPLOYMENT']

is_anomaly = (event_arr == 'SERVICE_RESTART').astype(np.int8)

event_df = pd.DataFrame({
    'timestamp': timestamps,
    'event_type': event_arr,
    'service': services_arr,
    'is_anomaly': is_anomaly
})

event_df


Unnamed: 0,timestamp,event_type,service,is_anomaly
0,2025-09-26 00:00:00,USER_LOGIN,database-service,0
1,2025-09-26 00:01:00,CONFIG_CHANGE,database-service,0
2,2025-09-26 00:02:00,DEPLOYMENT,cache-service,0
3,2025-09-26 00:03:00,USER_LOGIN,auth-service,0
4,2025-09-26 00:04:00,SERVICE_RESTART,cache-service,1
...,...,...,...,...
9995,2025-10-02 22:35:00,USER_LOGIN,cache-service,0
9996,2025-10-02 22:36:00,SERVICE_RESTART,frontend,1
9997,2025-10-02 22:37:00,SERVICE_RESTART,frontend,1
9998,2025-10-02 22:38:00,DEPLOYMENT,frontend,0


In [20]:
trace_id = np.arange(1, n+1)
duration = np.random.randint(20, 50, size=(n, len(services)))

data = []
for idx, t in enumerate(timestamps):
    parent_id = None
    for s_idx, service in enumerate(services):
        # 20%  span is missing  consider missing span as anomaly
        missing_span = np.random.rand() < 0.2
        span_id = f"{trace_id[idx]}_{service}"
        if missing_span:
            data.append([t, trace_id[idx], span_id, parent_id, service, duration[idx, s_idx], 1])
            continue
        data.append([t, trace_id[idx], span_id, parent_id, service, duration[idx, s_idx], 0])
        parent_id = span_id

trace_df = pd.DataFrame(data, columns=['start_time','trace_id','span_id','parent_id','service','duration_ms','is_anomaly'])
trace_df

Unnamed: 0,start_time,trace_id,span_id,parent_id,service,duration_ms,is_anomaly
0,2025-09-26 00:00:00,1,1_frontend,,frontend,29,0
1,2025-09-26 00:00:00,1,1_auth-service,1_frontend,auth-service,48,1
2,2025-09-26 00:00:00,1,1_database-service,1_frontend,database-service,48,0
3,2025-09-26 00:00:00,1,1_cache-service,1_database-service,cache-service,47,0
4,2025-09-26 00:01:00,2,2_frontend,,frontend,24,0
...,...,...,...,...,...,...,...
39995,2025-10-02 22:38:00,9999,9999_cache-service,9999_frontend,cache-service,49,0
39996,2025-10-02 22:39:00,10000,10000_frontend,,frontend,38,0
39997,2025-10-02 22:39:00,10000,10000_auth-service,10000_frontend,auth-service,31,0
39998,2025-10-02 22:39:00,10000,10000_database-service,10000_auth-service,database-service,49,0


# Anamoly detection


### Isolation


In [12]:
pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [13]:
metrics_df

Unnamed: 0,timestamp,cpu_usage,mem_base,response_time
0,2022-04-13 00:00:00,21,1489,105
1,2022-04-13 00:01:00,17,1692,108
2,2022-04-13 00:02:00,14,1735,103
3,2022-04-13 00:03:00,14,1268,106
4,2022-04-13 00:04:00,21,1507,100
...,...,...,...,...
9995,2022-04-19 22:35:00,21,1371,106
9996,2022-04-19 22:36:00,20,1554,106
9997,2022-04-19 22:37:00,17,1483,101
9998,2022-04-19 22:38:00,17,1434,109


In [25]:
from sklearn.ensemble import  IsolationForest

# X = metric'response_time']]
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
a = metrics_df[['cpu_usage', 'mem_base', 'response_time']]
metrics_df['anamoly_iso'] = iso_forest.fit_predict(a)  # -1 = anomaly, 1 = normal
metrics_df['anamoly_iso'] = metrics_df['anamoly_iso'].map({1:0, -1:1})  # -1 = anomaly, 1 = normal

from sklearn.svm import OneClassSVM

oc_svm = OneClassSVM(nu=0.05, kernel='rbf', gamma='auto')
X = metrics_df[['cpu_usage', 'mem_base', 'response_time']]
metrics_df['anomaly_svm'] = oc_svm.fit_predict(X)
metrics_df['anomaly_svm'] = metrics_df['anomaly_svm'].map({1: 0, -1: 1})



from scipy.stats import zscore

metrics_df['cpu_z'] = zscore(metrics_df['cpu_usage'])
metrics_df['cpu_anomaly_stat'] = metrics_df['cpu_z'].abs() > 3
metrics_df['cpu_anomaly_stat'] = metrics_df['cpu_anomaly_stat'].map({1: 0, -1: 1})


In [26]:
metrics_df

Unnamed: 0,timestamp,cpu_usage,mem_base,response_time,service,is_anomaly,anamoly_iso,anomaly_svm,cpu_z,cpu_anomaly_stat
0,2025-09-26 00:00:00,20,1375,106,cache-service,0,0,0,-0.034101,
1,2025-09-26 00:01:00,18,1293,103,frontend,0,0,0,-0.184855,
2,2025-09-26 00:02:00,14,1690,104,database-service,0,0,0,-0.486363,
3,2025-09-26 00:03:00,19,1685,109,auth-service,0,0,0,-0.109478,
4,2025-09-26 00:04:00,21,1535,108,cache-service,0,0,0,0.041276,
...,...,...,...,...,...,...,...,...,...,...
9995,2025-10-02 22:35:00,17,1422,106,frontend,0,0,1,-0.260232,
9996,2025-10-02 22:36:00,77,1734,422,database-service,1,1,1,4.262393,
9997,2025-10-02 22:37:00,20,1502,105,cache-service,0,0,0,-0.034101,
9998,2025-10-02 22:38:00,15,1623,105,database-service,0,0,0,-0.410986,


In [28]:
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


def create_seq(data, seq_length=10):
    seq = []
    for i in range(len(data) - seq_length):
        seq.append(data[i : i+seq_length])
    return torch.tensor(seq, dtype=torch.float32)

X_seq = create_seq(X_scaled) # shape: (num_sequences, seq_length, features)
# print(X_seq.shape)

class LSTManEncoder(nn.Module):
    def __init__(self, input_feature, hidden_feature = 32):
        super().__init__()
        self.encoder = nn.LSTM(input_size=input_feature, hidden_size=hidden_feature, batch_first=True)
        self.decoder = nn.LSTM(input_size=hidden_feature, hidden_size=input_feature, batch_first=True)


    def forward(self, x):
        out, (h, c) = self.encoder(x)
        # print(h.shape)
        # print(x.shape)
        h = h.repeat(x.size(1), 1, 1).permute(1,0,2)
        output, _ = self.decoder(h)
        return output

model = LSTManEncoder(input_feature=X_seq.shape[2])
a = model(X_seq)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(10):
    optimizer.zero_grad()
    output = model(X_seq)
    # print(output.shape)
    # print(X_seq.shape)
    loss = loss_fn(output, X_seq)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    pred = model(X_seq)
    mse = ((pred - X_seq)**2).mean(dim=(1,2)).numpy()

threshold = mse.mean() + 3*mse.std()
print(threshold)
lstm_anomalies = np.where(mse > threshold)[0]
print(f"LSTMAutoencoder anamoloy :{len(lstm_anomalies)}")

  return torch.tensor(seq, dtype=torch.float32)


0.1405184
LSTMAutoencoder anamoloy :113


In [29]:
lstm_anomalies

array([ 639,  640,  641,  642,  674,  764,  765,  858,  859,  865, 1178,
       1179, 1180, 1181, 1182, 1183, 1184, 1185, 1253, 1254, 1255, 1256,
       1257, 1258, 1260, 1261, 2107, 2108, 2109, 2110, 2111, 2112, 2113,
       2114, 2466, 2627, 2656, 2657, 2948, 2949, 2950, 3001, 4222, 5131,
       5132, 5133, 5164, 5165, 5166, 5522, 5523, 5561, 5597, 5598, 5599,
       5600, 5601, 5602, 5942, 5943, 5944, 5945, 5948, 6018, 6019, 6020,
       6021, 6022, 7869, 7870, 7871, 8249, 8250, 8251, 8252, 8255, 8653,
       8654, 8655, 8735, 8736, 8737, 8738, 8739, 8740, 8742, 8842, 8843,
       8844, 8845, 8846, 8847, 8951, 8952, 8953, 8954, 8955, 8956, 8958,
       8959, 9030, 9301, 9302, 9303, 9304, 9305, 9375, 9853, 9854, 9855,
       9856, 9857, 9858])

In [17]:
service

'cache-service'

In [18]:

log_df['level_num'] = log_df['level'].map({'INFO':0, 'WARN':1, 'ERROR':2})

log_df['error_flag'] = log_df['message'].str.contains('error|fail|timeout', case=False).astype(int)


In [None]:
from sklearn.ensemble import  IsolationForest

# X = metric'response_time']]
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
a = log_df[['level_num', 'error_flag']]
log_df['anamoly_iso'] = iso_forest.fit_predict(a)  # -1 = anomaly, 1 = normal

from sklearn.svm import OneClassSVM

oc_svm = OneClassSVM(nu=0.05, kernel='rbf', gamma='auto')
X = log_df[['level_num', 'error_flag']]
log_df['anomaly_svm'] = oc_svm.fit_predict(X)


from scipy.stats import zscore

log_df['cpu_z'] = zscore(log_df['error_flag'])
log_df['cpu_anomaly_stat'] = log_df['cpu_z'].abs() > 3


In [None]:
log_df

Unnamed: 0,timestamp,level,message,level_num,error_flag,anamoly_iso,anomaly_svm,cpu_z,cpu_anomaly_stat
0,"DatetimeIndex(['2022-04-13 00:00:00', '2022-04...",INFO,frontend operation successful,0,0,1,1,-0.333889,False
1,2022-04-13 00:00:00,INFO,frontend operation successful,0,0,1,1,-0.333889,False
2,"DatetimeIndex(['2022-04-13 00:00:00', '2022-04...",INFO,frontend operation successful,0,0,1,1,-0.333889,False
3,2022-04-13 00:01:00,INFO,frontend operation successful,0,0,1,1,-0.333889,False
4,"DatetimeIndex(['2022-04-13 00:00:00', '2022-04...",INFO,cache-service operation successful,0,0,1,1,-0.333889,False
...,...,...,...,...,...,...,...,...,...
19995,2022-04-19 22:37:00,INFO,frontend operation successful,0,0,1,1,-0.333889,False
19996,"DatetimeIndex(['2022-04-13 00:00:00', '2022-04...",INFO,auth-service operation successful,0,0,1,1,-0.333889,False
19997,2022-04-19 22:38:00,INFO,auth-service operation successful,0,0,1,1,-0.333889,False
19998,"DatetimeIndex(['2022-04-13 00:00:00', '2022-04...",INFO,auth-service operation successful,0,0,1,1,-0.333889,False


In [33]:
metric_anamoly = metrics_df[metrics_df['anamoly_iso'] == 1]

In [34]:
metric_metadata = []
for _, row in metric_anamoly.iterrows():
    metadata = {
        "timestamp": row['timestamp'],
        "service": row.get('service','unknown'),  
        "component": "metric",
        "feature": "cpu_usage/mem_usage/response_time",  
        "value": row['cpu_usage'],
        "anomaly_type": "spike"
    }
    metric_metadata.append(metadata)

In [35]:
metric_metadata

[{'timestamp': Timestamp('2025-09-26 00:10:00'),
  'service': 'auth-service',
  'component': 'metric',
  'feature': 'cpu_usage/mem_usage/response_time',
  'value': 69,
  'anomaly_type': 'spike'},
 {'timestamp': Timestamp('2025-09-26 00:38:00'),
  'service': 'auth-service',
  'component': 'metric',
  'feature': 'cpu_usage/mem_usage/response_time',
  'value': 84,
  'anomaly_type': 'spike'},
 {'timestamp': Timestamp('2025-09-26 00:41:00'),
  'service': 'auth-service',
  'component': 'metric',
  'feature': 'cpu_usage/mem_usage/response_time',
  'value': 73,
  'anomaly_type': 'spike'},
 {'timestamp': Timestamp('2025-09-26 00:58:00'),
  'service': 'cache-service',
  'component': 'metric',
  'feature': 'cpu_usage/mem_usage/response_time',
  'value': 85,
  'anomaly_type': 'spike'},
 {'timestamp': Timestamp('2025-09-26 01:16:00'),
  'service': 'cache-service',
  'component': 'metric',
  'feature': 'cpu_usage/mem_usage/response_time',
  'value': 72,
  'anomaly_type': 'spike'},
 {'timestamp': Tim

In [38]:
import getpass
import os

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = "AIzaSyCpCk8y8l3IU08n9_u_EWajQv-pibrBdps"

In [39]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate

def get_cause_llm(metric_metadata):

    client = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

    prompt = """
    You are an expert in system monitoring and root cause analysis.
    Given the anomaly metadata below, suggest possible root causes.
    Metadata: {metadata}
        Return the response as structured JSON with fields:
    - root_cause
    - severity
    - suggested_action

    Answer in simple and short
    """

    template = PromptTemplate(template=prompt, input_variables=['metadata'])
    formatted_prompt = template.format(metadata=metric_metadata)

    result = client.invoke(formatted_prompt).content

    return result


In [42]:
import json
def get_cause_llm(metric_metadata):
    client = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

    prompt = """
        You are an expert in system monitoring and root cause analysis.
        Given the anomaly metadata below, suggest possible root causes.
        Metadata: {metadata}
        Return the response strictly in JSON format with fields:
        - root_cause
        - severity
        - suggested_action
        Also answer in short and simple in less words
    """

    template = PromptTemplate(template=prompt, input_variables=['metadata'])
    formatted_prompt = template.format(metadata=metric_metadata)

    print("\nExplaining the reason behind problem..............")
    raw_response = client.invoke(formatted_prompt).content

    try:
        cleaned = raw_response.strip().replace("```json", "").replace("```", "")
        parsed = json.loads(cleaned)  
        # print(json.dumps(parsed, indent=2))  
        return parsed
    except json.JSONDecodeError:
        print("Model didn't return proper JSON, raw response shown:")
        # print(raw_response)
        return {"raw_response": raw_response}

In [43]:
get_cause_llm(metric_metadata)


Explaining the reason behind problem..............


{'root_cause': 'Widespread resource contention and performance bottlenecks across core services (auth, cache, database, frontend), likely triggered by increased workload, inefficient code/queries, or underlying infrastructure limitations.',
 'severity': 'High',
 'suggested_action': 'Immediately review traffic patterns and recent deployments. Deep-dive into database and cache performance metrics, and consider scaling resources or optimizing application code.'}

In [1]:
print("Generating event data (vectorized)...")
service_col = services[np.random.randint(0, len(services), size=n)]
event_col = event_types[np.random.randint(0, len(event_types), size=n)]

df = pd.DataFrame({
    'timestamp': timestamp,
    'event_type': event_col,
    'service': service_col
})

Generating event data (vectorized)...


NameError: name 'services' is not defined