In [None]:
!pip install pandas matplotlib scikit-learn google-generativeai

import os
import json
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random

import google.generativeai as genai
import re




In [None]:

# Read the API key from the environment so no credential is stored in the notebook.
# (The previous code sent the literal placeholder string "GOOGLE_API_KEY" as the key.)
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=_api_key)
# Module-level client used by label_logs_with_gemini().
model = genai.GenerativeModel("gemini-1.5-flash")


In [None]:
import random
import datetime
import pandas as pd
import numpy as np
import uuid

def generate_synthetic_logs(
    num_entries: int = 1000,
    start_time: datetime.datetime = None,
    end_time:   datetime.datetime = None,
    seed: int = None
) -> pd.DataFrame:
    """Generate a DataFrame of random, synthetic service log records.

    Parameters
    ----------
    num_entries : int
        Number of rows to generate. Zero (or a negative value) yields an empty
        frame that still carries the full column schema.
    start_time, end_time : datetime.datetime, optional
        Inclusive time range for the random timestamps. Defaults to the last hour.
    seed : int, optional
        When given, all randomness (including session ids) comes from private
        generators seeded with this value, so the output is reproducible and the
        global `random` / `np.random` state is left untouched. When omitted, the
        global generators and `uuid.uuid4()` are used, as before.

    Returns
    -------
    pd.DataFrame
        One row per log entry with the columns listed in `columns` below.

    Raises
    ------
    ValueError
        If `end_time` is earlier than `start_time`.
    """
    columns = [
        "timestamp", "service", "instance", "jms_queue", "elapsed_time",
        "log_level", "status_code", "message", "user_id", "session_id",
        "error_code", "error_message",
    ]

    now = datetime.datetime.now()
    if start_time is None:
        start_time = now - datetime.timedelta(hours=1)
    if end_time is None:
        end_time = now
    if end_time < start_time:
        raise ValueError("end_time must not be earlier than start_time")
    span_secs = (end_time - start_time).total_seconds()

    # Private generators when seeded; otherwise fall back to the module-level ones.
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.default_rng(seed)

    services    = ["Auth","Billing","Catalog","Orders"]
    queue_for   = {"Auth":"q_auth","Billing":"q_bill","Catalog":"q_cat","Orders":"q_ord"}
    servers     = ["srv1","srv2"]
    log_levels  = ["INFO","DEBUG","WARN","ERROR"]
    status_codes= [200,201,400,401,403,404,500,502,503]
    # Entries for services that are not in `services` are currently never selected.
    messages    = {
        "Auth":         ["User login succeeded","User login failed","Token expired","Invalid credentials"],
        "Billing":      ["Payment processed","Payment declined","Invoice generated","Billing timeout"],
        "Catalog":      ["Item lookup succeeded","Item not found","Catalog cache miss"],
        "Orders":       ["Order placed","Order failed","Order cancelled"],
        "Inventory":    ["Inventory check passed","Low stock warning","Stock updated"],
        "Shipping":     ["Shipment created","Delivery delayed","Tracking updated"],
        "Notification": ["Email sent","Push notification failed","SMS queued"]
    }

    records = []
    for _ in range(num_entries):
        ts = start_time + datetime.timedelta(seconds=py_rng.random() * span_secs)
        svc = py_rng.choice(services)
        # Normal(10, 8) clipped to [0.1, 120]; negative draws pile up at the 0.1 floor.
        elapsed = np.clip(np_rng.normal(loc=10, scale=8), 0.1, 120)
        lvl = py_rng.choices(log_levels, weights=[0.6,0.1,0.2,0.1])[0]
        status = py_rng.choices(status_codes,
                                weights=[0.6,0.1,0.05,0.05,0.05,0.05,0.05,0.025,0.025])[0]
        msg = py_rng.choice(messages.get(svc, ["Operation completed","Operation failed"]))
        user = f"user_{py_rng.randint(1,5000)}"
        if seed is None:
            sess = str(uuid.uuid4())
        else:
            # Derive the session id from the seeded generator so it is reproducible too.
            sess = str(uuid.UUID(int=py_rng.getrandbits(128), version=4))

        # Error details are attached for ERROR-level entries or 5xx status codes.
        error_code = None
        error_msg  = None
        if lvl=="ERROR" or status>=500:
            error_code = py_rng.choice(["E1001","E2002","E3003"])
            error_msg  = "Unexpected exception occurred."

        records.append({
            "timestamp":    ts,
            "service":      svc,
            "instance":     py_rng.choice(servers),
            "jms_queue":    queue_for[svc],
            "elapsed_time": round(float(elapsed), 2),
            "log_level":    lvl,
            "status_code":  status,
            "message":      msg,
            "user_id":      user,
            "session_id":   sess,
            "error_code":   error_code,
            "error_message":error_msg
        })

    # Passing `columns` keeps the schema stable even when no rows were generated.
    return pd.DataFrame(records, columns=columns)

# Build a 100-entry sample spanning 2025-08-07 12:00 to 2025-08-08 12:00.
# `df_logs` is the input to the CSV windowing step below.
df_logs = generate_synthetic_logs(
    num_entries=100,
    start_time=datetime.datetime(2025,8,7,12,0,0),
    end_time=datetime.datetime(2025,8,8,12,0,0)
)
print(len(df_logs))
df_logs.head()


100


Unnamed: 0,timestamp,service,instance,jms_queue,elapsed_time,log_level,status_code,message,user_id,session_id,error_code,error_message
0,2025-08-08 09:38:29.049808,Orders,srv1,q_ord,19.82,WARN,200,Order placed,user_3865,6929924d-4fe0-47a1-8cb0-4472dcd401a4,,
1,2025-08-08 02:36:14.980564,Catalog,srv1,q_cat,16.26,WARN,200,Item not found,user_2684,fac22065-ea54-40dd-a6fd-4d8eb5bfe0fa,,
2,2025-08-07 14:15:00.572669,Orders,srv1,q_ord,8.42,INFO,200,Order failed,user_4564,62353faa-0a91-457b-a90b-d71a90080552,,
3,2025-08-08 02:55:06.462731,Auth,srv2,q_auth,0.1,ERROR,200,Invalid credentials,user_4625,50963d1c-38e2-4b3a-9b97-83e430c3e496,E3003,Unexpected exception occurred.
4,2025-08-07 21:53:59.686387,Billing,srv1,q_bill,6.11,INFO,200,Invoice generated,user_1840,72c1320e-b2de-4938-a6d4-edaccdf6ecd3,,


In [None]:
def format_logs_csv_windows(df, window_size=10):
    """Render a log DataFrame as a list of small CSV text blocks.

    Each block contains the header line followed by up to `window_size` rows,
    with timestamps truncated to whole seconds. Missing values (None, NaN, NaT)
    in any non-timestamp field are written as empty cells, and commas inside
    the free-text fields (`message`, `error_message`) are replaced by spaces so
    the simple comma-joined format stays parseable.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the twelve log columns named in the header below.
    window_size : int
        Maximum number of rows per block; must be at least 1.

    Returns
    -------
    list[str]
        One CSV string per window; empty list for an empty frame.

    Raises
    ------
    ValueError
        If `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    header = (
        "timestamp,service,instance,jms_queue,elapsed_time,"
        "log_level,status_code,message,user_id,session_id,"
        "error_code,error_message"
    )

    def _cell(value):
        # `value or ''` would let NaN through (NaN is truthy) and break str.join.
        return '' if pd.isna(value) else str(value)

    def _free_text(value):
        return _cell(value).replace(',', ' ')

    windows = []
    for start in range(0, len(df), window_size):
        batch = df.iloc[start:start+window_size]
        lines = []
        for _, row in batch.iterrows():
            fields = [
                # pd.Timestamp() also accepts datetimes and parseable strings.
                pd.Timestamp(row['timestamp']).strftime("%Y-%m-%d %H:%M:%S"),
                _cell(row['service']),
                _cell(row['instance']),
                _cell(row['jms_queue']),
                _cell(row['elapsed_time']),
                _cell(row['log_level']),
                _cell(row['status_code']),
                _free_text(row['message']),
                _cell(row['user_id']),
                _cell(row['session_id']),
                _cell(row['error_code']),
                _free_text(row['error_message'])
            ]
            lines.append(",".join(fields))
        windows.append(header + "\n" + "\n".join(lines))
    return windows

# Split the sample into CSV blocks of 10 rows each; each block later becomes one LLM prompt.
log_windows_csv = format_logs_csv_windows(df_logs, window_size=10)


In [None]:
# Latency ceilings keyed by (service, instance); the labelling prompt renders them
# with an "s" suffix. Built from a per-service table so each service reads as one row.
thresholds = {
    (service, instance): limit
    for service, per_instance in {
        'Auth':    {'srv1': 15.0, 'srv2': 18.0},
        'Billing': {'srv1': 20.0, 'srv2': 22.0},
        'Catalog': {'srv1': 12.0, 'srv2': 14.0},
        'Orders':  {'srv1': 16.0, 'srv2': 17.5},
    }.items()
    for instance, limit in per_instance.items()
}


In [None]:
import re, json, ast


In [None]:


def _parse_label_response(text, window_index):
    """Turn one model reply into a list of record dicts.

    Strips an optional Markdown code fence, keeps only the outermost ``[...]``
    span (replies are sometimes wrapped in prose), then parses it as JSON with a
    Python-literal fallback for single-quoted output.

    Raises
    ------
    ValueError
        If the reply cannot be parsed or is not a list of objects.
    """
    cleaned = text.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
    cleaned = re.sub(r"\s*```$", "", cleaned)

    first, last = cleaned.find("["), cleaned.rfind("]")
    if first != -1 and last > first:
        cleaned = cleaned[first:last + 1]

    try:
        batch = json.loads(cleaned)
    except json.JSONDecodeError:
        try:
            # literal_eval only evaluates literals, so it is safe on model output.
            batch = ast.literal_eval(cleaned)
        except (ValueError, SyntaxError) as exc:
            raise ValueError(
                f"Could not parse the model response for window {window_index}"
            ) from exc

    # Without this check a dict reply would be `extend`ed as a list of its keys.
    if not isinstance(batch, list) or not all(isinstance(rec, dict) for rec in batch):
        raise ValueError(
            f"Model response for window {window_index} is not a JSON array of objects"
        )
    return batch


def label_logs_with_gemini(windows, thresholds, llm=None):
    """Ask the LLM to flag each log row whose elapsed_time exceeds its threshold.

    Parameters
    ----------
    windows : list[str]
        CSV text blocks (header plus rows), one prompt is sent per block.
    thresholds : dict
        Maps (service, instance) to a latency limit; rendered into the prompt.
    llm : object, optional
        Client exposing ``generate_content(prompt)`` whose result has a ``.text``
        attribute. Defaults to the notebook-level ``model``.

    Returns
    -------
    list[dict]
        The records returned by the model for all windows, in order.

    Raises
    ------
    ValueError
        If a reply cannot be parsed into a list of objects.
    """
    client = model if llm is None else llm

    t_lines = [f"{svc}/{inst}: {thr}s"
               for (svc,inst),thr in thresholds.items()]
    thresh_block = "Thresholds per service/instance:\n" + "\n".join(t_lines)

    labeled = []
    for index, window in enumerate(windows):
        prompt = f"""
You are an anomaly‐detection expert.  Use these thresholds:
{thresh_block}

For each CSV log entry below, if its elapsed_time exceeds the threshold
for its (service,instance), mark it anomalous (anomaly=1), otherwise normal (0).
Return a JSON array of objects, each with all original fields plus "anomaly".

Log entries (CSV with header):
{window}

Example output:
[
  {{"timestamp":"2025-08-08 14:00:00","service":"Auth","instance":"srv1","jms_queue":"q_auth","elapsed_time":17.2,"log_level":"INFO","status_code":200,"message":"User login succeeded","user_id":"user_123","session_id":"...","error_code":"","error_message":"","anomaly":1}},
  ...
]

"""
        resp = client.generate_content(prompt)
        labeled.extend(_parse_label_response(resp.text, index))
    return labeled

# One LLM request per CSV window; the returned records are collected into a single frame.
labeled_records = label_logs_with_gemini(log_windows_csv, thresholds)
df_labeled = pd.DataFrame(labeled_records)


In [None]:
# Row count of the labelled frame, followed by a preview of the first rows.
print(len(df_labeled))
df_labeled.head()


100


Unnamed: 0,timestamp,service,instance,jms_queue,elapsed_time,log_level,status_code,message,user_id,session_id,error_code,error_message,anomaly
0,2025-08-08 09:38:29,Orders,srv1,q_ord,19.82,WARN,200,Order placed,user_3865,6929924d-4fe0-47a1-8cb0-4472dcd401a4,,,1
1,2025-08-08 02:36:14,Catalog,srv1,q_cat,16.26,WARN,200,Item not found,user_2684,fac22065-ea54-40dd-a6fd-4d8eb5bfe0fa,,,1
2,2025-08-07 14:15:00,Orders,srv1,q_ord,8.42,INFO,200,Order failed,user_4564,62353faa-0a91-457b-a90b-d71a90080552,,,0
3,2025-08-08 02:55:06,Auth,srv2,q_auth,0.1,ERROR,200,Invalid credentials,user_4625,50963d1c-38e2-4b3a-9b97-83e430c3e496,E3003,Unexpected exception occurred.,0
4,2025-08-07 21:53:59,Billing,srv1,q_bill,6.11,INFO,200,Invoice generated,user_1840,72c1320e-b2de-4938-a6d4-edaccdf6ecd3,,,0


In [None]:
def calculate_llm_accuracy(df, thresholds):
    """Score the LLM's `anomaly` labels against the threshold rule.

    Ground truth is 1 when a row's `elapsed_time` is strictly greater than the
    threshold for its (service, instance) pair, else 0.

    `elapsed_time` and `anomaly` are coerced to numbers first, because values
    parsed from LLM JSON may arrive as strings. A non-numeric `elapsed_time`
    counts as not exceeding the threshold; a row whose `anomaly` is neither
    0 nor 1 is excluded from every count.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns `service`, `instance`, `elapsed_time`, `anomaly`.
        The frame is not modified.
    thresholds : dict
        Maps (service, instance) to the latency limit.

    Returns
    -------
    dict
        Keys: tp, tn, fp, fn (ints) and accuracy, precision, recall, f1 (floats).
        Accuracy is NaN when no row could be scored; the other ratios fall back to 0.0.

    Raises
    ------
    KeyError
        If a (service, instance) pair in `df` has no entry in `thresholds`.
    """
    keys = list(zip(df['service'], df['instance']))
    unknown = list(dict.fromkeys(k for k in keys if k not in thresholds))
    if unknown:
        raise KeyError(f"No threshold configured for (service, instance) pairs: {unknown}")

    limits  = np.array([thresholds[k] for k in keys], dtype=float)
    elapsed = pd.to_numeric(df['elapsed_time'], errors='coerce').to_numpy(dtype=float)

    # Built without DataFrame.apply(axis=1), which does not return a Series for an empty frame.
    y_true = (elapsed > limits).astype(int)
    y_pred = pd.to_numeric(df['anomaly'], errors='coerce').to_numpy(dtype=float)

    tp = int(((y_true==1)&(y_pred==1)).sum())
    tn = int(((y_true==0)&(y_pred==0)).sum())
    fp = int(((y_true==0)&(y_pred==1)).sum())
    fn = int(((y_true==1)&(y_pred==0)).sum())

    total    = tp+tn+fp+fn
    accuracy = (tp+tn)/total if total>0 else float('nan')
    precision= tp/(tp+fp)    if (tp+fp)>0 else 0.0
    recall   = tp/(tp+fn)    if (tp+fn)>0 else 0.0
    f1       = (2*precision*recall)/(precision+recall) if (precision+recall)>0 else 0.0

    return {'tp':tp,'tn':tn,'fp':fp,'fn':fn,
            'accuracy':accuracy,'precision':precision,
            'recall':recall,'f1':f1}

# Compare the LLM labels with the deterministic threshold rule and show the confusion counts.
metrics = calculate_llm_accuracy(df_labeled, thresholds)
print(metrics)


{'tp': 23, 'tn': 71, 'fp': 0, 'fn': 6, 'accuracy': 0.94, 'precision': 1.0, 'recall': 0.7931034482758621, 'f1': 0.8846153846153846}


In [None]:
# Rows the LLM labelled as anomalous (anomaly == 1), printed as a plain-text table.
anomalous_logs = df_labeled[df_labeled['anomaly'] == 1]
print(
    anomalous_logs[["timestamp", "service", "instance", "jms_queue", "elapsed_time"]]
    .to_string(index=False)
)

          timestamp service instance jms_queue  elapsed_time
2025-08-08 01:11:13    Auth     srv1    q_auth         15.41
2025-08-07 12:04:16  Orders     srv2     q_ord         18.24
2025-08-08 03:31:40 Catalog     srv1     q_cat         12.99
2025-08-08 10:17:22    Auth     srv1    q_auth         23.45
2025-08-08 11:41:44    Auth     srv1    q_auth         21.49
2025-08-08 11:55:50  Orders     srv2     q_ord         19.10
2025-08-08 08:29:50 Catalog     srv2     q_cat         20.35
2025-08-08 03:54:29    Auth     srv1    q_auth         20.57
2025-08-07 23:39:41    Auth     srv1    q_auth         16.32
2025-08-07 14:07:56 Catalog     srv1     q_cat         22.57
2025-08-07 18:28:31 Catalog     srv2     q_cat         22.17
2025-08-07 14:13:06 Catalog     srv1     q_cat         16.29
2025-08-08 08:08:57 Billing     srv1    q_bill         25.60
2025-08-07 15:15:07 Billing     srv1    q_bill         37.72
2025-08-08 11:47:46    Auth     srv2    q_auth         20.60
2025-08-08 01:01:01    A

In [None]:
# Rows the LLM labelled as normal (anomaly == 0). Stored under its own name so the
# `anomalous_logs` frame from the previous cell is not overwritten with normal rows.
normal_logs = df_labeled[df_labeled['anomaly'] == 0]
print(
    normal_logs[["timestamp", "service", "instance", "jms_queue", "elapsed_time"]]
    .to_string(index=False)
)

          timestamp service instance jms_queue  elapsed_time
2025-08-07 16:33:28 Billing     srv1    q_bill          9.29
2025-08-08 04:48:45    Auth     srv1    q_auth         13.23
2025-08-08 04:07:53 Catalog     srv2     q_cat          0.81
2025-08-07 20:12:38 Catalog     srv2     q_cat         10.93
2025-08-08 01:25:30  Orders     srv1     q_ord         16.13
2025-08-08 06:02:53    Auth     srv2    q_auth          0.10
2025-08-07 15:25:01  Orders     srv2     q_ord         12.51
2025-08-08 00:05:22  Orders     srv2     q_ord         12.17
2025-08-08 07:59:25 Catalog     srv2     q_cat         12.48
2025-08-08 05:38:03    Auth     srv1    q_auth          8.42
2025-08-07 18:41:58  Orders     srv1     q_ord         13.17
2025-08-07 18:04:16  Orders     srv2     q_ord          0.10
2025-08-08 07:30:02  Orders     srv1     q_ord         11.32
2025-08-07 12:21:54  Orders     srv2     q_ord          2.67
2025-08-08 10:39:49  Orders     srv1     q_ord          4.72
2025-08-08 11:38:04  Ord

In [None]:
from sklearn.ensemble import IsolationForest
import pandas as pd
import numpy as np

# Train only on rows labelled as normal (anomaly == 0).
# NOTE(review): these labels come from the LLM, and the metrics printed earlier report
# false negatives, so some above-threshold rows may be in this training set; confirm this is intended.
df_train = df_labeled[df_labeled['anomaly'] == 0].copy()

# Model inputs: one numeric column plus three categoricals that are one-hot encoded below.
feature_cols = ['elapsed_time', 'service', 'instance', 'jms_queue']

# `X_train.columns` is reused later to align new data to the same design matrix.
X_train = pd.get_dummies(df_train[feature_cols],
                         columns=['service', 'instance', 'jms_queue'])

# random_state pins the forest so repeated runs on the same data give the same model.
iso_clf = IsolationForest(contamination=0.1, random_state=42)
iso_clf.fit(X_train)


In [None]:
import pandas as pd
import numpy as np
import datetime

# Hand-written probe records for the fitted model. Each tuple lists the values in
# the same order as the field names zipped against it.
test_cases = [
    dict(zip(
        ("timestamp", "service", "instance", "jms_queue", "elapsed_time", "log_level",
         "status_code", "message", "user_id", "session_id", "error_code", "error_message"),
        values,
    ))
    for values in (
        # Normal Auth log, reasonable elapsed_time
        ("2025-08-08 10:00:00", "Auth", "srv1", "q_auth", 12.0, "INFO",
         200, "User login succeeded", "user_101", "test-session-1", None, None),
        # High elapsed_time for Billing (likely anomaly)
        ("2025-08-08 10:05:00", "Billing", "srv1", "q_bill", 35.0, "WARN",
         200, "Payment processed slowly", "user_202", "test-session-2", None, None),
        # Very low elapsed_time for Catalog (possible anomaly)
        ("2025-08-08 10:10:00", "Catalog", "srv2", "q_cat", 0.5, "DEBUG",
         200, "Catalog lookup fast", "user_303", "test-session-3", None, None),
        # Normal Orders log
        ("2025-08-08 10:15:00", "Orders", "srv1", "q_ord", 14.2, "INFO",
         200, "Order placed", "user_404", "test-session-4", None, None),
        # Extreme elapsed_time for Auth (clear anomaly)
        ("2025-08-08 10:20:00", "Auth", "srv2", "q_auth", 60.0, "ERROR",
         500, "Auth server timeout", "user_505", "test-session-5", "E5001", "Timeout error"),
    )
]

# Score the hand-written cases with the fitted IsolationForest.
df_test = pd.DataFrame(test_cases)

feature_cols = ['elapsed_time','service','instance','jms_queue']
X_test = pd.get_dummies(df_test[feature_cols],
                        columns=['service','instance','jms_queue'])

# Align to the training columns: dummies absent here are added as 0, extra ones are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# A prediction of -1 is mapped to ml_anomaly = 1; anything else to 0.
df_test['ml_anomaly'] = np.where(iso_clf.predict(X_test) == -1, 1, 0)

print(df_test[['timestamp','service','instance','elapsed_time','ml_anomaly']])


             timestamp  service instance  elapsed_time  ml_anomaly
0  2025-08-08 10:00:00     Auth     srv1          12.0           0
1  2025-08-08 10:05:00  Billing     srv1          35.0           1
2  2025-08-08 10:10:00  Catalog     srv2           0.5           0
3  2025-08-08 10:15:00   Orders     srv1          14.2           0
4  2025-08-08 10:20:00     Auth     srv2          60.0           1


In [None]:
def detect_and_route_new_app(df_raw: pd.DataFrame):
    """Classify an incoming log frame against the canonical schema and normalize it.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Logs from a new application, possibly using alias column names.

    Returns
    -------
    tuple
        ``(status, df_mapped, diff_report)`` where

        * ``status`` is ``"as_is"`` when every REQUIRED column is already present,
          ``"normalized"`` when some were missing and none of the REQUIRED columns
          is absent or entirely NaN after normalization, otherwise ``"unsupported"``;
        * ``df_mapped`` is the output of ``normalize_new_app_schema(df_raw)``;
        * ``diff_report`` is a dict describing columns before/after, the aliases
          used, and ``unresolved_required``: REQUIRED columns for which neither the
          canonical name nor any alias exists in ``df_raw``.
    """
    before_cols = list(df_raw.columns)
    present = set(before_cols)
    missing_required = [c for c in REQUIRED if c not in present]
    extra_before = [c for c in before_cols if c not in REQUIRED]

    # Alias -> canonical for every canonical column that is absent but has an alias present.
    # When nothing is missing this loop finds nothing, so it is safe to run unconditionally.
    used_aliases = {}
    for canonical, candidates in ALIASES.items():
        if canonical in present:
            continue
        for c in candidates:
            if c in present:
                used_aliases[c] = canonical
                break

    # Required columns with no source at all; normalization fills these with blanks/defaults.
    resolved = set(used_aliases.values())
    unresolved_required = [c for c in missing_required if c not in resolved]

    df_mapped = normalize_new_app_schema(df_raw)  # type-fixes; idempotent

    if not missing_required:
        status = "as_is"
    else:
        # NOTE(review): normalize_new_app_schema turns missing text values into '' and
        # missing status codes into 0, so this NaN test can only trip for `timestamp`
        # and `elapsed_time`. Use `unresolved_required` to see every column without a source.
        still_missing = [c for c in REQUIRED if c not in df_mapped.columns or df_mapped[c].isna().all()]
        status = "normalized" if not still_missing else "unsupported"

    after_cols = list(df_mapped.columns)
    diff_report = {
        "status": status,
        "missing_required_before": missing_required,
        "extra_columns_before": extra_before,
        "used_aliases": used_aliases,
        "unresolved_required": unresolved_required,
        "columns_before": before_cols,
        "columns_after": after_cols
    }
    return status, df_mapped, diff_report


In [None]:
# Canonical column set (and order) that normalized log frames are reduced to.
REQUIRED = ['timestamp','service','instance','jms_queue','elapsed_time',
            'log_level','status_code','message','user_id','session_id',
            'error_code','error_message']

# Canonical name -> accepted source column names, in priority order.
# The canonical name is always listed first, so it wins over any alias.
ALIASES = {
    'service':      ['service','service_name','svc'],
    'instance':     ['instance','host','node'],
    'jms_queue':    ['jms_queue','queue','topic'],
    'elapsed_time': ['elapsed_time','latency','response_time','duration','elapsed','time_s','time_ms'],
    'log_level':    ['log_level','level'],
    'status_code':  ['status_code','status','code'],
    'timestamp':    ['timestamp','time','ts'],
    'message':      ['message','msg','detail'],
    'user_id':      ['user_id','user','uid'],
    'session_id':   ['session_id','session','sid'],
    'error_code':   ['error_code','err_code','ecode'],
    'error_message':['error_message','err_message','errmsg','error','exception']
}

def normalize_new_app_schema(df_raw: pd.DataFrame) -> pd.DataFrame:
    """Map a log frame onto the canonical REQUIRED schema with consistent dtypes.

    Steps: rename the first matching alias of each canonical column, add any
    REQUIRED column that is still absent, coerce dtypes, then keep only the
    REQUIRED columns in canonical order. The input frame is not modified.

    Type handling:

    * ``elapsed_time`` -> float, unparseable values become NaN. A source column
      named ``time_ms`` is divided by 1000, since the rest of the notebook
      treats ``elapsed_time`` as seconds.
    * ``timestamp`` -> datetime, unparseable values become NaT.
    * text columns -> str, with missing values (None, NaN, NaT, pd.NA) and the
      literal strings 'None', 'nan', 'NaT', '<NA>' written as ''.
    * ``status_code`` -> int, unparseable or missing values become 0.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Frame using canonical and/or alias column names.

    Returns
    -------
    pd.DataFrame
        New frame with exactly the REQUIRED columns.
    """
    millisecond_sources = ('time_ms',)
    blank_tokens = {'None', 'nan', 'NaT', '<NA>'}

    df = df_raw.copy()
    cols = set(df.columns)
    rename_map = {}
    for canonical, candidates in ALIASES.items():
        for c in candidates:
            if c in cols:
                rename_map[c] = canonical
                break
    # Remember which raw column feeds elapsed_time so its unit can be corrected below.
    elapsed_source = next((src for src, dst in rename_map.items() if dst == 'elapsed_time'), None)
    df = df.rename(columns=rename_map)

    for col in REQUIRED:
        if col not in df.columns:
            df[col] = None

    df['elapsed_time'] = pd.to_numeric(df['elapsed_time'], errors='coerce').astype(float)
    if elapsed_source in millisecond_sources:
        df['elapsed_time'] = df['elapsed_time'] / 1000.0
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    def _as_text(value):
        # str() of every missing-value marker is one of `blank_tokens`.
        text = str(value)
        return '' if text in blank_tokens else text

    text_cols = ['service','instance','jms_queue','log_level','message','user_id','session_id','error_code','error_message']
    for c in text_cols:
        df[c] = [_as_text(v) for v in df[c]]

    df['status_code'] = pd.to_numeric(df['status_code'], errors='coerce').fillna(0).astype(int)

    df = df[REQUIRED]
    return df


In [None]:
import numpy as np
import pandas as pd

# Column roles passed to check_model_compatibility() in the cells below:
# all model inputs, the subset that is one-hot encoded, and the subset checked for drift (PSI).
feature_cols     = ['elapsed_time','service','instance','jms_queue']
categorical_cols = ['service','instance','jms_queue']
numeric_cols     = ['elapsed_time']

def _infer_training_categories(df_train, categorical_cols):
    """Collect, per categorical column, the set of distinct non-null values (as strings)."""
    seen = {}
    for col in categorical_cols:
        labels = df_train[col].dropna().astype(str)
        seen[col] = set(labels.unique())
    return seen

def _schema_check(df_new, required_cols):
    """Compare a frame's columns with the required ones.

    Returns
    -------
    tuple[list, list]
        ``(missing, extra)``: required columns absent from ``df_new`` (in the
        order of ``required_cols``) and columns of ``df_new`` that are not
        required (in frame order). Ordering is deterministic so reports are
        stable between runs; duplicates are reported once.
    """
    have = list(df_new.columns)
    have_set = set(have)
    req_set = set(required_cols)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    missing = [c for c in dict.fromkeys(required_cols) if c not in have_set]
    extra = [c for c in dict.fromkeys(have) if c not in req_set]
    return missing, extra

def _compute_bins_base(series, nbins=10):
    """Derive bin edges from the quantiles of `series`.

    Uses the unique values of the nbins+1 evenly spaced quantiles. When fewer
    than three distinct edges result, falls back to nbins+1 equally spaced
    edges between the minimum and maximum (widened by 1e-6 if they coincide).
    """
    values = pd.Series(series).dropna().astype(float)
    probabilities = np.linspace(0, 1, nbins+1)
    edges = np.unique(np.quantile(values, probabilities))
    if len(edges) >= 3:
        return edges

    lowest, highest = values.min(), values.max()
    upper = highest if highest > lowest else lowest + 1e-6
    return np.linspace(lowest, upper, nbins+1)

def _psi_from_edges(base, new, edges, eps=1e-6):
    """Population Stability Index of `new` versus `base` over the given bin edges.

    The outermost edges are widened to -inf / +inf so that new values outside
    the range of the baseline fall into the first or last bin. With the finite
    edges they matched no bin and were dropped, which hid out-of-range drift.
    Baseline values already lie within the finite edges, so their bin
    proportions are unaffected.

    Parameters
    ----------
    base, new : pd.Series
        Numeric baseline and comparison samples.
    edges : array-like
        Increasing bin edges, e.g. from `_compute_bins_base`; not modified.
    eps : float
        Smoothing added to each bin proportion so empty bins do not produce log(0).

    Returns
    -------
    float
        Sum over bins of (new - base) * ln(new / base) on the smoothed proportions.
    """
    bins = np.asarray(edges, dtype=float).copy()
    bins[0], bins[-1] = -np.inf, np.inf

    b = pd.cut(base, bins=bins).value_counts(normalize=True, sort=False)
    n = pd.cut(new,  bins=bins).value_counts(normalize=True, sort=False)
    # Additive smoothing, renormalized so each distribution still sums to 1.
    b = (b + eps) / (b.sum() + eps*len(b))
    n = (n + eps) / (n.sum() + eps*len(n))
    return float(((n - b) * np.log(n / b)).sum())

def _psi_bucket_label(psi):
    """Map a PSI value to a drift label: below 0.1 "none", below 0.25 "moderate", else "major"."""
    if psi < 0.1:
        return "none"
    if psi < 0.25:
        return "moderate"
    return "major"

def prepare_features_for_inference(df, feature_cols, categorical_cols, train_columns):
    """One-hot encode the feature columns and align them to the training layout.

    Dummy columns that are absent in `df` are added filled with 0; columns not
    present in `train_columns` are dropped.
    """
    encoded = pd.get_dummies(df[feature_cols], columns=categorical_cols)
    return encoded.reindex(columns=train_columns, fill_value=0)

def check_model_compatibility(df_new,
                              df_train,
                              X_train_columns,
                              feature_cols,
                              categorical_cols,
                              numeric_cols,
                              nbins=10):
    """Check whether new data can be scored by the trained model and build its features.

    Parameters
    ----------
    df_new : pd.DataFrame
        Data to be scored.
    df_train : pd.DataFrame
        Training data; provides the known categories and numeric baselines.
    X_train_columns : list
        Column layout of the training design matrix.
    feature_cols, categorical_cols, numeric_cols : list
        Model inputs, the one-hot encoded subset, and the subset checked with PSI.
    nbins : int
        Number of quantile bins used for PSI.

    Returns
    -------
    tuple
        ``(report, X_new)``. ``report["compatible"]`` is True only when no
        feature column is missing. In that case ``X_new`` is the aligned feature
        matrix; otherwise ``X_new`` is None and the report lists the missing
        columns (previously a missing column raised KeyError before any report
        was produced). Unseen categories and PSI are informational only and are
        computed for the columns that are present.
    """
    missing, extra = _schema_check(df_new, feature_cols)
    train_cats = _infer_training_categories(df_train, categorical_cols)
    unseen = {
        c: sorted(set(df_new[c].dropna().astype(str).unique()) - train_cats[c])
        for c in categorical_cols if c in df_new.columns
    }

    psi = {}
    psi_level = {}
    for c in numeric_cols:
        if c not in df_new.columns:
            # Nothing to compare against; the column is skipped here.
            continue
        base_values = df_train[c].astype(float)
        new_values  = df_new[c].astype(float)
        edges = _compute_bins_base(base_values, nbins)
        v = _psi_from_edges(base_values, new_values, edges)
        psi[c] = v
        psi_level[c] = _psi_bucket_label(v)

    compatible = (len(missing) == 0)
    # Encoding selects df_new[feature_cols], so it is only attempted when all of them exist.
    X_new = (
        prepare_features_for_inference(df_new, feature_cols, categorical_cols, X_train_columns)
        if compatible else None
    )

    report = {
        "compatible": compatible,
        "missing_required_columns": missing,
        "extra_columns_ignored": extra,
        "unseen_categories": unseen,
        "numeric_psi": psi,
        "numeric_psi_level": psi_level,
        "notes": [
            "Unseen categories are encoded as all-zero one-hots; consider retraining if many are present.",
            "PSI ≥ 0.25 suggests major drift; consider retraining or separate model for this app."
        ]
    }
    return report, X_new



In [None]:
import uuid, random, datetime
import pandas as pd, numpy as np

def make_df_new_app_similar(n=60):
    """Build `n` random log rows that already use the canonical column names.

    Some service, instance and queue values do not occur in the training data.
    """
    services  = ["Auth","Billing","Search","Recs"]           # includes some unseen services
    instances = ["srv1","srv2","srvX"]                       # includes unseen instance
    queues    = {"Auth":"q_auth","Billing":"q_bill","Search":"q_search","Recs":"q_recs"}
    levels    = ["INFO","WARN","ERROR","DEBUG"]
    codes     = [200,201,400,401,404,500,502]

    def build_row():
        # Draws happen in a fixed order so a seeded `random` yields the same rows as before.
        svc = random.choice(services)
        inst = random.choice(instances)
        elapsed = max(0.1, random.gauss(12,6))
        minutes_ago = random.randint(0,600)
        stamp = datetime.datetime.now() - datetime.timedelta(minutes=minutes_ago)
        return {
            "timestamp":   stamp.strftime("%Y-%m-%d %H:%M:%S"),
            "service":     svc,
            "instance":    inst,
            "jms_queue":   queues.get(svc,"q_misc"),
            "elapsed_time": round(elapsed,2),
            "log_level":   random.choice(levels),
            "status_code": random.choice(codes),
            "message":     "synthetic",
            "user_id":     f"user_{random.randint(1,9999)}",
            "session_id":  str(uuid.uuid4()),
            "error_code":  None,
            "error_message": None
        }

    return pd.DataFrame([build_row() for _ in range(n)])

# 60 sample rows that already follow the canonical schema.
df_new_sim = make_df_new_app_similar(60)


In [None]:
def make_df_new_app_different(n=60):
    """Build `n` random log rows whose columns use alias names plus two extra fields.

    Only `timestamp` uses its canonical name; `region` and `app_version` have no
    canonical counterpart.
    """
    services=["Gateway","Search"]
    hosts   =["nodeA","nodeB"]
    queues  =["q_gateway","q_search"]
    regions =["us-east","us-west"]

    records=[]
    for _ in range(n):
        svc = random.choice(services)
        mean_latency = 8 if svc=="Search" else 18
        latency = max(0.1, random.gauss(mean_latency,5))
        # Offset is drawn before the remaining fields to keep the random call order unchanged.
        offset = datetime.timedelta(minutes=random.randint(0,600))
        stamp = (datetime.datetime.now()-offset).strftime("%Y-%m-%d %H:%M:%S")
        record = {
            "timestamp":   stamp,
            "service_name": svc,              # different name
            "host":         random.choice(hosts),  # different name
            "queue":        random.choice(queues), # different name
            "latency":      round(latency,2),      # different name
            "level":        random.choice(["INFO","WARN","ERROR"]),
            "status":       random.choice([200,200,200,500,502]),
            "region":       random.choice(regions),
            "app_version":  f"v{random.randint(1,3)}.{random.randint(0,9)}"
        }
        records.append(record)
    return pd.DataFrame(records)

# 60 sample rows that use alias column names and carry two extra columns.
df_new_diff = make_df_new_app_different(60)


In [None]:
# Route both sample apps through schema detection and print status plus diff report.
# NOTE(review): both frames are regenerated here, replacing the ones created in the cells above.
df_new_sim = make_df_new_app_similar(60)
status_sim, df_sim_mapped, report_sim = detect_and_route_new_app(df_new_sim)
print(status_sim)
print(report_sim)

df_new_diff = make_df_new_app_different(60)
status_diff, df_diff_mapped, report_diff = detect_and_route_new_app(df_new_diff)
print(status_diff)
print(report_diff)

as_is
{'status': 'as_is', 'missing_required_before': [], 'extra_columns_before': [], 'used_aliases': {}, 'columns_before': ['timestamp', 'service', 'instance', 'jms_queue', 'elapsed_time', 'log_level', 'status_code', 'message', 'user_id', 'session_id', 'error_code', 'error_message'], 'columns_after': ['timestamp', 'service', 'instance', 'jms_queue', 'elapsed_time', 'log_level', 'status_code', 'message', 'user_id', 'session_id', 'error_code', 'error_message']}
normalized
{'status': 'normalized', 'missing_required_before': ['service', 'instance', 'jms_queue', 'elapsed_time', 'log_level', 'status_code', 'message', 'user_id', 'session_id', 'error_code', 'error_message'], 'extra_columns_before': ['service_name', 'host', 'queue', 'latency', 'level', 'status', 'region', 'app_version'], 'used_aliases': {'service_name': 'service', 'host': 'instance', 'queue': 'jms_queue', 'latency': 'elapsed_time', 'level': 'log_level', 'status': 'status_code'}, 'columns_before': ['timestamp', 'service_name', '

In [None]:
# Use the compatibility checker on the similar-schema dataset
# NOTE(review): `report_sim` is rebound here; the routing report of the same name from the
# previous cell is no longer available afterwards.
report_sim, X_new_sim = check_model_compatibility(
    df_new=df_new_sim,
    df_train=df_train,
    X_train_columns=list(X_train.columns),
    feature_cols=feature_cols,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols,
    nbins=10
)
print("Similar schema report:", report_sim)

# Score only when no feature column is missing; a prediction of -1 is mapped to ml_anomaly = 1.
if report_sim["compatible"]:
    preds = iso_clf.predict(X_new_sim)
    df_new_sim['ml_anomaly'] = np.where(preds==-1, 1, 0)
    print(df_new_sim[['timestamp','service','instance','jms_queue','elapsed_time','ml_anomaly']].head())


Similar schema report: {'compatible': True, 'missing_required_columns': [], 'extra_columns_ignored': ['timestamp', 'session_id', 'status_code', 'error_code', 'error_message', 'user_id', 'log_level', 'message'], 'unseen_categories': {'service': ['Recs', 'Search'], 'instance': ['srvX'], 'jms_queue': ['q_recs', 'q_search']}, 'numeric_psi': {'elapsed_time': 1.9394476451174687}, 'numeric_psi_level': {'elapsed_time': 'major'}, 'notes': ['Unseen categories are encoded as all-zero one-hots; consider retraining if many are present.', 'PSI ≥ 0.25 suggests major drift; consider retraining or separate model for this app.']}
             timestamp  service instance jms_queue  elapsed_time  ml_anomaly
0  2025-08-21 09:42:53  Billing     srv2    q_bill         10.02           0
1  2025-08-21 10:26:53     Recs     srv2    q_recs         18.27           0
2  2025-08-21 12:27:53  Billing     srvX    q_bill         15.55           0
3  2025-08-21 11:47:53     Auth     srv1    q_auth         13.44        

In [None]:
# Normalize columns to the canonical schema your model expects
df_new_diff_mapped = normalize_new_app_schema(df_new_diff)

# Same compatibility check as for the similar-schema app, run on the normalized frame.
report_diff2, X_new_diff = check_model_compatibility(
    df_new=df_new_diff_mapped,
    df_train=df_train,
    X_train_columns=list(X_train.columns),
    feature_cols=feature_cols,
    categorical_cols=categorical_cols,
    numeric_cols=numeric_cols,
    nbins=10
)
print("Different schema report (post-mapping):", report_diff2)

# Score only when no feature column is missing; a prediction of -1 is mapped to ml_anomaly = 1.
if report_diff2["compatible"]:
    preds = iso_clf.predict(X_new_diff)
    df_new_diff_mapped['ml_anomaly'] = np.where(preds==-1, 1, 0)
    print(df_new_diff_mapped[['timestamp','service','instance','jms_queue','elapsed_time','ml_anomaly']].head())
else:
    print("Missing required columns after normalization:", report_diff2['missing_required_columns'])
# NOTE(review): this redefines make_df_new_app_different from an earlier cell. It is
# defined after the code above has already used `df_new_diff`, so it only affects later runs.
def make_df_new_app_different(n=60):
    """Build `n` random log rows in which every column uses an alias or an extra name.

    Compared with the earlier definition, the timestamp is stored under `time`
    and a `msg` column is added.
    """
    services=["Gateway","Search"]
    hosts   =["nodeA","nodeB"]
    queues  =["q_gateway","q_search"]
    regions =["us-east","us-west"]

    def sample_row():
        # Random draws are made in the same sequence as the field order below.
        svc = random.choice(services)
        centre = 8 if svc=="Search" else 18
        latency = max(0.1, random.gauss(centre,5))
        minutes_back = random.randint(0,600)
        moment = datetime.datetime.now()-datetime.timedelta(minutes=minutes_back)
        return {
            "time":       moment.strftime("%Y-%m-%d %H:%M:%S"),
            "service_name": svc,            # alias for service
            "host":         random.choice(hosts),   # alias for instance
            "queue":        random.choice(queues),  # alias for jms_queue
            "latency":      round(latency,2),       # alias for elapsed_time
            "level":        random.choice(["INFO","WARN","ERROR"]),
            "status":       random.choice([200,200,200,500,502]),
            "region":       random.choice(regions),
            "app_version":  f"v{random.randint(1,3)}.{random.randint(0,9)}",
            "msg":          "alt schema sample"     # alias for message
        }

    return pd.DataFrame([sample_row() for _ in range(n)])

# Regenerate the sample with the redefined generator (adds the `time` and `msg` alias columns).
df_new_diff = make_df_new_app_different(60)


Different schema report (post-mapping): {'compatible': True, 'missing_required_columns': [], 'extra_columns_ignored': ['timestamp', 'session_id', 'status_code', 'error_code', 'error_message', 'user_id', 'log_level', 'message'], 'unseen_categories': {'service': ['Gateway', 'Search'], 'instance': ['nodeA', 'nodeB'], 'jms_queue': ['q_gateway', 'q_search']}, 'numeric_psi': {'elapsed_time': 0.7563238345366388}, 'numeric_psi_level': {'elapsed_time': 'major'}, 'notes': ['Unseen categories are encoded as all-zero one-hots; consider retraining if many are present.', 'PSI ≥ 0.25 suggests major drift; consider retraining or separate model for this app.']}
            timestamp  service instance  jms_queue  elapsed_time  ml_anomaly
0 2025-08-21 13:09:55   Search    nodeB   q_search         10.68           0
1 2025-08-21 17:43:55   Search    nodeB  q_gateway         12.19           0
2 2025-08-21 12:03:55  Gateway    nodeA  q_gateway         16.93           0
3 2025-08-21 15:01:55  Gateway    nodeB