# Explotory Data Analysis

## Download Data

from pathlib import Path
import requests

url = "https://raw.githubusercontent.com/hkerma/kubernetes-event-dataset/main/audit-logs.csv"
out_path = Path("../data/raw/kubernetes-event-dataset/audit-logs.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open(out_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024): # 1MB chunks
            if chunk:
                f.write(chunk)
out_path, out_path.stat().st_size

In [10]:
from pathlib import Path
import requests

url = "https://github.com/hkerma/kubernetes-event-dataset/blob/main/events-dataset.txt"
out_path = Path("../data/raw/kubernetes-event-dataset/events-dataset.txt")
out_path.parent.mkdir(parents=True, exist_ok=True)

with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open(out_path, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)

out_path, out_path.stat().st_size

(PosixPath('../data/raw/kubernetes-event-dataset/events-dataset.txt'), 755860)

## Explore Data

In [14]:
from pathlib import Path
import pandas as pd

DATA_DIR = Path("../data/raw/kubernetes-event-dataset")  # adjust if notebook path differs

audit_csv = DATA_DIR / "audit-logs.csv"
events_txt = DATA_DIR / "events-dataset.txt"

# 1) Load audits (this CSV has NO header row)
cols = [
    "cluster_json", "ts", "user", "verb", "resource",
    "namespace", "name", "extra_1",
    "request_uri", "request_uid", "stage", "status_code",
    "extra_2", "extra_3", "extra_4",
]
df = pd.read_csv(audit_csv, header=None, names=cols)

display(df.head())
print("Rows:", len(df))
print("Columns:", list(df.columns))

# 2) Missingness quick check
missing = (df.isna().mean().sort_values(ascending=False) * 100).round(2)
print(missing.head(20))

# 3) Parse timestamps + status code
df["ts"] = pd.to_datetime(df["ts"], errors="coerce", utc=True)
df["status_code"] = pd.to_numeric(df["status_code"], errors="coerce")

print("Min ts:", df["ts"].min(), "Max ts:", df["ts"].max())
print(df[["verb", "resource", "stage", "status_code"]].value_counts().head(10))


Unnamed: 0,cluster_json,ts,user,verb,resource,namespace,name,extra_1,request_uri,request_uid,stage,status_code,extra_2,extra_3,extra_4
0,"{""name"":""k8s-master-perfspec""}",2023-01-27T18:28:08.129781Z,system:node:k8s-node-1-perfspec,watch,configmaps,,kube-flannel-cfg,,/api/v1/namespaces/kube-flannel/configmaps?all...,a1b86d5d-6f8f-4058-9710-19f5cc40a68a,ResponseStarted,403.0,,,
1,"{""name"":""k8s-master-perfspec""}",2023-01-27T18:28:08.131024Z,system:node:k8s-node-1-perfspec,watch,configmaps,,kube-root-ca.crt,,/api/v1/namespaces/kube-system/configmaps?allo...,b51f1a8c-e53e-4104-b1a6-e695a24ed43c,ResponseStarted,403.0,,,
2,"{""name"":""k8s-master-perfspec""}",2023-01-27T18:28:08.132303Z,system:node:k8s-node-1-perfspec,watch,services,,,,/api/v1/services?allowWatchBookmarks=true&reso...,56c0cbe5-89f6-4820-b8fb-473ca0bff697,ResponseStarted,200.0,,,
3,"{""name"":""k8s-master-perfspec""}",2023-01-27T18:28:08.139829Z,system:kube-scheduler,list,csidrivers,,,,/apis/storage.k8s.io/v1/csidrivers?limit=500&r...,33f5dcf5-cd59-47ce-93e8-b8d006824591,ResponseComplete,403.0,,,
4,"{""name"":""k8s-master-perfspec""}",2023-01-27T18:28:08.142306Z,system:kube-scheduler,list,nodes,,,,/api/v1/nodes?limit=500&resourceVersion=0,8e6e8f52-3224-4af1-bb0f-ae0aa788ac40,ResponseComplete,403.0,,,


Rows: 16564
Columns: ['cluster_json', 'ts', 'user', 'verb', 'resource', 'namespace', 'name', 'extra_1', 'request_uri', 'request_uid', 'stage', 'status_code', 'extra_2', 'extra_3', 'extra_4']
extra_3         100.00
extra_4         100.00
extra_1         100.00
extra_2         100.00
namespace        96.73
name             70.56
resource         62.16
stage             0.01
status_code       0.01
ts                0.00
user              0.00
verb              0.00
cluster_json      0.00
request_uid       0.00
request_uri       0.00
dtype: float64
Min ts: 2023-01-27 18:28:08.127428+00:00 Max ts: 2023-01-27 18:32:12.474142+00:00
verb    resource           stage             status_code
get     secrets            ResponseComplete  200.0          429
update  secrets            ResponseComplete  200.0          357
        leases             ResponseComplete  200.0          321
get     leases             ResponseComplete  200.0          271
create  events             ResponseComplete  201.0    

In [15]:
df["status_code"].value_counts().loc[[500, 503]] if set([500, 503]).issubset(df["status_code"].value_counts().index) else df["status_code"].value_counts().head(20)


status_code
200.0    14874
201.0      804
404.0      747
403.0       71
409.0       66
503.0        1
Name: count, dtype: int64

In [16]:
import re
import pandas as pd

# --- helper columns ---
df = df.copy()

df["is_sa"] = df["user"].astype(str).str.startswith("system:serviceaccount:")

# Extract SA namespace + name from the user string: system:serviceaccount:<ns>:<sa>
m = df["user"].astype(str).str.extract(r"^system:serviceaccount:([^:]+):(.+)$")
df["sa_namespace"] = m[0]
df["sa_name"] = m[1]

RBAC_RESOURCES = {"rolebindings", "clusterrolebindings", "roles", "clusterroles"}
RBAC_CHANGE_VERBS = {"create", "update", "patch", "delete", "deletecollection"}
FAIL_VERBS = {"get", "list", "watch"}

df["is_rbac_obj"] = df["resource"].isin(RBAC_RESOURCES)
df["is_rbac_change"] = df["is_rbac_obj"] & df["verb"].isin(RBAC_CHANGE_VERBS)

# Failing ServiceAccount access (exclude RBAC objects themselves)
fail = df[
    (df["status_code"] == 403) &
    (df["is_sa"]) &
    (df["verb"].isin(FAIL_VERBS)) &
    (~df["is_rbac_obj"])
].copy()

# Candidate RBAC changes (the “cause” pool)
rbac = df[df["is_rbac_change"]].copy()

# Sort for merge_asof (required)
fail = fail.sort_values("ts")
rbac = rbac.sort_values("ts")

# Optional: scope causes by same namespace when RBAC object is namespaced
# In your dataset you have df["namespace"] as a column. We'll use it to increase precision.
# For cluster-scoped RBAC objects (clusterrole/clusterrolebinding), namespace is often null/empty.

# Prepare columns to carry over from RBAC change rows
rbac_cols = [
    "ts", "verb", "resource", "namespace", "name", "request_uid", "request_uri", "user"
]
rbac = rbac[rbac_cols].rename(columns={
    "ts": "rbac_ts",
    "verb": "rbac_verb",
    "resource": "rbac_resource",
    "namespace": "rbac_namespace",
    "name": "rbac_name",
    "request_uid": "rbac_request_uid",
    "request_uri": "rbac_request_uri",
    "user": "rbac_actor",
})

# merge_asof: attach the nearest prior RBAC change within a time tolerance (e.g., 30 minutes)
# merge_asof matches on nearest key <= left key when direction="backward". [web:37]
paired = pd.merge_asof(
    fail,
    rbac,
    left_on="ts",
    right_on="rbac_ts",
    direction="backward",
    tolerance=pd.Timedelta("30min"),
)

# Keep only failures that found a prior RBAC change
paired = paired[paired["rbac_ts"].notna()].copy()

# (Optional) tighten: require namespace match when the failing SA is namespaced and the RBAC change is namespaced
paired = paired[
    (paired["rbac_namespace"].isna()) | (paired["rbac_namespace"] == paired["sa_namespace"])
].copy()

print("Candidate Scenario B pairs:", len(paired))
paired[[
    "ts","user","verb","resource","namespace","name","request_uid","request_uri",
    "rbac_ts","rbac_actor","rbac_verb","rbac_resource","rbac_namespace","rbac_name","rbac_request_uid","rbac_request_uri"
]].head()


Candidate Scenario B pairs: 0


Unnamed: 0,ts,user,verb,resource,namespace,name,request_uid,request_uri,rbac_ts,rbac_actor,rbac_verb,rbac_resource,rbac_namespace,rbac_name,rbac_request_uid,rbac_request_uri


In [13]:
# 4) Load event sequences (each line = sequence of event names)
lines = events_txt.read_text(encoding="utf-8").splitlines()
print("Num sequences:", len(lines))

seqs = [ln.split() for ln in lines if ln.strip()]
lengths = pd.Series([len(s) for s in seqs], name="seq_len")
print(lengths.describe(percentiles=[0.5, 0.9, 0.95, 0.99]))

# 5) Event frequency
from collections import Counter
c = Counter(e for s in seqs for e in s)
freq = pd.DataFrame(c.most_common(50), columns=["event", "count"])
display(freq)

Num sequences: 1451
count    1076.000000
mean       11.182156
std        97.951304
min         1.000000
50%         2.000000
90%        15.500000
95%        45.000000
99%        91.000000
max      3174.000000
Name: seq_len, dtype: float64


Unnamed: 0,event,count
0,0,2548
1,1,659
2,"data-view-component=""true""",168
3,1.75,135
4,"aria-hidden=""true""",111
5,"viewBox=""0",108
6,"class=""octicon",108
7,16,105
8,"crossorigin=""anonymous""",100
9,<div,93
