In [11]:
# =========================
# CELL 1 — Config & Paths (clean + judge-friendly)
# - Tách rõ: DATA/MODEL vs AUTOSCALING/SIM
# - Window-aware + Metric-aware (buffer/capacity)
# - Không phá các helper đã dùng ở cell sau
# =========================

import os, re, json, math
from datetime import datetime, timezone
from typing import Dict, Any, List, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's legacy global RNG so any later np.random.* call is repeatable across runs.
np.random.seed(42)

# -------------------------
# Paths
# -------------------------
from pathlib import Path
import os, shutil

# The project root is always the notebooks directory (the notebook lives in notebooks/),
# so it is taken from the kernel's current working directory.
PROJECT_ROOT = Path.cwd()   # => .../AUTOSCALING-ANALYSIS/notebooks

# (Optional) If the data lives in ../data, mirror it into notebooks/data so that
# everything the notebook touches sits under the notebooks directory.
src_data = PROJECT_ROOT.joinpath("..", "data").resolve()
dst_data = PROJECT_ROOT.joinpath("data").resolve()

# Copy only when the local copy has no raw/ folder yet and the sibling one does.
_local_raw_missing = not (dst_data / "raw").exists()
if _local_raw_missing and (src_data / "raw").exists():
    dst_data.mkdir(parents=True, exist_ok=True)
    shutil.copytree(src_data, dst_data, dirs_exist_ok=True)
    print(f"✅ Copied data from {src_data} -> {dst_data}")

# The rest of the notebook builds paths with os.path, so keep the root as a plain string.
PROJECT_ROOT = str(PROJECT_ROOT)
_outputs_root = os.path.join(PROJECT_ROOT, "outputs")
OUT_02 = os.path.join(_outputs_root, "02_eda")
OUT_03 = os.path.join(_outputs_root, "03_features")
OUT_04 = os.path.join(_outputs_root, "04_models")
OUT_04P = os.path.join(OUT_04, "predictions")
OUT_05 = os.path.join(_outputs_root, "05_scaling")

# Make sure every output directory exists before any cell writes into it.
for _out_dir in (OUT_02, OUT_03, OUT_04, OUT_04P, OUT_05):
    os.makedirs(_out_dir, exist_ok=True)

# -------------------------
# Core helpers (keep as-is for other cells)
# -------------------------
def tag_minutes(tag: str) -> int:
    """Return the bucket width, in minutes, for a window tag.

    Only the tags "1m", "5m" and "15m" are known; any other value raises
    ``KeyError``.
    """
    minutes_by_tag = {"1m": 1, "5m": 5, "15m": 15}
    return minutes_by_tag[tag]

def steps_per_day(tag: str) -> int:
    """Return how many buckets of width ``tag`` fit into one 24-hour day."""
    minutes_in_day = 24 * 60
    buckets = minutes_in_day / tag_minutes(tag)
    return int(buckets)

def steps_per_hour(tag: str) -> int:
    """Return how many buckets of width ``tag`` fit into one hour."""
    minutes_in_hour = 60
    buckets = minutes_in_hour / tag_minutes(tag)
    return int(buckets)

def resolve_roll_windows(tag: str, roll_windows: List[str]) -> Dict[str, int]:
    """Translate rolling-window labels into a number of buckets for ``tag``.

    Parameters
    ----------
    tag : window tag understood by ``steps_per_hour`` / ``steps_per_day``.
    roll_windows : labels to resolve; only "1h", "6h" and "1d" are supported.

    Returns
    -------
    Dict mapping each requested label to its length in buckets, in the order
    the labels were given.

    Raises
    ------
    ValueError if a label is not one of the supported windows.
    """
    per_hour = steps_per_hour(tag)
    per_day = steps_per_day(tag)
    steps_by_window = {
        "1h": 1 * per_hour,
        "6h": 6 * per_hour,
        "1d": 1 * per_day,
    }

    resolved = {}
    for window in roll_windows:
        if window not in steps_by_window:
            raise ValueError(f"Unsupported roll window: {window}")
        resolved[window] = steps_by_window[window]
    return resolved

# -------------------------
# CFG (one source of truth)
# -------------------------
# Single configuration dictionary for the notebook: dataset locations/columns,
# feature-engineering knobs, model hyper-parameters, and (under "SCALING") every
# parameter of the autoscaling simulation.
CFG: Dict[str, Any] = {
    # ===== Dataset =====
    "RAW_LOG_PATH": os.path.join(PROJECT_ROOT, "data", "access_log.txt"),  # optional
    "TAGS": ["1m", "5m", "15m"],
    "TIME_COL_RAW": "timestamp",
    "TIME_COL_BUCKET": "bucket_start",

    # Storm gap (problem statement)
    # NOTE(review): these timestamps are tz-naive, while the bucketing cell builds its own
    # storm bounds with an explicit -0400 offset — confirm no consumer compares these
    # against tz-aware bucket timestamps.
    "STORM_START": pd.Timestamp("1995-08-01 14:52:01"),
    "STORM_END":   pd.Timestamp("1995-08-03 04:36:13"),

    # ===== Feature engineering =====
    "LAG_DAYS": [1,2,3,4,5,6,7],
    "ROLL_WINDOWS": ["1h","6h","1d"],
    "ROLL_USE_STD": True,
    "USE_CYCLIC": True,
    "HORIZON_STEPS": 1,
    "KEEP_RAW_EXTRA": [
        "unique_hosts","err_4xx","err_5xx","error_rate",
        "is_missing_bucket","is_gap_storm","is_gap_unknown"
    ],
    "REQUIRE_COLS": ["bucket_start","hits","bytes_sum","is_gap"],

    # ===== Modeling =====
    "TARGETS": ["hits", "bytes_sum"],
    "XGB_PARAMS": dict(
        booster="gbtree",
        n_estimators=5000,
        early_stopping_rounds=50,
        objective="reg:squarederror",
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=42,
    ),
    "CV_SPLITS": 5,
    "CV_TEST_DAYS": 2,
    "CV_GAP_STEPS": 1,

    # ==========================================================
    # AUTOSCALING / SIMULATION CONFIG (Window-aware + Metric-aware)
    # ==========================================================
    "SCALING": {
        # bounds
        "min_instances": 2,
        "max_instances": 50,

        # unit cost
        "cost_per_instance_per_hour": 0.05,

        # window -> minutes
        "window_minutes": {"1m": 1, "5m": 5, "15m": 15},

        # --- Metric-aware safety buffer (keeps bytes_sum from being under-provisioned)
        # hits is usually fine with a moderate buffer; bytes_sum tends to burst => wants a higher buffer
        # NOTE(review): both metrics are currently set to the same value (0.3) — confirm this is intended.
        "safety_buffer_by_metric": {"hits": 0.3, "bytes_sum": 0.3},

        # --- Per-instance capacity (tune so that required_instances shows visible variation)
        # NOTE: to demo the predictive policy actually reacting, lower the bytes_sum capacity
        "capacity_per_instance": {
            ("hits","1m"): 20, ("hits","5m"): 100, ("hits","15m"): 350,
            ("bytes_sum","1m"): 350_000, ("bytes_sum","5m"): 1_200_000, ("bytes_sum","15m"): 3_500_000,
        },

        # --- Step change per window (15m should not jump too much, for a cleaner plot)
        "max_step_change_by_window": {"1m": 6, "5m": 10, "15m": 15},

        # --- Hysteresis per window (1m is noisy => larger high/low)
        # high: number of consecutive windows above the threshold before scaling out
        # low : number of consecutive windows below the threshold before scaling in
        "hysteresis_by_window": {
            "1m": {"high": 2, "low": 6, "in_margin": 0.18},
            "5m": {"high": 1, "low": 4, "in_margin": 0.15},
            "15m":{"high": 1, "low": 2, "in_margin": 0.12},
        },

        "predictive_deadband_by_window": {"1m": 0.5, "5m": 0.5, "15m": 0.5},

        # --- cooldown (expressed in minutes, converted in code)
        "cooldown_minutes": {"base": 8, "spike": 15},

        # --- provisioning per window
        "provisioning_by_window": {
            "1m": {"warmup_windows": 1, "min_uptime_windows": 6},
            "5m": {"warmup_windows": 1, "min_uptime_windows": 4},
            "15m":{"warmup_windows": 0, "min_uptime_windows": 2},
        },

        # --- Reactive (rescue) knobs
        "reactive": {
            "enabled": True,
            "overload_scale_out_immediate": True,
            "rescue_extra_instances": 3,
            "queue_low_fraction": 0.05,
            "queue_high_multiplier": 4.0,  # higher to reduce false rescues => cleaner demo
        },

        # --- SLO / latency model (simplified)
        "slo": {
            "base_latency_ms": 80.0,
            "alpha_latency_per_unit_queue": 0.15,
            "p95_latency_target_ms": 300.0,
        },

        # --- Anomaly detection (MAD) over a lookback given in hours (converted in code)
        "anomaly": {
            "enabled": True,
            "method": "mad",
            "lookback_hours": 2,
            "mad_k": 6.0,
            "min_points": 10,
            "max_flag_rate": 0.30,
        },

        # --- DDoS mode (force step per window)
        "ddos_mode": {
            "enabled": True,
            "force_scale_out_step_by_window": {"1m": 6, "5m": 10, "15m": 12},
            "max_instances_during_ddos": 50,
        },
    }
}

print("✅ Cell 1 done — paths ready + CFG ready (CFG['SCALING'] exists)")


✅ Cell 1 done — paths ready + CFG ready (CFG['SCALING'] exists)


In [12]:
# CELL 2 — Streaming parse raw logs -> raw_train, raw_test
# Input locations and chunk size can be overridden through environment variables;
# the defaults are relative to the kernel's working directory.
TRAIN_LOG_PATH = os.environ.get("TRAIN_LOG_PATH", "data/raw/train.txt")
TEST_LOG_PATH  = os.environ.get("TEST_LOG_PATH",  "data/raw/test.txt")
# Number of parsed lines buffered before they are turned into a DataFrame chunk.
CHUNK_SIZE_LINES = int(os.environ.get("CHUNK_SIZE_LINES", "300000"))

# strptime format of the bracketed timestamp, e.g. "01/Jul/1995:00:00:01 -0400".
DT_FORMAT = "%d/%b/%Y:%H:%M:%S %z"

# One access-log line: host, two ignored fields, [timestamp], "request",
# a 3-digit status and a bytes token (digits or a placeholder such as "-").
# The request may not contain a double quote, so lines with embedded quotes do not match.
LOG_RE = re.compile(
    r'^(?P<host>\S+)\s+\S+\s+\S+\s+\[(?P<ts>[^\]]+)\]\s+'
    r'"(?P<request>[^"]*)"\s+(?P<status>\d{3})\s+(?P<bytes>\S+)\s*$'
)
# Request line split into METHOD, URL and HTTP/x.y; all three parts are required.
REQ_RE = re.compile(r'^(?P<method>[A-Z]+)\s+(?P<url>\S+)\s+(?P<version>HTTP/\d\.\d)$')

def _parse_line(line: str):
    """Parse one access-log line into a flat record.

    Returns ``None`` when the line does not match ``LOG_RE``. Otherwise returns
    the tuple ``(datetime, host, method, url, version, status, bytes,
    bytes_missing_flag)`` where:

    * ``datetime`` is parsed with ``DT_FORMAT`` (``pd.NaT`` if that fails);
    * ``method``/``url``/``version`` are all ``"UNKNOWN"`` when the request
      does not match ``REQ_RE``;
    * ``bytes`` is ``pd.NA`` with ``bytes_missing_flag == 1`` when the bytes
      token is ``"-"``, empty, or not an integer; otherwise the flag is 0.
    """
    matched = LOG_RE.match(line)
    if matched is None:
        return None

    fields = matched.groupdict()

    # Timestamp: an unparseable value is kept as NaT instead of dropping the line.
    try:
        when = datetime.strptime(fields["ts"], DT_FORMAT)
    except Exception:
        when = pd.NaT

    # Request line: fall back to placeholders when it is not "METHOD URL HTTP/x.y".
    request = REQ_RE.match(fields["request"].strip())
    if request:
        method, url, version = request.group("method", "url", "version")
    else:
        method = url = version = "UNKNOWN"

    try:
        status = int(fields["status"])
    except Exception:
        status = pd.NA

    # Byte count: "-" (or anything non-numeric) is recorded as missing.
    size_token = fields["bytes"]
    if size_token in ("-", ""):
        n_bytes, is_missing = pd.NA, 1
    else:
        try:
            n_bytes, is_missing = int(size_token), 0
        except Exception:
            n_bytes, is_missing = pd.NA, 1

    return (when, fields["host"], method, url, version, status, n_bytes, is_missing)

def _normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the numeric columns of a parsed chunk to nullable integer dtypes.

    ``status`` becomes ``Int16``, ``bytes`` becomes ``Int64`` and
    ``bytes_missing_flag`` becomes ``Int8``; values that cannot be read as
    numbers turn into missing values. The frame is modified in place and the
    same object is returned.
    """
    nullable_dtypes = {"status": "Int16", "bytes": "Int64", "bytes_missing_flag": "Int8"}
    for column, dtype in nullable_dtypes.items():
        df[column] = pd.to_numeric(df[column], errors="coerce").astype(dtype)
    return df

def parse_file_streaming(path: str, chunk_lines: int = CHUNK_SIZE_LINES, encoding: str = "latin-1") -> pd.DataFrame:
    """Parse an access-log file line by line into a single DataFrame.

    Lines are parsed with ``_parse_line``; lines it rejects (returns ``None``)
    are skipped. Parsed records are buffered and converted to a normalized
    DataFrame every ``chunk_lines`` records, and the chunks are concatenated at
    the end.

    Parameters
    ----------
    path : path of the log file to read.
    chunk_lines : number of parsed records per intermediate DataFrame.
    encoding : text encoding used to decode the file. It is passed explicitly
        because ``open()`` otherwise falls back to the platform locale, which
        makes the parsed host/url strings differ between machines. The default
        ``"latin-1"`` maps every byte to a character, so decoding never fails
        and never alters ASCII content.

    Returns
    -------
    DataFrame with the columns ``datetime, host, method, url, version, status,
    bytes, bytes_missing_flag``; an empty frame with those columns when no
    line could be parsed.
    """
    columns = [
        "datetime","host","method","url","version","status","bytes","bytes_missing_flag"
    ]

    def _to_frame(rows) -> pd.DataFrame:
        # Build one chunk and cast its numeric columns to nullable dtypes.
        return _normalize_df(pd.DataFrame(rows, columns=columns))

    parts = []
    buf = []

    # errors="replace" is kept so that a caller-supplied strict encoding
    # (e.g. "utf-8") still degrades gracefully on stray bytes.
    with open(path, "r", encoding=encoding, errors="replace") as f:
        for line in f:
            ev = _parse_line(line.rstrip("\n"))
            if ev is None:
                continue
            buf.append(ev)

            # Flush a full buffer to bound the size of the Python-object list.
            if len(buf) >= chunk_lines:
                parts.append(_to_frame(buf))
                buf = []

    # Flush whatever is left after the last full chunk.
    if buf:
        parts.append(_to_frame(buf))

    if not parts:
        return pd.DataFrame(columns=columns)
    return pd.concat(parts, ignore_index=True)

# Parse both splits up front; later cells read raw_train / raw_test.
raw_train = parse_file_streaming(TRAIN_LOG_PATH)
raw_test  = parse_file_streaming(TEST_LOG_PATH)

print("raw_train:", raw_train.shape, "| raw_test:", raw_test.shape)

# Preview the first and last rows of each split. Rich display is used when
# IPython is available; otherwise fall back to plain-text heads only.
try:
    from IPython.display import display
    for _edge in ("head", "tail"):
        for _frame in (raw_train, raw_test):
            display(getattr(_frame, _edge)(3))
except Exception:
    for _frame in (raw_train, raw_test):
        print(_frame.head(3).to_string(index=False))


raw_train: (2934932, 8) | raw_test: (526648, 8)


Unnamed: 0,datetime,host,method,url,version,status,bytes,bytes_missing_flag
0,1995-07-01 00:00:01-04:00,199.72.81.55,GET,/history/apollo/,HTTP/1.0,200,6245,0
1,1995-07-01 00:00:06-04:00,unicomp6.unicomp.net,GET,/shuttle/countdown/,HTTP/1.0,200,3985,0
2,1995-07-01 00:00:09-04:00,199.120.110.21,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085,0


Unnamed: 0,datetime,host,method,url,version,status,bytes,bytes_missing_flag
0,1995-08-23 00:00:00-04:00,ix-mia1-02.ix.netcom.com,GET,/ksc.html,HTTP/1.0,200,7087,0
1,1995-08-23 00:00:05-04:00,internet-gw.watson.ibm.com,GET,/history/apollo/pad-abort-test-2/pad-abort-tes...,HTTP/1.0,200,1292,0
2,1995-08-23 00:00:06-04:00,ix-mia1-02.ix.netcom.com,GET,/images/ksclogo-medium.gif,HTTP/1.0,200,5866,0


Unnamed: 0,datetime,host,method,url,version,status,bytes,bytes_missing_flag
2934929,1995-08-22 23:59:57-04:00,sfsp129.slip.net,GET,/images/MOSAIC-logosmall.gif,HTTP/1.0,200,363,0
2934930,1995-08-22 23:59:58-04:00,sfsp129.slip.net,GET,/images/USA-logosmall.gif,HTTP/1.0,200,234,0
2934931,1995-08-22 23:59:59-04:00,sfsp129.slip.net,GET,/images/WORLD-logosmall.gif,HTTP/1.0,200,669,0


Unnamed: 0,datetime,host,method,url,version,status,bytes,bytes_missing_flag
526645,1995-08-31 23:59:52-04:00,cys-cap-9.wyoming.com,GET,/shuttle/missions/sts-71/movies/sts-71-launch-...,HTTP/1.0,200,57344,0
526646,1995-08-31 23:59:52-04:00,www-c8.proxy.aol.com,GET,/icons/unknown.xbm,HTTP/1.0,200,515,0
526647,1995-08-31 23:59:53-04:00,cindy.yamato.ibm.co.jp,GET,/images/kscmap-small.gif,HTTP/1.0,200,39017,0


In [13]:
# CELL 03 — TS3 ONLY + SAVE to data/train and data/test
import os, numpy as np, pandas as pd

# This cell consumes the frames produced by the parsing cell.
assert all(name in globals() for name in ("raw_train", "raw_test")), "Run CELL 02 first."

# Output locations (each overridable through an environment variable).
OUT_DIR = os.environ.get("OUT_DIR_03", "outputs/02_eda")
SAVE_TRAIN_DIR = os.environ.get("SAVE_TRAIN_DIR", "data/train")
SAVE_TEST_DIR  = os.environ.get("SAVE_TEST_DIR",  "data/test")
for _target_dir in (OUT_DIR, SAVE_TRAIN_DIR, SAVE_TEST_DIR):
    os.makedirs(_target_dir, exist_ok=True)

# Window tag -> pandas frequency alias used for bucketing.
FREQS = {
    "1m": "1min",
    "5m": "5min",
    "15m": "15min",
}

# Known outage window (tz-aware, -04:00), used to flag storm-gap buckets.
GAP_STORM_START = pd.Timestamp("1995-08-01 14:52:01-0400")
GAP_STORM_END   = pd.Timestamp("1995-08-03 04:36:13-0400")

# A run of missing buckets at least this many hours long is flagged as an "unknown" gap.
UNKNOWN_GAP_MIN_HOURS = int(os.getenv("UNKNOWN_GAP_MIN_HOURS", "12"))

# Metric columns that are zero-filled outside gaps and blanked (NaN) inside gaps.
FILL_COLS = [
    "hits",
    "bytes_sum",
    "avg_bytes_per_req",
    "err_4xx",
    "err_5xx",
    "error_rate",
    "unique_hosts",
]

def agg(raw, freq):
    """Aggregate parsed request rows into fixed-width time buckets.

    Parameters
    ----------
    raw : DataFrame with at least ``datetime``, ``host``, ``status`` and
        ``bytes`` columns (one row per request).
    freq : pandas frequency string used to floor ``datetime`` into buckets.

    Returns
    -------
    DataFrame sorted by ``bucket_start`` with one row per bucket that contains
    at least one request, and the columns ``bucket_start, hits, bytes_sum,
    unique_hosts, err_4xx, err_5xx, avg_bytes_per_req, error_rate``.
    ``bytes_sum`` is NaN for a bucket whose byte values are all missing.
    """
    events = raw[["datetime","host","status","bytes"]].copy()
    events["datetime"] = pd.to_datetime(events["datetime"], utc=False)

    bucket = events["datetime"].dt.floor(freq)
    status_num = pd.to_numeric(events["status"], errors="coerce")
    bytes_num = pd.to_numeric(events["bytes"], errors="coerce")

    grouped = events.assign(bucket_start=bucket, bytes_num=bytes_num).groupby("bucket_start", sort=True)
    hits = grouped.size()
    buckets = hits.index

    def _count_status(lo, hi):
        # Requests per bucket whose status lies in [lo, hi]; 0 where a bucket has none.
        in_range = status_num.between(lo, hi)
        return in_range.groupby(bucket).sum().astype("int64").reindex(buckets, fill_value=0).values

    ts2 = pd.DataFrame({
        "bucket_start": buckets,
        "hits": hits.astype("int64").values,
        # min_count=1 keeps an all-missing bucket as NaN rather than 0.
        "bytes_sum": grouped["bytes_num"].sum(min_count=1).astype("float64").reindex(buckets).values,
        "unique_hosts": grouped["host"].nunique().astype("int64").reindex(buckets).values,
        "err_4xx": _count_status(400, 499),
        "err_5xx": _count_status(500, 599),
    })
    ts2 = ts2.sort_values("bucket_start").reset_index(drop=True)

    has_hits = ts2["hits"] > 0
    errors_total = ts2["err_4xx"] + ts2["err_5xx"]
    ts2["avg_bytes_per_req"] = np.where(has_hits, ts2["bytes_sum"]/ts2["hits"], 0.0)
    ts2["error_rate"] = np.where(has_hits, errors_total/ts2["hits"], 0.0)
    return ts2

def to_ts3(ts2, freq):
    """Expand a bucketed series onto a gap-free time grid and flag gaps.

    The output has one row for every ``freq`` step between the first and last
    ``bucket_start`` of ``ts2`` and adds these int8 flags:

    * ``is_missing_bucket`` — no row existed in ``ts2`` for that step;
    * ``is_gap_storm`` — the bucket lies in the half-open range
      ``[GAP_STORM_START.floor(freq), GAP_STORM_END.floor(freq))``;
    * ``is_gap_unknown`` — the bucket belongs to a run of consecutive missing
      buckets spanning at least ``UNKNOWN_GAP_MIN_HOURS``;
    * ``is_gap`` — either of the two gap flags is set.

    For every column in ``FILL_COLS``, missing values outside gaps become 0
    and all values inside gaps become NaN.
    """
    first_bucket = ts2["bucket_start"].min()
    last_bucket = ts2["bucket_start"].max()
    grid = pd.DataFrame({
        "bucket_start": pd.date_range(first_bucket, last_bucket, freq=freq, tz=first_bucket.tz)
    })
    out = grid.merge(ts2, on="bucket_start", how="left")
    out["is_missing_bucket"] = out["hits"].isna().astype("int8")

    # Storm window, aligned to bucket boundaries.
    storm_from = GAP_STORM_START.floor(freq)
    storm_until = GAP_STORM_END.floor(freq)
    in_storm = out["bucket_start"].ge(storm_from) & out["bucket_start"].lt(storm_until)
    out["is_gap_storm"] = in_storm.astype("int8")

    # Label consecutive runs of equal "missing" state, then measure each run.
    missing = out["is_missing_bucket"].astype(bool)
    run_id = missing.ne(missing.shift()).cumsum()
    bucket_minutes = pd.Timedelta(freq).total_seconds() / 60
    min_len = int((UNKNOWN_GAP_MIN_HOURS * 60) / bucket_minutes)
    missing_in_run = missing.groupby(run_id).transform("sum")
    out["is_gap_unknown"] = (missing & (missing_in_run >= min_len)).astype("int8")

    out["is_gap"] = (out["is_gap_storm"].eq(1) | out["is_gap_unknown"].eq(1)).astype("int8")

    in_gap = out["is_gap"] == 1
    for col in FILL_COLS:
        # Outside gaps an absent value means "no traffic"; inside gaps it is unknown.
        quiet_bucket = ~in_gap & out[col].isna()
        out.loc[quiet_bucket, col] = 0
        out.loc[in_gap, col] = np.nan
    return out

def rep(split, k, df):
    """Print a one-line summary of a bucketed frame.

    The line shows the split and window tag, the row count, the first and last
    ``bucket_start``, and the totals of the ``is_missing_bucket``, ``is_gap``,
    ``is_gap_storm`` and ``is_gap_unknown`` flag columns.
    """
    first_bucket = df["bucket_start"].min()
    last_bucket = df["bucket_start"].max()
    n_missing = int(df["is_missing_bucket"].sum())
    n_gap = int(df["is_gap"].sum())
    n_storm = int(df["is_gap_storm"].sum())
    n_unknown = int(df["is_gap_unknown"].sum())

    fields = [
        f"{split}/{k}",
        f"rows={len(df):,}",
        f"range={first_bucket} -> {last_bucket}",
        f"missing={n_missing:,}",
        f"gap={n_gap:,} (storm={n_storm:,}, unknown={n_unknown:,})",
    ]
    print(" | ".join(fields))

# Build the gap-aware series for every split and window, and write each one twice:
# once under OUT_DIR (name carries the split) and once under the split's own data dir.
_split_inputs = {
    "train": (raw_train, SAVE_TRAIN_DIR),
    "test": (raw_test, SAVE_TEST_DIR),
}

for split, (raw, save_dir) in _split_inputs.items():
    for k, freq in FREQS.items():
        out = to_ts3(agg(raw, freq), freq)

        # 1) copy for outputs/02_eda, 2) copy for data/train or data/test
        p1 = os.path.join(OUT_DIR, f"ts3_{split}_{k}.parquet")
        p2 = os.path.join(save_dir, f"ts3_{k}.parquet")
        for _dest in (p1, p2):
            out.to_parquet(_dest, index=False)

        rep(split, k, out)
        for _dest in (p1, p2):
            print("  saved:", _dest)


train/1m | rows=76,320 | range=1995-07-01 00:00:00-04:00 -> 1995-08-22 23:59:00-04:00 | missing=7,852 | gap=7,211 (storm=2,264, unknown=7,210)
  saved: outputs/02_eda\ts3_train_1m.parquet
  saved: data/train\ts3_1m.parquet
train/5m | rows=15,264 | range=1995-07-01 00:00:00-04:00 -> 1995-08-22 23:55:00-04:00 | missing=1,490 | gap=1,442 (storm=453, unknown=1,441)
  saved: outputs/02_eda\ts3_train_5m.parquet
  saved: data/train\ts3_5m.parquet
train/15m | rows=5,088 | range=1995-07-01 00:00:00-04:00 -> 1995-08-22 23:45:00-04:00 | missing=483 | gap=480 (storm=151, unknown=479)
  saved: outputs/02_eda\ts3_train_15m.parquet
  saved: data/train\ts3_15m.parquet
test/1m | rows=12,960 | range=1995-08-23 00:00:00-04:00 -> 1995-08-31 23:59:00-04:00 | missing=32 | gap=0 (storm=0, unknown=0)
  saved: outputs/02_eda\ts3_test_1m.parquet
  saved: data/test\ts3_1m.parquet
test/5m | rows=2,592 | range=1995-08-23 00:00:00-04:00 -> 1995-08-31 23:55:00-04:00 | missing=2 | gap=0 (storm=0, unknown=0)
  saved: 

In [14]:
# (Optional) quick sanity check of how many request rows each split holds
print(f"train rows {len(raw_train)} test rows {len(raw_test)}")

train rows 2934932 test rows 526648
