In [7]:
# =========================
# CELL 1 — Config & Paths (judge-friendly + reproducible)
# - No hard-coded absolute paths
# - Works whether you run notebook from repo root or from notebooks/
# - DOES NOT copy data anywhere (judge-friendly)
# - Uses pathlib for cross-platform paths
# =========================

import os, re, json, math
from datetime import datetime, timezone
from typing import Dict, Any, List, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# Reproducibility seed
# -------------------------
SEED = int(os.environ.get("SEED", "42"))
np.random.seed(SEED)

# -------------------------
# Paths (robust for Jupyter)
# -------------------------
from pathlib import Path

def _find_repo_root(start: Path) -> Path:
    """
    Locate the project root for a given starting directory.

    The resolved ``start`` and then each of its ancestors (nearest first) are
    checked for a child entry named 'notebooks'; the first match is returned.
    When nothing matches, the resolved ``start`` itself is returned.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    return next(
        (folder for folder in candidates if (folder / "notebooks").exists()),
        origin,
    )

# In Jupyter, __file__ may not exist. Use cwd.
CWD = Path.cwd().resolve()
# NOTE: _find_repo_root falls back to CWD when no ancestor contains a
# 'notebooks' entry; in that case every path below is rooted at CWD.
REPO_ROOT = _find_repo_root(CWD)

# If notebook is in notebooks/, keep NOTEBOOKS_DIR = REPO_ROOT/notebooks
NOTEBOOKS_DIR = (REPO_ROOT / "notebooks").resolve()

# Data/outputs location:
# - Prefer notebooks/data if exists
# - Else fallback to repo_root/data
# The fallback path is not checked for existence here.
DATA_DIR = (NOTEBOOKS_DIR / "data") if (NOTEBOOKS_DIR / "data").exists() else (REPO_ROOT / "data")

# Outputs always under notebooks/outputs to match your current structure
OUT_DIR = NOTEBOOKS_DIR / "outputs"
OUT_02 = OUT_DIR / "02_eda"
OUT_03 = OUT_DIR / "03_features"
OUT_04 = OUT_DIR / "04_models"
OUT_04P = OUT_04 / "predictions"
OUT_05 = OUT_DIR / "05_scaling"

# Side effect: create every output folder (and any missing parents) up front.
# Only output folders are created; DATA_DIR is never created or written to.
for p in [OUT_02, OUT_03, OUT_04, OUT_04P, OUT_05]:
    p.mkdir(parents=True, exist_ok=True)

# Provide string paths if later cells use os.path.join / strings
# NOTE: despite its name, PROJECT_ROOT points at NOTEBOOKS_DIR, not REPO_ROOT.
PROJECT_ROOT = str(NOTEBOOKS_DIR)  # keep compatible with later cells that expect PROJECT_ROOT as string

# Echo the resolved locations so a reader can verify them in the cell output.
print("✅ Paths resolved:")
print(" - REPO_ROOT     :", REPO_ROOT)
print(" - NOTEBOOKS_DIR :", NOTEBOOKS_DIR)
print(" - DATA_DIR      :", DATA_DIR)
print(" - OUT_DIR       :", OUT_DIR)

# -------------------------
# Core helpers (keep as-is for other cells)
# -------------------------
def tag_minutes(tag: str) -> int:
    """Return the bucket length in minutes for a tag ("1m", "5m" or "15m").

    Any other tag raises ``KeyError``.
    """
    minutes_by_tag = {"1m": 1, "5m": 5, "15m": 15}
    return minutes_by_tag[tag]

def steps_per_day(tag: str) -> int:
    """Return how many buckets of size ``tag`` make up one day (1440 minutes)."""
    minutes_in_day = 24 * 60
    bucket_minutes = tag_minutes(tag)
    return int(minutes_in_day / bucket_minutes)

def steps_per_hour(tag: str) -> int:
    """Return how many buckets of size ``tag`` make up one hour (60 minutes)."""
    bucket_minutes = tag_minutes(tag)
    return int(60 / bucket_minutes)

def resolve_roll_windows(tag: str, roll_windows: List[str]) -> Dict[str, int]:
    """Translate rolling-window labels into step counts for the given tag.

    Supported labels are "1h", "6h" and "1d". The result maps each requested
    label to its length in buckets, in the order the labels were given.

    Raises:
        ValueError: if a label is not one of the supported windows.
    """
    per_hour = steps_per_hour(tag)
    per_day = steps_per_day(tag)
    steps_by_window = {
        "1h": 1 * per_hour,
        "6h": 6 * per_hour,
        "1d": 1 * per_day,
    }

    resolved = {}
    for window in roll_windows:
        if window not in steps_by_window:
            raise ValueError(f"Unsupported roll window: {window}")
        resolved[window] = steps_by_window[window]
    return resolved

# -------------------------
# CFG (one source of truth)
# -------------------------
# Single configuration dictionary for the notebook. The modeling cell reads
# CFG["TAGS"], CFG["TARGETS"], CFG["XGB_PARAMS"] and the CFG["CV_*"] keys.
CFG: Dict[str, Any] = {
    # ===== Dataset =====
    # Prefer access_log.txt in DATA_DIR; judge just needs to put data in ./data or ./notebooks/data
    "RAW_LOG_PATH": str(DATA_DIR / "access_log.txt"),
    # Bucket-size tags; each one must be a key accepted by tag_minutes().
    "TAGS": ["1m", "5m", "15m"],
    "TIME_COL_RAW": "timestamp",
    "TIME_COL_BUCKET": "bucket_start",

    # Storm gap (problem statement)
    # Naive (timezone-less) timestamps.
    "STORM_START": pd.Timestamp("1995-08-01 14:52:01"),
    "STORM_END":   pd.Timestamp("1995-08-03 04:36:13"),

    # ===== Feature engineering =====
    # NOTE(review): the feature-building cell is not visible here; the meaning
    # of these keys is presumed from their names — confirm against that cell.
    "LAG_DAYS": [1,2,3,4,5,6,7],
    # Labels here should be ones resolve_roll_windows() supports ("1h", "6h", "1d").
    "ROLL_WINDOWS": ["1h","6h","1d"],
    "ROLL_USE_STD": True,
    "USE_CYCLIC": True,
    "HORIZON_STEPS": 1,
    "KEEP_RAW_EXTRA": [
        "unique_hosts","err_4xx","err_5xx","error_rate",
        "is_missing_bucket","is_gap_storm","is_gap_unknown"
    ],
    "REQUIRE_COLS": ["bucket_start","hits","bytes_sum","is_gap"],

    # ===== Modeling =====
    # One model is trained per (target, tag) pair.
    "TARGETS": ["hits", "bytes_sum"],
    # Unpacked into xgb.XGBRegressor(**...) for every CV fold; the final
    # full-data refit uses the same parameters minus early_stopping_rounds.
    "XGB_PARAMS": dict(
        booster="gbtree",
        n_estimators=5000,
        early_stopping_rounds=50,
        objective="reg:squarederror",
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=SEED,
    ),
    # Upper bound on TimeSeriesSplit folds; fewer are used when the training
    # set is too short, and CV is skipped entirely below 2 effective folds.
    "CV_SPLITS": 5,
    # Validation fold length in days; converted to buckets per tag.
    "CV_TEST_DAYS": 2,
    # Passed as the `gap` argument of TimeSeriesSplit (buckets between train and validation).
    "CV_GAP_STEPS": 1,

    # ==========================================================
    # AUTOSCALING / SIMULATION CONFIG (Window-aware + Metric-aware)
    # ==========================================================
    # NOTE(review): nothing in the visible cells reads CFG["SCALING"]; units and
    # semantics of the values below are presumed from key names — TODO confirm
    # against the scaling/simulation cell.
    "SCALING": {
        "min_instances": 2,
        "max_instances": 50,
        "cost_per_instance_per_hour": 0.05,
        # Duplicates the tag -> minutes mapping of tag_minutes(); keep both in sync.
        "window_minutes": {"1m": 1, "5m": 5, "15m": 15},
        "safety_buffer_by_metric": {"hits": 0.3, "bytes_sum": 0.3},
        # Keyed by (metric, window) tuples.
        "capacity_per_instance": {
            ("hits","1m"): 20, ("hits","5m"): 100, ("hits","15m"): 350,
            ("bytes_sum","1m"): 350_000, ("bytes_sum","5m"): 1_200_000, ("bytes_sum","15m"): 3_500_000,
        },
        "max_step_change_by_window": {"1m": 6, "5m": 10, "15m": 15},
        "hysteresis_by_window": {
            "1m": {"high": 2, "low": 6, "in_margin": 0.18},
            "5m": {"high": 1, "low": 4, "in_margin": 0.15},
            "15m":{"high": 1, "low": 2, "in_margin": 0.12},
        },
        "predictive_deadband_by_window": {"1m": 0.5, "5m": 0.5, "15m": 0.5},
        "cooldown_minutes": {"base": 8, "spike": 15},
        "provisioning_by_window": {
            "1m": {"warmup_windows": 1, "min_uptime_windows": 6},
            "5m": {"warmup_windows": 1, "min_uptime_windows": 4},
            "15m":{"warmup_windows": 0, "min_uptime_windows": 2},
        },
        "reactive": {
            "enabled": True,
            "overload_scale_out_immediate": True,
            "rescue_extra_instances": 3,
            "queue_low_fraction": 0.05,
            "queue_high_multiplier": 4.0,
        },
        "slo": {
            "base_latency_ms": 80.0,
            "alpha_latency_per_unit_queue": 0.15,
            "p95_latency_target_ms": 300.0,
        },
        "anomaly": {
            "enabled": True,
            "method": "mad",
            "lookback_hours": 2,
            "mad_k": 6.0,
            "min_points": 10,
            "max_flag_rate": 0.30,
        },
        "ddos_mode": {
            "enabled": True,
            "force_scale_out_step_by_window": {"1m": 6, "5m": 10, "15m": 12},
            "max_instances_during_ddos": 50,
        },
    }
}

print("✅ Cell 1 done — CFG ready (CFG['SCALING'] exists)")


✅ Paths resolved:
 - REPO_ROOT     : C:\Users\PC\OneDrive - National Economics University\Máy tính\SC\AUTOSCALING-ANALYSIS
 - NOTEBOOKS_DIR : C:\Users\PC\OneDrive - National Economics University\Máy tính\SC\AUTOSCALING-ANALYSIS\notebooks
 - DATA_DIR      : C:\Users\PC\OneDrive - National Economics University\Máy tính\SC\AUTOSCALING-ANALYSIS\notebooks\data
 - OUT_DIR       : C:\Users\PC\OneDrive - National Economics University\Máy tính\SC\AUTOSCALING-ANALYSIS\notebooks\outputs
✅ Cell 1 done — CFG ready (CFG['SCALING'] exists)


In [8]:

# 04_modeling_forecast.ipynb
# Runs XGB + Seasonal Naive, writes outputs/04_models/metrics_forecast.csv and pred files in outputs/04_models/predictions

import os, json, math
import numpy as np
import pandas as pd
from datetime import datetime, timezone

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error

# =========================
# PATHS (judge-friendly) + REPRODUCIBLE
# Put this at TOP of 04_modeling_forecast.ipynb
# =========================

# Reproducible
SEED = int(os.environ.get("SEED", "42"))
np.random.seed(SEED)

# Detect where we are running from:
# - If run inside notebooks/: BASE_DIR = notebooks
# - If run from repo root: BASE_DIR = repo_root/notebooks (preferred)
# Path is imported here so that this cell also runs on a fresh kernel, without
# relying on an earlier cell having imported it.
from pathlib import Path

cwd = Path.cwd()
if cwd.name.lower() == "notebooks":
    BASE_DIR = cwd
elif (cwd / "notebooks").exists():
    BASE_DIR = cwd / "notebooks"
else:
    # fallback: assume current is notebooks-like
    BASE_DIR = cwd

# Feature files are read from OUT_03 and model artifacts are written to OUT_04.
# Both can be overridden through environment variables of the same name;
# otherwise they default to BASE_DIR/outputs/...
OUT_03 = Path(os.environ.get("OUT_03", str(BASE_DIR / "outputs" / "03_features")))
OUT_04 = Path(os.environ.get("OUT_04", str(BASE_DIR / "outputs" / "04_models")))
OUT_04P = OUT_04 / "predictions"
OUT_04.mkdir(parents=True, exist_ok=True)
OUT_04P.mkdir(parents=True, exist_ok=True)

METRICS_PATH = OUT_04 / "metrics_forecast.csv"

# Metrics are appended to METRICS_PATH row by row, so start from a clean file
# to avoid duplicated rows when the notebook is re-run.
# (Comment the two lines below out if you prefer to keep appending.)
if METRICS_PATH.exists():
    METRICS_PATH.unlink()


def mape_threshold(y_true, y_pred, min_y=1.0):
    """Mean absolute percentage error restricted to sufficiently large targets.

    Only positions where ``abs(y_true) >= min_y`` contribute, which keeps
    near-zero actuals from blowing up the ratio. Returns the error in percent,
    or NaN when no position passes the threshold.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    keep = np.abs(actual) >= float(min_y)
    if not keep.any():
        return np.nan

    relative_error = np.abs((actual[keep] - forecast[keep]) / actual[keep])
    return float(np.mean(relative_error) * 100.0)

def compute_metrics(y_true, y_pred, target):
    """Return RMSE, MSE, MAE and thresholded MAPE for one set of predictions.

    The MAPE threshold is 1024.0 when ``target`` is 'bytes_sum' and 1.0 for any
    other target. The result is a dict keyed "RMSE", "MSE", "MAE", "MAPE".
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    mse = float(mean_squared_error(actual, forecast))
    mape_floor = 1024.0 if target == 'bytes_sum' else 1.0

    return {
        "RMSE": float(np.sqrt(mse)),
        "MSE": mse,
        "MAE": float(mean_absolute_error(actual, forecast)),
        "MAPE": mape_threshold(actual, forecast, min_y=mape_floor),
    }

def write_metrics_long(rows):
    """Append long-format metric rows to the CSV at ``METRICS_PATH``.

    The header line is written only when the file does not exist yet, so
    repeated calls build up a single table.
    """
    frame = pd.DataFrame(rows)
    is_new_file = not os.path.exists(METRICS_PATH)
    frame.to_csv(
        METRICS_PATH,
        mode="a",
        header=is_new_file,
        index=False,
        encoding="utf-8-sig",
    )

def tag_minutes(tag):
    """Return the bucket length in minutes for "1m", "5m" or "15m" (KeyError otherwise)."""
    lookup = {"1m": 1, "5m": 5, "15m": 15}
    return lookup[tag]
def steps_per_day(tag):
    """Return the number of buckets of size ``tag`` in one day."""
    minutes_per_bucket = tag_minutes(tag)
    return int(24 * 60 / minutes_per_bucket)

# Use CFG from CELL 1 when it already exists in the kernel; otherwise fall back
# to a minimal config holding only the keys this cell reads.
if "CFG" not in globals():
    CFG={
        "TAGS":["1m","5m","15m"],
        "TARGETS":["hits","bytes_sum"],
        "CV_SPLITS":5,"CV_TEST_DAYS":2,"CV_GAP_STEPS":1,
        "XGB_PARAMS": dict(
            booster="gbtree",
            n_estimators=5000,
            early_stopping_rounds=50,
            objective="reg:squarederror",
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            # Follow the SEED read from the environment at the top of this cell
            # instead of a hard-coded 42, so overriding SEED also affects XGBoost.
            random_state=SEED,
        )
    }

def train_xgb_one(tag, target):
    """Cross-validate, refit and evaluate an XGBoost regressor for one (tag, target).

    Reads ``meta_{tag}.json``, ``xgb_train_{tag}.parquet`` and
    ``xgb_test_features_{tag}.parquet`` from ``OUT_03``. Writes the fitted
    booster and the feature-column list to ``OUT_04``, the test predictions
    (CSV and parquet) to ``OUT_04P``, and appends the CV-mean and test metric
    rows through ``write_metrics_long``.

    Args:
        tag: Bucket-size tag accepted by ``tag_minutes`` ("1m", "5m", "15m").
        target: "hits" selects the hits features and label; any other value is
            handled as "bytes_sum", which is fitted on a log1p scale and
            transformed back with expm1 before scoring.

    Returns:
        Dict with "RMSE", "MSE", "MAE" and "MAPE" measured on the test file
        (all NaN when the test file has no row with a known next value).
    """
    metric_names=["RMSE","MSE","MAE","MAPE"]

    # Context manager so the metadata file handle is closed deterministically.
    with open(os.path.join(OUT_03,f"meta_{tag}.json"),"r",encoding="utf-8") as meta_file:
        meta=json.load(meta_file)
    TIME_COL=meta["time_col"]

    train=pd.read_parquet(os.path.join(OUT_03,f"xgb_train_{tag}.parquet"))
    testf=pd.read_parquet(os.path.join(OUT_03,f"xgb_test_features_{tag}.parquet"))

    # Normalise the time column to naive datetimes so both frames sort consistently.
    train[TIME_COL]=pd.to_datetime(train[TIME_COL], errors="coerce")
    testf[TIME_COL]=pd.to_datetime(testf[TIME_COL], errors="coerce")
    if getattr(train[TIME_COL].dt,"tz",None) is not None: train[TIME_COL]=train[TIME_COL].dt.tz_convert(None)
    if getattr(testf[TIME_COL].dt,"tz",None) is not None: testf[TIME_COL]=testf[TIME_COL].dt.tz_convert(None)

    train=train.sort_values(TIME_COL).reset_index(drop=True)
    testf=testf.sort_values(TIME_COL).reset_index(drop=True)

    if target=="hits":
        FEAT_COLS=list(meta["hits_feature_cols"]); LABEL=meta["labels"]["hits"]; TRUE_COL="hits"; use_log=False
    else:
        FEAT_COLS=list(meta["bytes_feature_cols"]); LABEL=meta["labels"]["bytes_sum"]; TRUE_COL="bytes_sum"; use_log=True

    # Never let the time column, the current value or the label leak in as a feature.
    FEAT_COLS=[c for c in FEAT_COLS if c not in (TIME_COL, TRUE_COL, LABEL)]

    # TimeSeriesSplit sizing
    freq_min=tag_minutes(tag)
    test_size=int(CFG["CV_TEST_DAYS"]*24*60/freq_min)
    gap=int(CFG["CV_GAP_STEPS"])
    n=len(train)
    # A zero-length validation window (CV_TEST_DAYS == 0) disables CV instead of
    # raising ZeroDivisionError.
    max_splits=((n-gap)//test_size - 1) if test_size>0 else 0
    n_splits_eff=int(min(CFG["CV_SPLITS"], max(0,max_splits)))

    cv_metrics=[]
    if n_splits_eff>=2:
        tss=TimeSeriesSplit(n_splits=n_splits_eff, test_size=test_size, gap=gap)
        for tr_idx, va_idx in tss.split(train):
            tr=train.iloc[tr_idx]; va=train.iloc[va_idx]
            X_tr,X_va=tr[FEAT_COLS],va[FEAT_COLS]
            y_tr=tr[LABEL].astype(float).values
            y_va=va[LABEL].astype(float).values
            if use_log:
                y_tr_fit=np.log1p(np.maximum(y_tr,0.0))
                y_va_fit=np.log1p(np.maximum(y_va,0.0))
            else:
                y_tr_fit,y_va_fit=y_tr,y_va
            # The validation fold doubles as the early-stopping set.
            reg=xgb.XGBRegressor(**CFG["XGB_PARAMS"])
            reg.fit(X_tr,y_tr_fit,eval_set=[(X_va,y_va_fit)],verbose=False)
            pred_fit=reg.predict(X_va)
            pred=np.expm1(pred_fit) if use_log else pred_fit
            pred=np.maximum(pred,0.0)
            # Metrics are always computed on the original (non-log) scale.
            cv_metrics.append(compute_metrics(y_va,pred,target))

    cv_mean={k: float(np.mean([m[k] for m in cv_metrics])) if cv_metrics else np.nan for k in metric_names}

    # Retrain full
    X_all=train[FEAT_COLS]
    y_all=train[LABEL].astype(float).values
    y_fit=np.log1p(np.maximum(y_all,0.0)) if use_log else y_all
    final_params={k:v for k,v in CFG["XGB_PARAMS"].items() if k!="early_stopping_rounds"}
    model=xgb.XGBRegressor(**final_params)
    # No eval_set here: early stopping is disabled for the refit, so scoring the
    # training data every boosting round would only cost time.
    model.fit(X_all,y_fit,verbose=False)

    model_path=os.path.join(OUT_04,f"model_xgb_{target}_{tag}.json")
    model.get_booster().save_model(model_path)
    # Context manager guarantees the feature list is flushed and closed.
    feat_path=os.path.join(OUT_04,f"feat_cols_xgb_{target}_{tag}.json")
    with open(feat_path,"w",encoding="utf-8") as feat_file:
        json.dump(FEAT_COLS, feat_file, ensure_ascii=False, indent=2)

    # Predict test aligned t->t+1
    df=testf[[TIME_COL, TRUE_COL]+FEAT_COLS].copy().sort_values(TIME_COL).reset_index(drop=True)
    df=df.loc[:,~df.columns.duplicated()].copy()
    # The value to predict for row t is the observed value of row t+1.
    df["true_next"]=pd.to_numeric(df[TRUE_COL],errors="coerce").astype(float).shift(-1)
    eval_df=df[df["true_next"].notna()].copy()
    if len(eval_df)==0:
        # Nothing to score: still export an empty, correctly shaped table.
        out0=df[[TIME_COL, TRUE_COL, "true_next"]].head(0).copy()
        out0["pred"]=np.nan
        test_m={k:np.nan for k in metric_names}
    else:
        pred_fit=model.predict(eval_df[FEAT_COLS])
        pred=np.expm1(pred_fit) if use_log else pred_fit
        pred=np.maximum(pred,0.0)
        eval_df["pred"]=pred
        test_m=compute_metrics(eval_df["true_next"].values, eval_df["pred"].values, target)
        out0=eval_df[[TIME_COL, TRUE_COL, "true_next", "pred"]]

    # export preds
    csv_path=os.path.join(OUT_04P, f"pred_{target}_{tag}_xgb.csv")
    pq_path=os.path.join(OUT_04P, f"pred_{target}_{tag}_xgb.parquet")
    out0.to_csv(csv_path,index=False,encoding="utf-8-sig")
    out0.to_parquet(pq_path,index=False)

    rows=[]
    for split,metrics in [("cv_mean",cv_mean),("test",test_m)]:
        for metric_name,v in metrics.items():
            rows.append({"model":"xgb","target":target,"window":tag,"split":split,"metric":metric_name,"value":float(v) if v is not None else np.nan})
    write_metrics_long(rows)
    return test_m

# Model 2: seasonal naive
def seasonal_naive_forecast(hist, season_len):
    """Forecast the next value as the observation one season back.

    ``hist`` is the observed history, oldest first. With an empty history the
    forecast is 0.0; with less than one full season of history the most recent
    observation is repeated; otherwise the value ``season_len`` positions from
    the end is returned.
    """
    values = np.asarray(hist, dtype=float)
    n_obs = len(values)
    if n_obs == 0:
        return 0.0
    # Fall back to the last observation until a full season is available.
    lookback = season_len if n_obs >= season_len else 1
    return float(values[-lookback])

def train_seasonal_naive_one(tag, target):
    """Evaluate a one-step-ahead seasonal-naive baseline for one (tag, target).

    Reads ``xgb_train_{tag}.parquet`` and ``xgb_test_{tag}.parquet`` from
    ``OUT_03``. For every test row t the forecast of the value at t+1 is the
    observation one day (``steps_per_day(tag)`` buckets) before t+1, taken from
    the training history followed by the test values seen up to t. While less
    than one day of history exists, the latest observation is repeated.
    Forecasts are clipped at 0.

    Predictions (CSV and parquet) are written to ``OUT_04P`` and the test
    metrics are appended through ``write_metrics_long`` — also when there is no
    evaluable row, in which case the files are empty and the metrics are NaN.

    Args:
        tag: Bucket-size tag accepted by ``steps_per_day`` ("1m", "5m", "15m").
        target: "hits" reads the ``hits_true`` column of the test file; any
            other value reads ``bytes_sum_true``.

    Returns:
        Dict with "RMSE", "MSE", "MAE" and "MAPE" on the test rows.
    """
    TIME_COL="bucket_start"
    metric_names=["RMSE","MSE","MAE","MAPE"]

    train=pd.read_parquet(os.path.join(OUT_03,f"xgb_train_{tag}.parquet"))
    test_truth=pd.read_parquet(os.path.join(OUT_03,f"xgb_test_{tag}.parquet"))
    train[TIME_COL]=pd.to_datetime(train[TIME_COL], errors="coerce")
    test_truth[TIME_COL]=pd.to_datetime(test_truth[TIME_COL], errors="coerce")

    # The seasonal look-back depends on row order, so the history must be
    # chronological (the XGB path sorts its training frame the same way).
    train=train.sort_values(TIME_COL).reset_index(drop=True)

    # choose correct true col in truth file
    true_col = "hits_true" if target=="hits" else "bytes_sum_true"
    te=test_truth.rename(columns={true_col: target}).copy()
    te=te.sort_values(TIME_COL).reset_index(drop=True)

    # The value to predict for row t is the observed value of row t+1.
    te["true_next"]=pd.to_numeric(te[target],errors="coerce").astype(float).shift(-1)

    # Full chronological series: training history followed by the test values.
    # Missing history counts as 0; non-finite test values count as 0 as well.
    train_hist=pd.to_numeric(train[target],errors="coerce").astype(float).fillna(0.0).to_numpy()
    y_test=pd.to_numeric(te[target],errors="coerce").astype(float).to_numpy()
    y_test=np.where(np.isfinite(y_test), y_test, 0.0)
    series=np.concatenate([train_hist, y_test])

    # Forecast for EVERY test row first and filter afterwards, so each
    # prediction stays attached to its own row even when some rows have no
    # known next value. `pos` is the index of y_t inside `series`; once t is
    # included the history holds pos+1 points.
    season_len=steps_per_day(tag)
    pos=len(train_hist)+np.arange(len(te))
    has_full_season=(pos+1)>=season_len
    source_idx=np.where(has_full_season, pos+1-season_len, pos)
    te["pred"]=np.maximum(series[source_idx], 0.0)

    eval_df=te[te["true_next"].notna()].copy().reset_index(drop=True)
    out=eval_df[[TIME_COL,target,"true_next","pred"]]

    # Always export, so downstream file checks see this model's outputs.
    csv_path=os.path.join(OUT_04P,f"pred_{target}_{tag}_seasonal_naive.csv")
    pq_path=os.path.join(OUT_04P,f"pred_{target}_{tag}_seasonal_naive.parquet")
    out.to_csv(csv_path,index=False,encoding="utf-8-sig")
    out.to_parquet(pq_path,index=False)

    if len(eval_df)==0:
        tm={k:np.nan for k in metric_names}
    else:
        tm=compute_metrics(eval_df["true_next"].values, eval_df["pred"].values, target)

    rows=[{"model":"seasonal_naive","target":target,"window":tag,"split":"test","metric":k,"value":float(v) if v is not None else np.nan} for k,v in tm.items()]
    write_metrics_long(rows)
    return tm

# Run all
# Train and evaluate every (target, tag) combination with XGBoost first ...
print("Running XGB...")
for target in CFG["TARGETS"]:
    for tag in CFG["TAGS"]:
        tm=train_xgb_one(tag,target)
        print("xgb",target,tag,tm)

# ... then with the seasonal-naive baseline, which reads the same OUT_03 files.
print("Running seasonal_naive...")
for target in CFG["TARGETS"]:
    for tag in CFG["TAGS"]:
        tm=train_seasonal_naive_one(tag,target)
        print("seasonal_naive",target,tag,tm)

# Benchmark view
# Re-read the long-format metrics written by the calls above, keep only the
# "test" split and pivot so each model becomes its own column.
mdf=pd.read_csv(METRICS_PATH)
test_m=mdf[mdf["split"].astype(str).str.lower().eq("test")].copy()
# aggfunc="first": when a (target, window, metric, model) combination appears
# more than once, only the first row is kept.
bench=test_m.pivot_table(index=["target","window","metric"], columns=["model"], values="value", aggfunc="first").reset_index()
print("\nBenchmark (TEST) long->wide")
print(bench.sort_values(["target","window","metric"]).to_string(index=False))

# Validate pred files exist
# Only the CSV exports are checked; the parquet files written next to them are not.
pred_files=[]
for model in ["xgb","seasonal_naive"]:
    for target in CFG["TARGETS"]:
        for tag in CFG["TAGS"]:
            fp=os.path.join(OUT_04P,f"pred_{target}_{tag}_{model}.csv")
            pred_files.append({"model":model,"target":target,"window":tag,"path":fp,"exists":os.path.exists(fp)})
pred_check=pd.DataFrame(pred_files).sort_values(["target","window","model"]).reset_index(drop=True)
print(pred_check.to_string(index=False))


Running XGB...
xgb hits 1m {'RMSE': 21.58896693193818, 'MSE': 466.08349318832023, 'MAE': 15.58200884936115, 'MAPE': 64.46296418365739}
xgb hits 5m {'RMSE': 70.97562041973819, 'MSE': 5037.538693966758, 'MAE': 52.60916608459284, 'MAPE': 38.35502687318375}
xgb hits 15m {'RMSE': 158.22128171880073, 'MSE': 25033.973988740105, 'MAE': 112.97181200400979, 'MAPE': 20.767362835399872}
xgb bytes_sum 1m {'RMSE': 699665.340345756, 'MSE': 489531588481.1426, 'MAE': 463251.91777876223, 'MAPE': 88.17260670865876}
xgb bytes_sum 5m {'RMSE': 1862245.8911025848, 'MSE': 3467959758928.46, 'MAE': 1315083.320276317, 'MAPE': 43.37298572199524}
xgb bytes_sum 15m {'RMSE': 3885168.1435004836, 'MSE': 15094531503270.996, 'MAE': 2825506.901443004, 'MAPE': 31.80724978916396}
Running seasonal_naive...
seasonal_naive hits 1m {'RMSE': 24.292722225260516, 'MSE': 590.1363531136661, 'MAE': 18.19160429045451, 'MAPE': 73.94465300387039}
seasonal_naive hits 5m {'RMSE': 92.80845594235315, 'MSE': 8613.409494403704, 'MAE': 67.925