In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Complete version (all settings hard-coded).
- Input ZIP : ./isolation_forest.zip
- Output dir: isoforest_out
- contamination=0.10, n_estimators=200, random_state=42

Outputs:
- isoforest_out/isoforest_summary.csv
- isoforest_out/isoforest_anomalies/<each day>_anomalies.csv
- isoforest_out/isoforest_anomalies_all.csv (all days concatenated)
"""

import os, zipfile, pandas as pd, numpy as np
from sklearn.ensemble import IsolationForest

# Path of the input archive that main() extracts (relative to the working directory).
ZIP_PATH = "./isolation_forest.zip"
# Output directory; main() clears and recreates it on every run.
OUT_DIR  = "isoforest_out"
# Passed to IsolationForest as `contamination`.
CONTAM   = 0.10
# Passed to IsolationForest as `n_estimators`.
N_EST    = 200
# Passed to IsolationForest as `random_state`.
RANDOM_STATE = 42

def ensure_clean_dir(p):
    """Make sure ``p`` exists as a directory, removing previous content first.

    Removal is best-effort: if the old content cannot be deleted, a
    ``RuntimeWarning`` is emitted instead of failing silently, and the
    function still tries to create the directory.

    Args:
        p: Path of the directory to (re)create.

    Raises:
        OSError: If the directory cannot be created (propagated from
            ``os.makedirs``).
    """
    import shutil
    import warnings

    if os.path.exists(p):
        try:
            if os.path.isdir(p):
                shutil.rmtree(p)
            else:
                # A plain file squatting on the path would make makedirs()
                # below fail with FileExistsError, so remove it explicitly.
                os.remove(p)
        except OSError as exc:
            # Keep the original best-effort behaviour, but make it visible.
            warnings.warn(
                f"could not fully remove {p!r}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
    os.makedirs(p, exist_ok=True)

def read_csv_fix_header(csv_path: str) -> pd.DataFrame:
    """Load a 4-column CSV as ``ip, timestamp, lat, lon`` with numeric coordinates.

    The file is read without a header row and the column names are assigned
    explicitly. If the first row looks like a header (first cell is ``ip`` or
    ``0``, case-insensitive, surrounding whitespace ignored) it is discarded.
    ``lat`` and ``lon`` are converted to numbers; rows where either value is
    not numeric are dropped and the index is renumbered from 0.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame.
    """
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = ["ip", "timestamp", "lat", "lon"]

    # Discard a leading header row; a "0" first cell is also treated as one.
    if not frame.empty:
        first_cell = str(frame.iloc[0, 0]).strip().lower()
        if first_cell in {"ip", "0"}:
            frame = frame.drop(index=frame.index[0])

    # Coerce coordinates to numbers; unparseable values become NaN and are removed.
    frame = frame.assign(
        lat=pd.to_numeric(frame["lat"], errors="coerce"),
        lon=pd.to_numeric(frame["lon"], errors="coerce"),
    )
    return frame.dropna(subset=["lat", "lon"]).reset_index(drop=True)

def run_isoforest_on_df(df: pd.DataFrame, contam: float, n_est: int, random_state: int):
    """Fit an Isolation Forest on the ``lat``/``lon`` columns and label each row.

    Args:
        df: Input frame; must contain ``lat`` and ``lon`` columns.
        contam: Forwarded to ``IsolationForest`` as ``contamination``.
        n_est: Forwarded to ``IsolationForest`` as ``n_estimators``.
        random_state: Forwarded to ``IsolationForest`` as ``random_state``.

    Returns:
        A copy of ``df`` with two extra columns: ``pred`` (1 = normal,
        -1 = anomaly) and ``score`` (decision function value; lower means
        more anomalous). The input frame is not modified.
    """
    features = df[["lat", "lon"]].values
    forest = IsolationForest(
        n_jobs=-1,
        random_state=random_state,
        contamination=contam,
        n_estimators=n_est,
    )
    # The model must be fitted (fit_predict) before decision_function is called.
    labels = forest.fit_predict(features)
    anomaly_scores = forest.decision_function(features)

    result = df.copy()
    result["pred"] = labels
    result["score"] = anomaly_scores
    return result

def main():
    """Run Isolation Forest on every CSV inside ``ZIP_PATH`` and write reports.

    Steps:
      1. Verify the archive exists, then reset ``OUT_DIR`` and extract the
         archive into ``OUT_DIR/_extracted``.
      2. For each ``*.csv`` found (case-insensitive, sorted by path), load it,
         score it, and save its anomalous rows to
         ``OUT_DIR/isoforest_anomalies/<stem>_anomalies.csv``.
      3. Write a per-file summary (``isoforest_summary.csv``) and, if any file
         was scored, the concatenation of all anomaly rows
         (``isoforest_anomalies_all.csv``).

    Files with fewer than 10 usable rows are listed in the summary with NaN
    statistics. A file that raises during processing is listed with
    ``count=-1`` and the error message, and processing continues.

    Raises:
        SystemExit: If ``ZIP_PATH`` does not exist.
    """
    # Check the input first: clearing OUT_DIR before this check would destroy
    # the results of a previous run whenever the archive is missing.
    if not os.path.exists(ZIP_PATH):
        raise SystemExit(f"ZIP が見つかりません: {ZIP_PATH}")

    # Prepare output directories.
    ensure_clean_dir(OUT_DIR)
    extracted = os.path.join(OUT_DIR, "_extracted")
    ensure_clean_dir(extracted)
    anom_dir = os.path.join(OUT_DIR, "isoforest_anomalies")
    os.makedirs(anom_dir, exist_ok=True)

    # Extract the archive and collect the CSV files.
    with zipfile.ZipFile(ZIP_PATH, "r") as z:
        z.extractall(extracted)

    csv_files = []
    for root, _, files in os.walk(extracted):
        for f in files:
            if f.lower().endswith(".csv"):
                csv_files.append(os.path.join(root, f))
    csv_files.sort()

    rows = []
    all_anoms = []

    for csv_path in csv_files:
        # NOTE(review): only the base name is used, so same-named CSVs in
        # different sub-folders of the archive share one anomalies file.
        base = os.path.basename(csv_path)
        try:
            df = read_csv_fix_header(csv_path)
            if len(df) < 10:
                # Too few rows to score; record the file with empty statistics.
                rows.append({"file": base, "count": len(df), "anomaly_ratio": np.nan,
                             "score_mean": np.nan, "score_std": np.nan})
                continue

            out = run_isoforest_on_df(df, CONTAM, N_EST, RANDOM_STATE)

            # Aggregate statistics.
            count = len(out)
            anom_ratio = float((out["pred"] == -1).mean())
            score_mean = float(out["score"].mean())
            score_std  = float(out["score"].std(ddof=0))

            # Save anomalous rows only.
            anom = out[out["pred"] == -1].copy()
            anom["source_file"] = base
            all_anoms.append(anom)
            # splitext strips only the final extension regardless of case,
            # matching the case-insensitive ".csv" filter above.
            stem, _ext = os.path.splitext(base)
            anom_out = os.path.join(anom_dir, f"{stem}_anomalies.csv")
            anom.to_csv(anom_out, index=False, encoding="utf-8")

            rows.append({
                "file": base,
                "count": count,
                "anomaly_ratio": round(anom_ratio, 6),
                "score_mean": round(score_mean, 6),
                "score_std": round(score_std, 6),
            })
        except Exception as e:
            # Per-file boundary: record the failure and keep going.
            rows.append({"file": base, "count": -1, "anomaly_ratio": np.nan,
                         "score_mean": np.nan, "score_std": np.nan, "error": str(e)})

    # Save the summary.
    summary = pd.DataFrame(rows)
    summary_path = os.path.join(OUT_DIR, "isoforest_summary.csv")
    summary.to_csv(summary_path, index=False, encoding="utf-8")
    print(summary_path)

    # Save the anomalies of all files combined.
    if all_anoms:
        cat = pd.concat(all_anoms, ignore_index=True)
        cat_path = os.path.join(OUT_DIR, "isoforest_anomalies_all.csv")
        cat.to_csv(cat_path, index=False, encoding="utf-8")
        print(cat_path)
    else:
        print("(no anomalies)")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


SystemExit: ZIP が見つかりません: /mnt/data/isolation_forest.zip

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
