# In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
isoforest_batch.py
- Run Isolation Forest per day on Tor IP location CSVs (lat, lon)
- Write a summary CSV and, for each day, a CSV of the anomalous points
Usage (example):
    python isoforest_batch.py --zip /mnt/data/isolation_forest.zip --out /mnt/data
"""
import os, zipfile, argparse, pandas as pd, numpy as np
from sklearn.ensemble import IsolationForest

def read_csv_fix_header(csv_path: str) -> pd.DataFrame:
    """Load a headerless-style CSV and return only rows with numeric coordinates.

    The file is read with no header row and its columns are named
    ``ip``, ``timestamp``, ``lat``, ``lon`` (so it must have exactly four
    columns). If the first cell of the first row is ``"ip"`` or ``"0"``
    (case-insensitive), that row is treated as a header that leaked into the
    data and is discarded.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A DataFrame with a fresh 0-based index in which ``lat`` and ``lon``
        are numeric; rows where either could not be parsed are removed.
    """
    coord_cols = ["lat", "lon"]

    frame = pd.read_csv(csv_path, header=None)
    frame.columns = ["ip", "timestamp", *coord_cols]

    # A header line shows up as the first data row because header=None.
    if len(frame) > 0:
        first_cell = str(frame.iat[0, 0]).lower()
        if first_cell in ("ip", "0"):
            frame = frame.iloc[1:]

    # Unparseable coordinates become NaN and are filtered out just below.
    numeric_coords = {
        col: pd.to_numeric(frame[col], errors="coerce") for col in coord_cols
    }
    cleaned = frame.assign(**numeric_coords).dropna(subset=coord_cols)
    return cleaned.reset_index(drop=True)

def run_isoforest_on_file(csv_path: str, contamination: float = 0.10, random_state: int = 42):
    """Fit an Isolation Forest on the (lat, lon) points of one CSV file.

    Args:
        csv_path: CSV file to load via ``read_csv_fix_header``.
        contamination: Passed to ``IsolationForest`` as ``contamination``.
        random_state: Passed to ``IsolationForest`` as ``random_state``.

    Returns:
        ``(df_out, preds)`` where ``df_out`` is a copy of the loaded rows with
        two extra columns, ``pred`` (1 = normal, -1 = anomaly) and ``score``
        (larger = more normal, smaller = more anomalous), and ``preds`` is the
        raw prediction array. ``(None, None)`` when the file has fewer than
        10 usable rows.
    """
    frame = read_csv_fix_header(csv_path)

    # Too few points to fit anything meaningful: tell the caller to skip.
    if len(frame) < 10:
        return None, None

    model = IsolationForest(
        n_estimators=200,
        contamination=contamination,
        random_state=random_state,
        n_jobs=-1,
        warm_start=False,
    )

    features = frame[["lat", "lon"]].to_numpy()
    labels = model.fit_predict(features)          # 1 (normal) or -1 (anomaly)
    margins = model.decision_function(features)   # higher = more normal

    # assign() works on a copy, so the frame loaded above is left untouched.
    return frame.assign(pred=labels, score=margins), labels

def _collect_csv_files(zip_path, in_dir, out_dir):
    """Return the sorted paths of the CSV files to process.

    When ``zip_path`` is given, every ``.csv`` member (case-insensitive) is
    extracted under ``<out_dir>/_extracted``; otherwise ``in_dir`` is walked
    recursively. Raises ``SystemExit`` when neither source is given.
    """
    found = []
    if zip_path:
        extract_root = os.path.join(out_dir, "_extracted")
        os.makedirs(extract_root, exist_ok=True)
        with zipfile.ZipFile(zip_path, "r") as archive:
            for member in archive.namelist():
                if member.lower().endswith(".csv"):
                    # extract() sanitises the member name (absolute paths,
                    # "..", illegal characters) and returns the path it really
                    # wrote, so use that instead of re-joining the raw name.
                    found.append(archive.extract(member, extract_root))
    elif in_dir:
        for root, _, files in os.walk(in_dir):
            found.extend(
                os.path.join(root, name)
                for name in files
                if name.lower().endswith(".csv")
            )
    else:
        raise SystemExit("ZIP か indir のどちらかを指定してください。")
    return sorted(found)


def _anomaly_csv_name(base):
    """Map an input file name to its anomalies file name (``x.csv`` -> ``x_anomalies.csv``)."""
    # splitext only strips the final extension and does not care about its
    # case, unlike str.replace(".csv", ...), which missed ".CSV" and rewrote
    # every ".csv" occurrence in the name.
    stem, _ext = os.path.splitext(base)
    return stem + "_anomalies.csv"


def main():
    """Command-line entry point: run Isolation Forest on every input CSV.

    Reads CSVs from ``--zip`` (extracted under ``<out>/_extracted``) or, if no
    ZIP is given, from ``--indir`` (searched recursively). For each file it
    writes the anomalous rows to ``<out>/isoforest_anomalies/<name>_anomalies.csv``
    and finally writes ``<out>/isoforest_summary.csv`` with one row per input
    file, then prints the summary path.

    Summary ``count`` is the number of usable rows, ``0`` when the file was
    skipped for having too few rows, and ``-1`` when processing failed (the
    message is stored in the ``error`` column).

    Raises:
        SystemExit: If neither ``--zip`` nor ``--indir`` is supplied.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--zip", required=False, help="ZIPファイル（CSV群）")
    ap.add_argument("--indir", required=False, help="CSVが入ったディレクトリ（zip指定がなければこちらを使う）")
    ap.add_argument("--out", required=True, help="出力ディレクトリ（集計CSVと異常CSVフォルダを作成）")
    ap.add_argument("--cont", type=float, default=0.10, help="IsolationForest contamination")
    args = ap.parse_args()

    out_dir = os.path.abspath(args.out)
    os.makedirs(out_dir, exist_ok=True)
    anom_dir = os.path.join(out_dir, "isoforest_anomalies")
    os.makedirs(anom_dir, exist_ok=True)

    csv_files = _collect_csv_files(args.zip, args.indir, out_dir)

    rows = []
    for csv_path in csv_files:
        base = os.path.basename(csv_path)
        try:
            df_out, _ = run_isoforest_on_file(csv_path, contamination=args.cont)
            if df_out is None:
                # Skipped: not enough rows to fit a model.
                rows.append({"file": base, "count": 0, "anomaly_ratio": np.nan, "score_mean": np.nan, "score_std": np.nan})
                continue

            # Per-file statistics.
            is_anomaly = df_out["pred"] == -1
            count = len(df_out)
            anomaly_ratio = float(is_anomaly.mean())
            score_mean = float(df_out["score"].mean())
            score_std = float(df_out["score"].std(ddof=0))

            # Save only the anomalous rows.
            # NOTE: the output name is derived from the base name alone, so
            # inputs sharing a base name in different folders overwrite each
            # other's anomalies file.
            anom_out = os.path.join(anom_dir, _anomaly_csv_name(base))
            df_out[is_anomaly].to_csv(anom_out, index=False, encoding="utf-8")

            rows.append({
                "file": base,
                "count": count,
                "anomaly_ratio": round(anomaly_ratio, 6),
                "score_mean": round(score_mean, 6),
                "score_std": round(score_std, 6),
            })
        except Exception as e:
            # Best-effort batch: record the failure and keep going so one bad
            # file does not abort the whole run.
            rows.append({"file": base, "count": -1, "anomaly_ratio": np.nan, "score_mean": np.nan, "score_std": np.nan, "error": str(e)})

    summary = pd.DataFrame(rows)
    summary_path = os.path.join(out_dir, "isoforest_summary.csv")
    summary.to_csv(summary_path, index=False, encoding="utf-8")
    print(summary_path)

# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
