# In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
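isoforest_global_outliers.py
- Concatenate the Tor location CSVs (ip, timestamp, lat, lon) for the whole period
- Run Isolation Forest on the individual rows (lat, lon) (IF1)
- Run Isolation Forest on the per-IP centroids (lat_mean, lon_mean) as well (IF2)
- Rank IPs by a composite score built from the two indicators and the anomaly ratio
Outputs:
- /mnt/data/isoforest_out/global_outlier_ips.csv     (IF2: IPs whose centroid is an outlier)
- /mnt/data/isoforest_out/top50_outlier_ips.csv      (top 50 IPs by composite score)
- /mnt/data/isoforest_out/centroid_scatter.png       (centroid scatter plot with outliers highlighted)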
"""
import os, zipfile, pandas as pd, numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Input archive, output directory, and the sub-directory the archive is unpacked into.
ZIP_PATH   = "/mnt/data/isolation_forest.zip"
OUT_DIR    = "/mnt/data/isoforest_out"
EXTRACTED  = os.path.join(OUT_DIR, "_extracted")
GLOBAL_CONT = 0.05   # contamination passed to the row-level Isolation Forest (IF1)
CENTROID_CONT = 0.05 # contamination passed to the centroid Isolation Forest (IF2)
RANDOM_STATE = 42  # seed for IF1; IF2 uses RANDOM_STATE + 1

def ensure_extracted():
    """Unpack ZIP_PATH into EXTRACTED unless a CSV is already present there.

    The EXTRACTED directory is created if needed.  The whole tree below it
    is searched for any file whose name ends in ".csv" (case-insensitive);
    when one is found the archive is left untouched, otherwise every member
    of the archive is extracted into EXTRACTED.
    """
    os.makedirs(EXTRACTED, exist_ok=True)

    for _dirpath, _dirnames, filenames in os.walk(EXTRACTED):
        for name in filenames:
            if name.lower().endswith(".csv"):
                # A CSV already exists, so a previous run did the extraction.
                return

    with zipfile.ZipFile(ZIP_PATH, "r") as archive:
        archive.extractall(EXTRACTED)

def read_csv_fix_header(p: str) -> pd.DataFrame:
    """Load one position CSV as an (ip, timestamp, lat, lon) frame.

    The file is always read without a header so that files with and without
    a header line go through the same path.  A first row whose first cell is
    "ip" or "0" is treated as a header and discarded.  lat and lon are
    converted to numbers and rows where either is not numeric are dropped.
    timestamp is converted to datetimes, with unparseable values becoming NaT.

    Args:
        p: Path of the CSV file to read.

    Returns:
        A DataFrame with columns ip, timestamp, lat, lon and a fresh
        0..n-1 index.  A file with no data yields an empty frame with the
        same four columns instead of raising.

    Raises:
        ValueError: If the file does not contain exactly four columns
            (raised by the column assignment).
    """
    try:
        df = pd.read_csv(p, header=None)
    except pd.errors.EmptyDataError:
        # A zero-byte / blank file must not abort the load of every other
        # file; hand back an empty frame shaped like a normal result.
        return pd.DataFrame(
            {
                "ip": pd.Series(dtype="object"),
                "timestamp": pd.Series(dtype="datetime64[ns]"),
                "lat": pd.Series(dtype="float64"),
                "lon": pd.Series(dtype="float64"),
            }
        )

    df.columns = ["ip", "timestamp", "lat", "lon"]

    # "ip" is a real header line; "0" presumably comes from files written
    # with a default integer column index -- TODO confirm against the data.
    if len(df) > 0 and str(df.iloc[0, 0]).strip().lower() in ("ip", "0"):
        df = df.drop(df.index[0])

    df["lat"] = pd.to_numeric(df["lat"], errors="coerce")
    df["lon"] = pd.to_numeric(df["lon"], errors="coerce")
    df = df.dropna(subset=["lat", "lon"]).reset_index(drop=True)

    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    except (ValueError, TypeError, OverflowError):
        # Best effort: keep the raw timestamp column when conversion is
        # impossible, but do not hide unrelated errors.
        pass
    return df

def load_all():
    """Read every CSV below EXTRACTED and concatenate them into one frame.

    Each file is parsed with read_csv_fix_header and tagged with a
    "source_file" column holding the file's base name (without directory).

    Directories and files are visited in sorted order.  os.walk otherwise
    yields them in whatever order the filesystem returns, which would make
    the row order of the result -- and with it the seeded Isolation Forest
    fits downstream -- differ from one machine to another.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        SystemExit: If no file ending in ".csv" is found.
    """
    frames = []
    for root, dirs, files in os.walk(EXTRACTED):
        # In-place sort so the top-down walk descends in a fixed order.
        dirs.sort()
        for f in sorted(files):
            if f.lower().endswith(".csv"):
                d = read_csv_fix_header(os.path.join(root, f))
                d["source_file"] = f
                frames.append(d)
    if not frames:
        raise SystemExit("CSVが見つかりません。")
    return pd.concat(frames, ignore_index=True)

def _score_rows(df_all):
    """Fit IF1 on every (lat, lon) row and attach the results to df_all in place.

    Adds row_pred (fit_predict output, -1 is treated as an anomaly below),
    row_score (decision_function output) and row_is_anomaly (0/1 flag).
    """
    X = df_all[["lat", "lon"]].values
    if1 = IsolationForest(
        n_estimators=300, contamination=GLOBAL_CONT, random_state=RANDOM_STATE, n_jobs=-1
    )
    df_all["row_pred"] = if1.fit_predict(X)
    df_all["row_score"] = if1.decision_function(X)
    # Integer flag so the per-IP count and ratio can use the built-in
    # "sum" / "mean" aggregations instead of per-group Python lambdas.
    df_all["row_is_anomaly"] = (df_all["row_pred"] == -1).astype(int)


def _aggregate_by_ip(df_all):
    """Collapse the scored rows to one summary row per IP."""
    return (
        df_all.groupby("ip")
        .agg(
            obs_count=("ip", "size"),
            anomaly_count=("row_is_anomaly", "sum"),
            anomaly_ratio=("row_is_anomaly", "mean"),
            mean_score=("row_score", "mean"),
            lat_mean=("lat", "mean"),
            lon_mean=("lon", "mean"),
            lat_std=("lat", "std"),
            lon_std=("lon", "std"),
            first_seen=("timestamp", "min"),
            last_seen=("timestamp", "max"),
        )
        .reset_index()
    )


def _score_centroids(agg):
    """Fit IF2 on the per-IP centroids and add cent_pred / cent_score in place."""
    centroids = agg[["lat_mean", "lon_mean"]].values
    if2 = IsolationForest(
        n_estimators=300, contamination=CENTROID_CONT, random_state=RANDOM_STATE + 1, n_jobs=-1
    )
    agg["cent_pred"] = if2.fit_predict(centroids)
    agg["cent_score"] = if2.decision_function(centroids)


def _min_max(series):
    """Rescale series to [0, 1]; the epsilon avoids 0/0 when all values are equal."""
    eps = 1e-12
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest + eps)


def _composite_score(agg):
    """Combine anomaly ratio, mean row score and the centroid flag into one score.

    A higher anomaly_ratio and a lower mean_score both raise the result
    (weight 1.0 each); being a centroid outlier adds a flat 0.5.
    """
    r_anom = _min_max(agg["anomaly_ratio"])
    r_score = _min_max(agg["mean_score"])
    is_centroid_outlier = (agg["cent_pred"] == -1).astype(float)
    return (1.0 * r_anom) + (1.0 * (1 - r_score)) + (0.5 * is_centroid_outlier)


def _plot_centroids(agg, plot_path):
    """Save a scatter plot of all IP centroids with centroid outliers marked."""
    fig, ax = plt.subplots()
    try:
        ax.scatter(agg["lat_mean"], agg["lon_mean"], s=10, label="IPs")
        outs = agg[agg["cent_pred"] == -1]
        if not outs.empty:
            ax.scatter(
                outs["lat_mean"], outs["lon_mean"], s=30, marker="x", label="centroid outliers"
            )
        ax.set_title("IP centroids with centroid outliers")
        ax.set_xlabel("lat_mean")
        ax.set_ylabel("lon_mean")
        ax.legend()
        fig.tight_layout()
        fig.savefig(plot_path)
    finally:
        # Close the figure even when drawing or saving fails, so it is not leaked.
        plt.close(fig)


def run():
    """Run the whole pipeline and print the paths of the three output files.

    Steps: extract and load the CSVs, score rows with IF1, aggregate per IP,
    score centroids with IF2, compute the composite outlier score, then write

    - global_outlier_ips.csv: IPs flagged by IF2, ordered by ascending
      cent_score, then descending anomaly_ratio;
    - top50_outlier_ips.csv: the 50 IPs with the highest outlier_score;
    - centroid_scatter.png: centroid scatter plot with IF2 outliers marked;

    all inside OUT_DIR.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    ensure_extracted()
    df_all = load_all()

    _score_rows(df_all)
    agg = _aggregate_by_ip(df_all)
    _score_centroids(agg)
    agg["outlier_score"] = _composite_score(agg)

    centroid_outliers = agg[agg["cent_pred"] == -1].sort_values(
        ["cent_score", "anomaly_ratio"], ascending=[True, False]
    )
    centroid_out_path = os.path.join(OUT_DIR, "global_outlier_ips.csv")
    centroid_outliers.to_csv(centroid_out_path, index=False, encoding="utf-8")

    top50 = agg.sort_values(
        ["outlier_score", "anomaly_ratio"], ascending=[False, False]
    ).head(50)
    top50_path = os.path.join(OUT_DIR, "top50_outlier_ips.csv")
    top50.to_csv(top50_path, index=False, encoding="utf-8")

    plot_path = os.path.join(OUT_DIR, "centroid_scatter.png")
    _plot_centroids(agg, plot_path)

    print(centroid_out_path)
    print(top50_path)
    print(plot_path)

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    run()
