In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# 全期間：各日ごとの「初出（全過去比）」と「前日比」新規IPを算出・保存・表示
# ＋ 日毎散布図PNG保存 ＋ plt.show() ＋ IP/保存CSV名の表示
# ＋ 最後に 10月4日以降の new_vs_allprior / new_vs_prev を再プロット

import os, re, zipfile, pandas as pd
from datetime import datetime, date
import matplotlib.pyplot as plt

# ====== Parameters ==========================================================
# Input archive and output locations.
ZIP_PATH = "isolation_forest.zip"
OUT_DIR = "isoforest_out"
EXTRACTED = os.path.join(OUT_DIR, "_extracted")    # where the ZIP is unpacked
DIFF_DIR = os.path.join(OUT_DIR, "diff_daily")     # per-day diff CSVs + summary
PLOT_DIR = os.path.join(OUT_DIR, "plots_daily")    # per-day scatter PNGs

# Column-name hints (set only when needed), e.g.
# {"ip": "IP", "timestamp": "time", "lat": "latitude", "lon": "longitude"}
COLUMN_HINTS = {}

STRICT_IPV4 = True   # keep only rows whose ip matches the IPv4 pattern
SHOW_MAX = 50        # maximum number of IPs printed per listing
DPI = 150            # resolution used when saving PNGs

# Lower bound (inclusive) of the dates re-plotted at the end of main().
DATE_FROM = date(2025, 10, 4)
# ===========================================================================

# Create every output directory up front so later writes cannot fail on a
# missing folder.
for _out_path in (OUT_DIR, EXTRACTED, DIFF_DIR, PLOT_DIR):
    os.makedirs(_out_path, exist_ok=True)
del _out_path

_ipv4_pat = re.compile(
    r"^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$"
)

def ensure_extracted():
    """Unpack ZIP_PATH into EXTRACTED unless a CSV already exists under it.

    Does nothing when any ``.csv`` file (case-insensitive) is found below
    EXTRACTED, or when the ZIP archive itself is missing.
    """
    for _root, _dirs, names in os.walk(EXTRACTED):
        if any(name.lower().endswith(".csv") for name in names):
            return  # already extracted
    if not os.path.exists(ZIP_PATH):
        return
    with zipfile.ZipFile(ZIP_PATH, "r") as archive:
        archive.extractall(EXTRACTED)

def parse_date(val, filename=None):
    """Infer a calendar date from a timestamp value or a filename prefix.

    Args:
        val: Timestamp cell; may be missing (None/NaN) or blank.
        filename: Optional file name or path. Consulted only when ``val`` is
            missing or blank. Its basename must start with
            ``YYYYmmddHHMMSS``, ``YYYYmmdd`` or ``YYYY-mm-dd``.

    Returns:
        A ``datetime.date``, or ``None`` when no date can be determined.
    """
    if pd.isna(val) or not str(val).strip():
        if filename:
            b = os.path.basename(filename)
            for pat, fmt in [
                (r"^(\d{14})", "%Y%m%d%H%M%S"),
                (r"^(\d{8})",  "%Y%m%d"),
                (r"^(\d{4}-\d{2}-\d{2})", "%Y-%m-%d"),
            ]:
                m = re.match(pat, b)
                if not m:
                    continue
                try:
                    return datetime.strptime(m.group(1), fmt).date()
                except ValueError:
                    # The digits matched the pattern but are not a real
                    # date/time (e.g. month 13); try the next, shorter form
                    # instead of aborting the whole run.
                    continue
        return None
    s = str(val).strip()
    # Try the known explicit formats first; they are unambiguous.
    for fmt in (
        "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S",
        "%Y%m%d%H%M%S", "%Y-%m-%d", "%Y/%m/%d",
        "%Y-%m-%dT%H:%M:%S",
    ):
        try:
            return datetime.strptime(s, fmt).date()
        except ValueError:
            pass
    # Last resort: let pandas guess; unparsable input becomes NaT -> None.
    dt = pd.to_datetime(s, errors="coerce")
    return None if pd.isna(dt) else dt.date()

def read_csv_flex(p):
    """Read a CSV of unknown layout and return its ip/timestamp/lat/lon columns.

    Two strategies are tried in order:
      1. Parse with a header row and locate the four columns by name. A
         COLUMN_HINTS entry is used when that column actually exists in the
         file; otherwise the first column whose lower-cased name contains a
         keyword for the role is taken.
      2. Parse without a header, letting pandas sniff the delimiter, and
         take the first four columns positionally.

    Args:
        p: Path of the CSV file.

    Returns:
        A DataFrame with columns ["ip", "timestamp", "lat", "lon"] (values
        read as strings), or None when neither strategy succeeds.
    """
    # Substrings that identify each role in a lower-cased column name.
    keywords = {
        "ip": ("ip",),
        "timestamp": ("time", "timestamp", "date", "datetime"),
        "lat": ("lat",),
        "lon": ("lon", "lng", "long"),
    }

    def _pick(df):
        cols = list(df.columns)

        def _resolve(want):
            # Honor a hint only when the file really has that column. A stale
            # hint would otherwise raise KeyError below, be swallowed, and
            # push the file into the header-less fallback where its header
            # row is kept as data.
            hint = COLUMN_HINTS.get(want)
            if hint is not None and hint in cols:
                return hint
            for c in cols:
                lc = str(c).lower()
                if any(k in lc for k in keywords[want]):
                    return c
            return None

        ip = _resolve("ip")
        ts = _resolve("timestamp")
        lat = _resolve("lat")
        lon = _resolve("lon")
        if not all([ip, ts, lat, lon]):
            return None
        out = df[[ip, ts, lat, lon]].copy()
        out.columns = ["ip", "timestamp", "lat", "lon"]
        return out

    # 1) Assume a header row is present.
    try:
        df = pd.read_csv(p, dtype=str)
        got = _pick(df)
        if got is not None:
            return got
    except Exception:
        # Deliberate best effort: any failure falls through to strategy 2.
        pass
    # 2) No header / auto-detected separator.
    try:
        df = pd.read_csv(p, sep=None, engine="python", dtype=str, header=None)
        if df.shape[1] >= 4:
            out = df.iloc[:, :4].copy()
            out.columns = ["ip", "timestamp", "lat", "lon"]
            # Drop a leading row that looks like a header rather than data.
            if len(out) > 0 and str(out.iloc[0, 0]).strip().lower() in ("ip", "0", "source_ip"):
                out = out.drop(out.index[0])
            return out.reset_index(drop=True)
    except Exception:
        # Deliberate best effort: an unreadable file is reported as None.
        pass
    return None

def _to_float(x):
    try:
        return float(str(x).strip())
    except Exception:
        return None

def _dedup_by_ip_with_first(df):
    """同一IPの重複を最初の1行で代表させる。"""
    return df.sort_index().drop_duplicates(subset=["ip"], keep="first")

def _save_csv_and_plot(d, df, basename_prefix):
    """Persist one day's new-IP table and show it as a lon/lat scatter plot.

    Writes ``<basename_prefix>_<YYYYMMDD>.csv`` (ip, lat, lon) into DIFF_DIR,
    saves the matching PNG into PLOT_DIR, displays the figure with
    ``plt.show()``, then prints both file names and up to SHOW_MAX IPs.

    Args:
        d: Date the rows belong to (used in file names and the title).
        df: DataFrame with at least "ip", "lat" and "lon" columns.
        basename_prefix: File-name prefix, also used as the title label.
    """
    stem = f"{basename_prefix}_{d:%Y%m%d}"
    out_csv = os.path.join(DIFF_DIR, stem + ".csv")
    out_png = os.path.join(PLOT_DIR, stem + ".png")

    df[["ip","lat","lon"]].to_csv(out_csv, index=False, encoding="utf-8")

    # The scatter needs numeric coordinates; rows without both are dropped
    # from the plot only (the CSV above keeps every row).
    coords = df.copy()
    for axis_col in ("lat", "lon"):
        coords[axis_col] = coords[axis_col].apply(_to_float)
    coords = coords.dropna(subset=["lat","lon"])

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(coords["lon"], coords["lat"], s=8)  # default colour
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title(f"{basename_prefix} {d:%Y-%m-%d} (n={len(coords)})")
    fig.tight_layout()
    fig.savefig(out_png, dpi=DPI)

    # Display the figure, then report what was written.
    plt.show()
    print(f"📄 保存CSV: {os.path.basename(out_csv)}")
    print(f"🖼 保存PNG: {os.path.basename(out_png)}")
    print("🔹 新規IPアドレス一覧:")
    total = len(df)
    if total == 0:
        print("   （該当なし）")
    else:
        for addr in df["ip"].head(SHOW_MAX):
            print("   ", addr)
        if total > SHOW_MAX:
            print(f"   ...（{total}件中{SHOW_MAX}件のみ表示）")
    print("-" * 60)

    plt.close(fig)

def _plot_from_csv(csv_path, title_prefix, d_str):
    """保存済みCSVから読んで“表示のみ”再プロット（最後の10/04以降用）。"""
    if not os.path.exists(csv_path):
        print(f"[SKIP] ファイルなし: {csv_path}")
        return
    df = pd.read_csv(csv_path)
    df["lat"] = df["lat"].apply(_to_float)
    df["lon"] = df["lon"].apply(_to_float)
    df = df.dropna(subset=["lat","lon"])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(df["lon"], df["lat"], s=8)
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title(f"{title_prefix} {d_str} (n={len(df)})")
    fig.tight_layout()
    plt.show()
    print(f"📄 使用CSV: {os.path.basename(csv_path)}")
    print("🔹 IPアドレス一覧（最大50件）:")
    if len(df) > 0:
        for ip in df["ip"].head(50):
            print("   ", ip)
        if len(df) > 50:
            print(f"   ...（{len(df)}件中50件のみ表示）")
    else:
        print("   （該当なし）")
    print("-" * 60)
    plt.close(fig)

def main():
    """Compute, save and display the daily new-IP sets for the whole period.

    Steps:
      1. Extract the archive if needed and load every CSV under EXTRACTED.
      2. For each date, compute the IPs never seen on any earlier date
         ("new_vs_allprior") and the IPs absent from the previous date that
         has data ("new_vs_prev"), then save and plot both.
      3. Write a per-day summary CSV and print its last five rows.
      4. Re-plot (display only) the saved CSVs for dates >= DATE_FROM.

    Side effect: the sorted list of dates is published as the module-level
    global ``dates``.
    """
    ensure_extracted()

    # ---- Load every CSV ----
    rows = []
    for root, dirs, files in os.walk(EXTRACTED):
        # Sorting in place fixes the traversal order, and with it which
        # lat/lon row keep="first" retains for a duplicated (date, ip).
        dirs.sort()
        for f in sorted(files):
            if not f.lower().endswith(".csv"):
                continue
            p = os.path.join(root, f)
            d = read_csv_flex(p)
            if d is None or d.empty:
                print(f"[WARN] 読み込み失敗/空: {p}")
                continue
            # Drop missing IPs before astype(str); otherwise NaN becomes the
            # literal string "nan" and survives the later dropna.
            d = d.dropna(subset=["ip"]).copy()
            d["ip"] = d["ip"].astype(str).str.strip()
            if STRICT_IPV4:
                is_ipv4 = d["ip"].map(lambda x: bool(_ipv4_pat.match(x))).astype(bool)
                # .copy() so the column assignments below act on an
                # independent frame, not a slice (no SettingWithCopyWarning).
                d = d[is_ipv4].copy()
            if d.empty:
                continue
            d["date"] = d["timestamp"].apply(lambda v: parse_date(v, filename=f))
            d = d.dropna(subset=["ip","date"]).copy()
            d["source_file"] = f
            if not d.empty:
                rows.append(d[["ip","date","lat","lon","source_file"]])

    if not rows:
        print("[ERROR] CSVが見つからないか、有効データがありません。")
        return

    df_all = pd.concat(rows, ignore_index=True)
    # One row per (date, ip); the first lat/lon encountered is kept.
    df_all = df_all.sort_index().drop_duplicates(subset=["date","ip"], keep="first")

    # ---- Per-day differences ----
    global dates  # kept as a module-level name, as in the original script
    dates = sorted([d for d in df_all["date"].unique() if pd.notna(d)])
    if not dates:
        print("[ERROR] 有効な日付が抽出できませんでした。")
        return

    seen_until_prev = set()   # every IP seen on any earlier date
    prev_day = set()          # IPs of the previous date present in the data
    summary = []

    for i, d in enumerate(dates):
        day_all = df_all.loc[df_all["date"] == d, ["ip","lat","lon"]]
        day_ips = set(day_all["ip"].unique())

        new_all_set  = day_ips - seen_until_prev
        new_prev_set = day_ips - prev_day if i > 0 else day_ips

        new_all_df = _dedup_by_ip_with_first(day_all[day_all["ip"].isin(new_all_set)])
        new_prev_df = _dedup_by_ip_with_first(day_all[day_all["ip"].isin(new_prev_set)])

        # Save CSV + PNG and display immediately.
        _save_csv_and_plot(d, new_all_df, "new_vs_allprior")
        _save_csv_and_plot(d, new_prev_df, "new_vs_prev")

        summary.append({
            "date": d,
            "count_today": len(day_ips),
            "count_seen_until_prev": len(seen_until_prev),
            "new_vs_allprior": len(new_all_set),
            "new_vs_prev": len(new_prev_set),
        })

        seen_until_prev |= day_ips
        prev_day = day_ips

    summary_df = pd.DataFrame(summary).sort_values("date")
    summary_df.to_csv(os.path.join(DIFF_DIR, "summary_daily_new_ips.csv"),
                      index=False, encoding="utf-8")

    print("\n=== サマリー（末尾5件）===")
    print(summary_df.tail(5).to_string(index=False))
    latest = summary_df["date"].max()
    print(f"\n最新日: {latest}")

    # ---- Re-plot (display only) every date on or after DATE_FROM ----
    target_dates = [d for d in dates if d >= DATE_FROM]
    # Build the banner from DATE_FROM so it stays correct if the bound is
    # changed; with the default it still reads "10月4日以降".
    print(f"\n=== {DATE_FROM.month}月{DATE_FROM.day}日以降の再プロット対象日数: {len(target_dates)}日 ===")
    for d in target_dates:
        d_str = d.strftime("%Y-%m-%d")
        allprior_csv = os.path.join(DIFF_DIR, f"new_vs_allprior_{d:%Y%m%d}.csv")
        prev_csv     = os.path.join(DIFF_DIR, f"new_vs_prev_{d:%Y%m%d}.csv")
        _plot_from_csv(allprior_csv, "new_vs_allprior", d_str)
        _plot_from_csv(prev_csv,     "new_vs_prev",     d_str)

# Script entry point: run the full pipeline only when executed directly.
if __name__ == "__main__":
    main()
