In [1]:
# 前日 vs 今日：新規出現IP差分検出
import os, re, zipfile, pandas as pd, numpy as np
from datetime import datetime, timedelta, timezone
import matplotlib.pyplot as plt

# Resolve the Asia/Tokyo time zone.  zoneinfo may be missing (older Python) or
# may have no time-zone database to read from (ZoneInfoNotFoundError, a
# KeyError subclass).  In either case fall back to a fixed UTC+9 offset.
# Catch Exception rather than using a bare ``except`` so that
# KeyboardInterrupt / SystemExit are not swallowed.
try:
    from zoneinfo import ZoneInfo
    tz_tokyo = ZoneInfo("Asia/Tokyo")
except Exception:
    tz_tokyo = timezone(timedelta(hours=9))

# Input archive and output locations, all relative to the working directory.
ZIP_PATH = "isolation_forest.zip"
OUT_DIR = "isoforest_out"
EXTRACTED = os.path.join(OUT_DIR, "_extracted")

# Make sure both output directories exist (creating EXTRACTED also creates OUT_DIR).
for _needed_dir in (EXTRACTED, OUT_DIR):
    os.makedirs(_needed_dir, exist_ok=True)

# ===== CSV抽出 =====
def ensure_extracted():
    """Extract ZIP_PATH into EXTRACTED unless a CSV file is already there.

    EXTRACTED is walked recursively.  If any file name ends with ".csv"
    (case-insensitive) nothing is done.  Otherwise, when ZIP_PATH exists, the
    whole archive is extracted into EXTRACTED.  Returns None.
    """
    for _root, _dirs, names in os.walk(EXTRACTED):
        for name in names:
            if name.lower().endswith(".csv"):
                return  # a CSV is already present; skip extraction
    if not os.path.exists(ZIP_PATH):
        return
    with zipfile.ZipFile(ZIP_PATH, "r") as archive:
        archive.extractall(EXTRACTED)

# ===== timestampを柔軟に解釈 =====
def parse_date(val, filename=None):
    """Convert a raw timestamp value to a ``datetime.date``.

    Parameters
    ----------
    val : object
        Raw timestamp cell (string, number, None or NaN).
    filename : str or None
        Name or path of the file the value came from.  Used only when ``val``
        is missing or blank: a leading ``YYYYMMDD`` in the base name supplies
        the date.

    Returns
    -------
    datetime.date or None
        The parsed date, or None when neither the value nor the file name
        yields a valid date.
    """
    if pd.isna(val) or not str(val).strip():
        # No usable timestamp: take the date from the start of the file name
        # (e.g. "20251028...").  A longer prefix such as YYYYMMDDhhmmss starts
        # with the same eight digits, so only those are parsed.
        if not filename:
            return None
        m = re.match(r"\d{8}", os.path.basename(filename))
        if m is None:
            return None
        try:
            return datetime.strptime(m.group(0), "%Y%m%d").date()
        except ValueError:
            # Eight digits that are not a real calendar date.
            return None

    # A numeric column that also contains NaN is read as float, so a value
    # like 20251014123000 arrives as 20251014123000.0; drop the ".0" so the
    # compact format below still matches.
    if isinstance(val, float) and val.is_integer():
        val = int(val)

    s = str(val).strip()
    for fmt in ("%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S",
                "%Y%m%d%H%M%S", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(s, fmt).date()
        except ValueError:
            continue

    # pandas parser fallback for any other layout (date-only strings, etc.).
    try:
        dt = pd.to_datetime(s, errors="coerce")
    except (ValueError, TypeError, OverflowError):
        return None
    return None if pd.isna(dt) else dt.date()

# ===== 全CSVロード =====
ensure_extracted()
dfs = []
for root, _, files in os.walk(EXTRACTED):
    for f in sorted(files):
        if not f.lower().endswith(".csv"):
            continue
        p = os.path.join(root, f)
        try:
            # Read every value as text so IPs and compact numeric timestamps
            # keep their exact spelling.
            raw = pd.read_csv(p, header=None, dtype=str)
        except (pd.errors.EmptyDataError, pd.errors.ParserError,
                UnicodeDecodeError, OSError) as e:
            # One bad file should not abort the whole load.
            print(f"Failed to read {p}: {e}")
            continue
        if raw.shape[1] < 4:
            continue
        # Copy the slice so the column assignments below act on an
        # independent frame, not on a view of ``raw``.
        df = raw.iloc[:, :4].copy()
        df.columns = ["ip", "timestamp", "lat", "lon"]
        df["ip"] = df["ip"].astype(str).str.strip()
        df["date"] = df["timestamp"].apply(lambda v: parse_date(v, filename=f))
        df["source_file"] = f
        dfs.append(df[["ip", "date", "source_file"]])

if not dfs:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(
        f"No usable CSV files (at least 4 columns) found under {EXTRACTED!r}; "
        f"check that {ZIP_PATH!r} exists and contains CSV files."
    )
df_all = pd.concat(dfs, ignore_index=True)

# ===== 今日と前日を決定 =====
# Distinct non-null dates found in the data, oldest first.
all_dates = [d for d in df_all["date"].unique() if pd.notna(d)]
all_dates.sort()
if len(all_dates) < 2:
    raise SystemExit("日付が2日分以上必要です。")

# "today" is the latest date present in the data and "yest" the one just
# before it in the sorted list (not necessarily the calendar day before).
yest, today = all_dates[-2:]

print("比較対象：")
print(f"  今日:   {today}")
print(f"  前日:   {yest}")

# ===== 差分判定 =====
date_col = df_all["date"]
ip_col = df_all["ip"]

# Unique IPs seen on the latest date, on any earlier date, and on the previous date.
ip_today = set(ip_col[date_col == today].unique())
ip_before = set(ip_col[date_col < today].unique())
ip_yest = set(ip_col[date_col == yest].unique())

# New today: present on the latest date and on no earlier date at all.
ip_new_today = sorted(ip_today.difference(ip_before))

print(f"\n前日件数: {len(ip_yest)}")
print(f"今日件数: {len(ip_today)}")
print(f"今日新規IP: {len(ip_new_today)}")

# Save the full list to CSV, then show the first 50 entries.
out_csv = os.path.join(OUT_DIR, "diff_new_today_ips.csv")
new_ip_frame = pd.DataFrame({"ip": ip_new_today})
new_ip_frame.to_csv(out_csv, index=False, encoding="utf-8")

print(f"保存: {out_csv}")
print("\n=== 今日新規IP（先頭50件） ===")
for ip in ip_new_today[:50]:
    print(ip)


比較対象：
  今日:   2025-10-14
  前日:   2025-10-13

前日件数: 1150
今日件数: 1153
今日新規IP: 5
保存: isoforest_out\diff_new_today_ips.csv

=== 今日新規IP（先頭50件） ===
102.211.56.20
178.29.155.156
198.167.206.210
198.98.51.249
77.239.97.46


In [3]:
# 全期間：各日ごとの「初出（全過去比）」と「前日比」新規IPを算出・保存・表示
import os, re, zipfile, pandas as pd
from datetime import datetime

# Input archive and output tree, all relative to the working directory.
ZIP_PATH = "isolation_forest.zip"
OUT_DIR = "isoforest_out"
EXTRACTED = os.path.join(OUT_DIR, "_extracted")
DIFF_DIR = os.path.join(OUT_DIR, "diff_daily")

# Create the output root and both sub-directories if they do not exist yet.
for _needed_dir in (OUT_DIR, EXTRACTED, DIFF_DIR):
    os.makedirs(_needed_dir, exist_ok=True)

def ensure_extracted():
    """Unpack ZIP_PATH into EXTRACTED when no CSV has been extracted yet.

    Looks recursively under EXTRACTED for any file whose name ends with
    ".csv" (case-insensitive).  Only if none is found and ZIP_PATH exists is
    the archive extracted into EXTRACTED.  Returns None.
    """
    csv_found = False
    for _, _, files in os.walk(EXTRACTED):
        if any(name.lower().endswith(".csv") for name in files):
            csv_found = True
            break
    if csv_found or not os.path.exists(ZIP_PATH):
        return
    with zipfile.ZipFile(ZIP_PATH, "r") as zf:
        zf.extractall(EXTRACTED)

def parse_date(val, filename=None):
    """Convert a raw timestamp value to a ``datetime.date``.

    Parameters
    ----------
    val : object
        Raw timestamp cell (string, number, None or NaN).
    filename : str or None
        Name or path of the source file.  Used only when ``val`` is missing
        or blank: a leading ``YYYYMMDD`` in the base name supplies the date.

    Returns
    -------
    datetime.date or None
        The parsed date, or None when no valid date can be derived.
    """
    if pd.isna(val) or not str(val).strip():
        if filename:
            # Only the first eight digits matter: a YYYYMMDDhhmmss prefix
            # carries the same date in its leading YYYYMMDD.
            m = re.match(r"\d{8}", os.path.basename(filename))
            if m:
                try:
                    return datetime.strptime(m.group(0), "%Y%m%d").date()
                except ValueError:
                    # Eight digits that do not form a real calendar date.
                    return None
        return None

    # A whole-number float (numeric column that also holds NaN) would
    # stringify as "...0" and miss the compact format; convert it to int first.
    if isinstance(val, float) and val.is_integer():
        val = int(val)

    s = str(val).strip()
    known_formats = (
        "%Y-%m-%d %H:%M:%S",
        "%Y/%m/%d %H:%M:%S",
        "%Y%m%d%H%M%S",
        "%Y-%m-%dT%H:%M:%S",
    )
    for fmt in known_formats:
        try:
            return datetime.strptime(s, fmt).date()
        except ValueError:
            continue

    # Last resort: let pandas infer the layout; unparseable text becomes NaT.
    try:
        dt = pd.to_datetime(s, errors="coerce")
    except (ValueError, TypeError, OverflowError):
        return None
    return None if pd.isna(dt) else dt.date()

def read_csv_flex(p):
    """Read one CSV and return it with columns ``ip, timestamp, lat, lon``.

    The file is read without a header row and with every value kept as text.
    Two layouts are recognised:

    * Headed file: when the first row holds a recognisable name for each of
      the four fields (case-insensitive substring match: "ip";
      "time"/"timestamp"/"date"; "lat"; "lon"/"lng"), the columns are picked
      by name, in any order, and the header row is dropped.
    * Positional file: otherwise the first four columns are taken as
      ip, timestamp, lat, lon.  A first row whose first cell is "ip" or "0"
      is treated as a header row and dropped.

    Parameters
    ----------
    p : str
        Path of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        Frame with exactly the four columns and a fresh 0..n-1 index, or None
        when the file cannot be read or has fewer than four columns.
    """
    wanted = ["ip", "timestamp", "lat", "lon"]
    keywords = {
        "ip": ("ip",),
        "timestamp": ("time", "timestamp", "date"),
        "lat": ("lat",),
        "lon": ("lon", "lng"),
    }

    try:
        df = pd.read_csv(p, header=None, dtype=str)
    except Exception as e:
        print(f"Failed to read {p}: {e}")
        return None

    if df.shape[1] < 4:
        # Four distinct fields cannot come from fewer than four columns, and
        # assigning four names to a narrower frame would raise ValueError.
        print(f"Skipped {p}: expected at least 4 columns, found {df.shape[1]}")
        return None

    if len(df) > 0:
        # Try to interpret the first row as a header and map fields by name.
        header = ["" if pd.isna(v) else str(v).strip().lower()
                  for v in df.iloc[0].tolist()]
        picked = []
        for want in wanted:
            idx = next(
                (i for i, name in enumerate(header)
                 if i not in picked and any(k in name for k in keywords[want])),
                None,
            )
            if idx is None:
                break
            picked.append(idx)
        if len(picked) == len(wanted):
            out = df.iloc[1:, picked].copy()
            out.columns = wanted
            return out.reset_index(drop=True)

    # Positional layout: first four columns in the fixed order.
    df = df.iloc[:, :4].copy()
    df.columns = wanted
    if len(df) > 0 and str(df.iloc[0, 0]).strip().lower() in ("ip", "0"):
        # The first row is a header line ("ip,..." or a "0,1,2,3" label row).
        df = df.drop(df.index[0])
    return df.reset_index(drop=True)

# 1) 読み込み
ensure_extracted()
rows = []
for root, _, files in os.walk(EXTRACTED):
    for f in sorted(files):
        if not f.lower().endswith(".csv"):
            continue
        p = os.path.join(root, f)
        d = read_csv_flex(p)
        if d is None or d.empty:
            continue
        d["ip"] = d["ip"].astype(str).str.strip()
        d["date"] = d["timestamp"].apply(lambda v: parse_date(v, filename=f))
        d["source_file"] = f
        rows.append(d[["ip", "date", "source_file"]])

if not rows:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(
        f"No usable CSV files found under {EXTRACTED!r}; "
        f"check that {ZIP_PATH!r} exists and contains CSV files."
    )
df_all = pd.concat(rows, ignore_index=True)

# 2) 日付ごとの差分
# Distinct non-null dates, oldest first.
dates = sorted(d for d in df_all["date"].unique() if pd.notna(d))
seen_until_prev = set()   # every IP seen on any earlier date
prev_day = set()          # IPs seen on the immediately preceding date in ``dates``
summary = []
summary_cols = ["date", "count_today", "count_seen_until_prev",
                "new_vs_allprior", "new_vs_prev"]

for d in dates:
    ips = set(df_all.loc[df_all["date"] == d, "ip"].unique())
    new_all = sorted(ips - seen_until_prev)
    # On the first date prev_day is empty, so every IP counts as new.
    new_prev = sorted(ips - prev_day)

    pd.DataFrame({"ip": new_all}).to_csv(
        os.path.join(DIFF_DIR, f"new_vs_allprior_{d:%Y%m%d}.csv"),
        index=False, encoding="utf-8")
    pd.DataFrame({"ip": new_prev}).to_csv(
        os.path.join(DIFF_DIR, f"new_vs_prev_{d:%Y%m%d}.csv"),
        index=False, encoding="utf-8")

    summary.append({"date": d, "count_today": len(ips),
                    "count_seen_until_prev": len(seen_until_prev),
                    "new_vs_allprior": len(new_all), "new_vs_prev": len(new_prev)})

    seen_until_prev |= ips
    prev_day = ips

# Passing the column list keeps the frame well-formed when no date could be
# parsed; otherwise sort_values("date") raises KeyError on an empty frame.
summary_df = pd.DataFrame(summary, columns=summary_cols).sort_values("date")
summary_df.to_csv(os.path.join(DIFF_DIR, "summary_daily_new_ips.csv"), index=False, encoding="utf-8")
summary_df


Unnamed: 0,date,count_today,count_seen_until_prev,new_vs_allprior,new_vs_prev
0,2025-10-01,6,0,6,6
1,2025-10-02,30,6,30,30
2,2025-10-03,1114,36,1113,1113
3,2025-10-04,1127,1149,12,20
4,2025-10-05,1131,1161,9,12
5,2025-10-06,1124,1170,4,9
6,2025-10-07,1130,1174,8,14
7,2025-10-08,1136,1182,11,16
8,2025-10-09,1135,1193,4,6
9,2025-10-10,1149,1197,18,22
