In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Input archive and the directory it is unpacked into. Both are relative to
# the current working directory (the former /mnt/data prefix was removed).
ZIP_PATH = "./isolation_forest.zip"
EXTRACT_DIR = "./isolation_forest_all"

# 1) Unpack the ZIP archive
os.makedirs(EXTRACT_DIR, exist_ok=True)
with zipfile.ZipFile(ZIP_PATH, mode="r") as archive:
    archive.extractall(path=EXTRACT_DIR)

# 2) Collect every CSV file below the extraction directory.
#    The extension test is case-insensitive; the result is sorted by path.
csv_files = sorted(
    os.path.join(dirpath, filename)
    for dirpath, _subdirs, filenames in os.walk(EXTRACT_DIR)
    for filename in filenames
    if filename.lower().endswith(".csv")
)
print(f"Found {len(csv_files)} CSV files.")

# Auto-detection of the latitude / longitude columns.
# Header names (lower-cased, surrounding whitespace ignored) accepted as an
# exact match for each coordinate.
_LAT_NAMES = frozenset({"latitude", "lat", "y", "lat_deg"})
_LON_NAMES = frozenset({"longitude", "lon", "lng", "x", "long", "lon_deg"})


def detect_lat_lon_columns(columns):
    """Find the latitude and longitude column labels by exact name.

    Matching is case-insensitive and ignores leading/trailing whitespace.
    When several columns qualify, the one appearing first in ``columns``
    wins. Labels that are not strings are skipped instead of raising.

    Args:
        columns: Iterable of column labels (e.g. ``df.columns``).

    Returns:
        A ``(lat_col, lon_col)`` tuple holding the original, unmodified
        labels; either element is ``None`` when no column matched.
    """
    # Materialise once so the labels can be scanned for both coordinates,
    # whatever kind of iterable was passed in.
    labels = list(columns)

    def first_match(accepted):
        for label in labels:
            if isinstance(label, str) and label.strip().lower() in accepted:
                return label
        return None

    return first_match(_LAT_NAMES), first_match(_LON_NAMES)

# 3) Apply Isolation Forest to each CSV
for path in csv_files:
    try:
        df = pd.read_csv(path)
    except Exception as e:
        # Best-effort batch run: report the unreadable file and keep going.
        print(f"[SKIP] Failed to read {path}: {e}")
        continue

    lat_col, lon_col = detect_lat_lon_columns(list(df.columns))
    # Exact-name detection failed for a coordinate: fall back to a substring
    # match and take the first column that qualifies.
    if lat_col is None:
        cand = [c for c in df.columns if "lat" in c.lower()]
        if cand:
            lat_col = cand[0]
    if lon_col is None:
        # "lon" also covers longer spellings such as "long" / "longitude".
        cand = [c for c in df.columns if "lon" in c.lower() or "lng" in c.lower()]
        if cand:
            lon_col = cand[0]
    if lat_col is None or lon_col is None:
        print(f"[SKIP] {os.path.basename(path)}: lat/lon列が見つかりません。Columns: {list(df.columns)}")
        continue

    # Coerce both columns to numbers (unparseable values become NaN), then
    # keep only rows whose coordinates lie inside the valid ranges.
    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce")
    df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce")
    df = df.dropna(subset=[lat_col, lon_col])
    df = df[(df[lat_col].between(-90, 90)) & (df[lon_col].between(-180, 180))]
    if len(df) < 5:
        print(f"[SKIP] {os.path.basename(path)}: 有効行が少なすぎます (n={len(df)}).")
        continue

    # Feature matrix with columns ordered (longitude, latitude).
    X = df[[lon_col, lat_col]].to_numpy()

    # Fit Isolation Forest and label every row
    clf = IsolationForest(
        n_estimators=200,
        contamination="auto",
        random_state=42,
        n_jobs=-1
    )
    clf.fit(X)
    pred = clf.predict(X)
    # predict() labels outliers as -1.
    is_out = (pred == -1)

    n, n_out = len(X), int(is_out.sum())
    n_in = n - n_out
    # Strip only the real extension: files are discovered case-insensitively
    # (e.g. "DATA.CSV"), and ".csv" inside the stem must be left alone.
    fname = os.path.splitext(os.path.basename(path))[0]
    print(f"[OK] {fname}: total={n}, inliers={n_in}, outliers={n_out}")

    # Plot
