In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

# ===== 設定 =====
ZIP_PATH = "./isolation_forest.zip"
EXTRACT_DIR = "./isolation_forest_all"

# ===== 展開 =====
os.makedirs(EXTRACT_DIR, exist_ok=True)
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(EXTRACT_DIR)

# ===== CSV列挙 =====
csv_files = []
for root, _, files in os.walk(EXTRACT_DIR):
    for f in files:
        if f.lower().endswith(".csv"):
            csv_files.append(os.path.join(root, f))
csv_files.sort()
print(f"発見CSV: {len(csv_files)}")

def detect_lat_lon_columns(columns):
    lat = next((c for c in columns if "lat" in c.lower()), None)
    lon = next((c for c in columns if ("lon" in c.lower()) or ("lng" in c.lower()) or ("longi" in c.lower())), None)
    return lat, lon

for path in csv_files:
    try:
        df = pd.read_csv(path)
    except Exception as e:
        print(f"[SKIP] 読み込み失敗 {path}: {e}")
        continue

    lat_col, lon_col = detect_lat_lon_columns(df.columns)
    if not lat_col or not lon_col:
        print(f"[SKIP] {os.path.basename(path)}: 緯度経度列なし -> {list(df.columns)}")
        continue

    df = df.copy()
    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce")
    df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce")
    df = df.dropna(subset=[lat_col, lon_col])
    df = df[(df[lat_col].between(-90, 90)) & (df[lon_col].between(-180, 180))]
    if len(df) < 5:
        print(f"[SKIP] {os.path.basename(path)}: 有効行が少ない (n={len(df)})")
        continue

    X = df[[lon_col, lat_col]].to_numpy()

    # n_neighborsはデータ量に応じて安全に設定
    n = len(X)
    n_neighbors = max(5, min(35, int(n * 0.03)))  # だいたい全体の3%（下限5、上限35）
    if n_neighbors >= n:
        n_neighbors = max(5, n - 1)

    # ===== LOF =====
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination="auto", novelty=False)
    y_pred = lof.fit_predict(X)              # 1=inlier, -1=outlier
    scores = lof.negative_outlier_factor_    # 小さい(より負)ほど異常

    is_out = (y_pred == -1)
    inliers = (~is_out)

    fname = os.path.basename(path).replace(".csv", "")
    print(f"\n📂 {fname}")
    print(f"  n={n}  inliers={inliers.sum()}  outliers={is_out.sum()}  n_neighbors={n_neighbors}")
    print(f"  score(neg_of): mean_inlier={scores[inliers].mean():.4f}  mean_outlier={scores[is_out].mean():.4f}")
    print(f"  score range: min={scores.min():.4f}  max={scores.max():.4f}")

    # ===== プロット（1枚/ファイル）=====
    plt.figure(figsize=(10, 5))
    plt.scatter(X[inliers, 0], X[inliers, 1], s=8,  alpha=0.6, label="inlier")
    plt.scatter(X[is_out, 0],  X[is_out, 1],  s=40, alpha=0.9, marker="x", linewidths=1.0, label="outlier")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title(f"Local Outlier Factor (LOF) - {fname}")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


FileNotFoundError: [Errno 2] No such file or directory: './isolation_forest.zip'