In [2]:
# === Anomaly Detection for AiR 12305 ===
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler

# --- 1. Load features ---
# Feature table for AiR 12305, read from the processed-data folder.
features_path = "../../data/processed/air_12305_features_full.parquet"
df = pd.read_parquet(features_path)

# The timestamp column is optional. When it exists, parse it to datetime;
# values that cannot be parsed become NaT instead of raising.
has_time_column = 'time' in df.columns
if has_time_column:
    df['time'] = pd.to_datetime(df['time'], errors='coerce')

# --- 2. Filter ON-cycles ---
# Keep only rows recorded while the unit was running. `.copy()` gives an
# independent frame so the anomaly columns added later do not touch `df`.
df_on = df[df['is_running'] == 1].copy()
print("Data shape (ON cycles):", df_on.shape)

# Frames above 100k rows are reduced to the 50k most recent rows plus a
# seeded 50k random sample, which keeps model fitting tractable.
if len(df_on) > 100_000:
    if 'time' in df_on.columns:
        # na_position="first" keeps NaT timestamps (created by
        # errors='coerce' when the column is parsed) out of the tail; with the
        # default "last" they would be selected as the "most recent" rows.
        recent = df_on.sort_values("time", na_position="first").tail(50_000)
    else:
        # The timestamp column is optional; without it, fall back to the
        # frame's existing row order.
        recent = df_on.tail(50_000)
    random_sample = df_on.sample(50_000, random_state=42)
    # drop_duplicates() compares full row values, so a row drawn into both
    # subsets is kept once.
    df_on = pd.concat([recent, random_sample]).drop_duplicates().reset_index(drop=True)
    print(f"Sampled down to {len(df_on)} rows for anomaly training")

# --- 3. Select useful features ---
# Exclude the timestamp, identifier and running-state columns, then keep only
# the numeric columns that remain.
feature_cols = [
    c for c in df_on.columns
    if c not in ["time","id","air","device","is_running","is_running_gmm"]
]
# Infinite values are turned into NaN before imputation: fillna() does not
# touch +/-inf, and StandardScaler raises a ValueError on non-finite input.
# Missing values are then imputed with 0.
X = (
    df_on[feature_cols]
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Standardise every feature column; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 4. ocsvm (slow) ---
print("\nRunning One-Class SVM anomaly detection...")

# Train on a smaller subset (avoid huge memory/time cost). The subset is drawn
# from a seeded generator so repeated runs fit the same model, in line with
# the random_state=42 used by the other stochastic steps in this cell.
sample_size = min(50_000, len(X_scaled))
svm_rng = np.random.default_rng(42)
X_svm_sample = X_scaled[svm_rng.choice(len(X_scaled), size=sample_size, replace=False)]

# nu ~ expected anomaly fraction (it upper-bounds the share of training
# points treated as outliers).
ocsvm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.02)
ocsvm.fit(X_svm_sample)

# Predict for all points, one fixed-size chunk at a time, and join the
# per-chunk label arrays into a single array aligned with X_scaled.
chunk_size = 20_000
svm_preds = np.concatenate([
    ocsvm.predict(X_scaled[start:start + chunk_size])
    for start in range(0, len(X_scaled), chunk_size)
])

# predict() labels outliers as -1; store them as a 0/1 flag column.
df_on["anomaly_ocsvm"] = (svm_preds == -1).astype(int)
print("✅ One-Class SVM finished:", df_on["anomaly_ocsvm"].value_counts().to_dict())
# --- 5. Isolation Forest (fast & robust) ---
# 150 trees, 256 samples per tree, 2% contamination, fixed seed, n_jobs=-1.
iso = IsolationForest(
    n_estimators=150, max_samples=256, contamination=0.02,
    random_state=42, n_jobs=-1,
)
# Fit on the full scaled matrix, then label the same rows.
iso.fit(X_scaled)
iforest_labels = iso.predict(X_scaled)
# A label of -1 marks an outlier; store it as a 0/1 flag column.
df_on["anomaly_iforest"] = (iforest_labels == -1).astype(int)

# --- 6. Improved Heuristic Anomalies ---
# Each rule that applies contributes one boolean mask; every mask is then
# written into the 0/1 flag column, so a row is flagged if any rule hits.
heuristic_masks = []

# a) Temperature in either tail (above the 99th or below the 1st percentile)
if "cooltemp_degree-celsius" in df_on.columns:
    temp = df_on["cooltemp_degree-celsius"]
    t_high = temp.quantile(0.99)
    t_low = temp.quantile(0.01)
    heuristic_masks.append((temp > t_high) | (temp < t_low))

# b) Current imbalance above its 99.5th percentile
if "current_imbalance" in df_on.columns:
    imbalance = df_on["current_imbalance"]
    imb_thresh = imbalance.quantile(0.995)
    heuristic_masks.append(imbalance > imb_thresh)

# c) Power factor below 0.5
if "pftot_none" in df_on.columns:
    heuristic_masks.append(df_on["pftot_none"] < 0.5)

# d) Voltage or battery reading outside its own 0.5th-99.5th percentile band
for col in ["va_volt","vb_volt","vc_volt","vbat_volt"]:
    if col not in df_on.columns:
        continue
    reading = df_on[col]
    high, low = reading.quantile(0.995), reading.quantile(0.005)
    heuristic_masks.append((reading > high) | (reading < low))

# Start from "no anomaly" and set the flag wherever a rule matched.
df_on["anomaly_heuristic"] = 0
for rule_mask in heuristic_masks:
    df_on.loc[rule_mask, "anomaly_heuristic"] = 1

# --- 7. Combine votes ---
# The three detector columns hold 0/1 flags, so their row-wise sum is the
# number of detectors that marked the row as anomalous.
vote_cols = ['anomaly_iforest','anomaly_ocsvm','anomaly_heuristic']
df_on['anomaly_votes'] = df_on[vote_cols].sum(axis=1)

print("✅ Anomalies computed successfully.")
# Preview the individual flags next to the combined vote count.
print(df_on[[*vote_cols, 'anomaly_votes']].head())

# --- 8. Save anomaly results ---
# Write the ON-cycle rows, including the anomaly columns added above, to
# parquet; the DataFrame index is not stored.
anomalies_path = "../../data/processed/air_12305_anomalies.parquet"
df_on.to_parquet(anomalies_path, index=False)
print("✅ Saved anomaly dataset.")


Data shape (ON cycles): (398286, 42)
Sampled down to 93747 rows for anomaly training

Running One-Class SVM anomaly detection...
✅ One-Class SVM finished: {0: 91934, 1: 1813}
✅ Anomalies computed successfully.
   anomaly_iforest  anomaly_ocsvm  anomaly_heuristic  anomaly_votes
0                0              0                  0              0
1                0              0                  0              0
2                0              0                  0              0
3                0              0                  0              0
4                0              0                  0              0
✅ Saved anomaly dataset.
