In [None]:
# anomaly_detection.ipynb

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import os

# --------- Paths ---------
# `__file__` only exists when this code runs as a script/module. Inside a
# Jupyter kernel it is undefined, so fall back to the current working
# directory (assumed to be the folder holding the notebook -- TODO confirm).
try:
    _this_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _this_dir = os.getcwd()

# The project root is the parent of the folder containing this file;
# input and output CSVs live in <root>/data.
BASE_DIR = os.path.dirname(_this_dir)
DATA_DIR = os.path.join(BASE_DIR, "data")

# --------- Load dataset ---------
# Read the billing CSV from the data directory; the "month" column is parsed
# into datetimes so it can be sorted chronologically later on.
csv_path = os.path.join(DATA_DIR, "dummy_billing_dataset.csv")
df = pd.read_csv(csv_path, parse_dates=["month"])
print("Dataset loaded:", df.shape)

# --------- Feature Engineering ---------
# Order rows chronologically within each customer so the rolling windows and
# the month-over-month change below operate on consecutive records.
df = df.sort_values(["customer_id", "month"])

# Per-customer consumption series shared by every derived feature.
consumption_by_customer = df.groupby("customer_id")["consumption_kwh"]

# rolling mean & std over the current row plus up to 2 preceding rows of the
# same customer (i.e. 3 months when each customer has one row per month).
# reset_index(0, drop=True) removes the customer_id level added by the
# groupby so the result aligns with df's own index on assignment.
rolling_3 = consumption_by_customer.rolling(3, min_periods=1)
rolling_mean_3 = rolling_3.mean().reset_index(0, drop=True)
# std is NaN for a single-observation window; treat that as "no spread".
rolling_std_3 = rolling_3.std().reset_index(0, drop=True).fillna(0)

# pct change vs previous month.
# A previous value of 0 yields +/-inf, which fillna() leaves untouched and
# which IsolationForest rejects as invalid input. Map those to NaN first so
# they are filled with 0 like every other undefined change (including each
# customer's first row).
pct_change = (
    consumption_by_customer.pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

df["rolling_mean_3"] = rolling_mean_3
df["rolling_std_3"] = rolling_std_3
df["pct_change"] = pct_change

# final features
features = ["consumption_kwh", "billed_kwh", "rolling_mean_3", "rolling_std_3", "pct_change"]
# Any remaining missing values (e.g. in the raw columns) are filled with 0.
X = df[features].fillna(0)

# --------- Isolation Forest ---------
# Fit an unsupervised outlier model on the feature matrix. The seed is fixed
# so repeated runs produce the same flags and scores.
iso = IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42,
)
iso.fit(X)

# Row-level outputs: a hard label and a continuous score.
df["anomaly_flag"] = iso.predict(X)  # -1 = anomaly, 1 = normal
df["anomaly_score"] = iso.decision_function(X)  # lower = more anomalous

# --------- Aggregate anomaly scores per customer ---------
# Average the row-level scores for each customer, then rank ascending so the
# most suspicious customers come first (lower score = more anomalous).
mean_score_by_customer = df.groupby("customer_id")["anomaly_score"].mean()
customer_scores = mean_score_by_customer.reset_index().sort_values("anomaly_score")

# take top 50
top50 = customer_scores.iloc[:50]
print("Top 50 suspicious customers:")
# Only a short preview (first rows) is printed; the full list is in `top50`.
print(top50.head())

# --------- Save results ---------
# Write the ranked customers next to the input data, without the index column.
out_name = "top50_suspicious_customers.csv"
out_path = os.path.join(DATA_DIR, out_name)
top50.to_csv(path_or_buf=out_path, index=False)
print(f"✅ Saved to {out_path}")
