In [None]:
# evaluate_model.ipynb

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import classification_report, confusion_matrix

# --------- Paths ---------
# Both directories are addressed relative to the current working directory,
# one level up from wherever this notebook/script is run.
DATA_DIR = os.path.join("..", "data")
MODEL_DIR = os.path.join("..", "models")

# --------- Load Data + Model ---------
billing_csv_path = os.path.join(DATA_DIR, "dummy_billing_dataset.csv")
model_path = os.path.join(MODEL_DIR, "anomaly_model.pkl")

# "month" is parsed to datetime so later per-customer sorting is chronological.
df = pd.read_csv(billing_csv_path, parse_dates=["month"])

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only point this at model files produced by a trusted pipeline.
model = joblib.load(model_path)

# --------- Feature Engineering (same as training) ---------
def add_features(df):
    """Return a sorted copy of *df* with the engineered billing features added.

    The input must provide the columns ``customer_id``, ``month``,
    ``consumer_category``, ``consumption_kwh`` and ``billed_kwh``.  The input
    frame itself is not modified; the result is ordered by
    ``customer_id`` then ``month`` and keeps the original index labels.

    Added (or overwritten) columns:
        ratio           billed_kwh / (consumption_kwh + 1); the +1 keeps a
                        zero-consumption row from dividing by zero.
        monthly_change  change in consumption_kwh versus the same customer's
                        previous row in the sorted frame (0 for the first row).
        cat_dev         consumption_kwh minus the mean consumption_kwh of the
                        row's consumer_category *within this frame*.
        billing_gap     consumption_kwh minus billed_kwh.

    Note that ``monthly_change`` and ``cat_dev`` depend on the other rows
    present in *df*, not only on the row itself.
    """
    ordered = df.sort_values(["customer_id", "month"])
    usage = ordered["consumption_kwh"]
    billed = ordered["billed_kwh"]

    # Per-customer difference to the previous month; a customer's first row
    # has no predecessor, so its NaN is replaced by 0.
    change_vs_prev = ordered.groupby("customer_id")["consumption_kwh"].diff().fillna(0)

    # Category mean broadcast back to every row of that category.
    category_mean = ordered.groupby("consumer_category")["consumption_kwh"].transform("mean")

    return ordered.assign(
        ratio=billed / (usage + 1),
        monthly_change=change_vs_prev,
        cat_dev=usage - category_mean,
        billing_gap=usage - billed,
    )

df = add_features(df)

# Columns fed to the model, in this order.
# NOTE(review): presumably this must match the column order used at training
# time -- confirm against the training script.
features = [
    "consumption_kwh",
    "billed_kwh",
    "ratio",
    "monthly_change",
    "cat_dev",
    "billing_gap",
]

# --------- Evaluate Scores ---------
# Build the NaN-free input matrix once and reuse it for both model calls.
scoring_inputs = df[features].fillna(0)
df["anomaly_score"] = model.decision_function(scoring_inputs)
df["anomaly_label"] = model.predict(scoring_inputs)  # -1 anomaly, 1 normal

# --------- Plot Score Distribution ---------
# Histogram (with KDE overlay) of the anomaly scores, plus a marker at the
# 5th percentile of the observed scores.
# NOTE(review): the marker is an empirical quantile of this dataset, not
# necessarily the threshold the model itself uses in predict() -- confirm
# before reading it as the decision boundary.
score_cutoff = df["anomaly_score"].quantile(0.05)

score_fig, score_ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["anomaly_score"], bins=50, kde=True, ax=score_ax)
score_ax.axvline(score_cutoff, color="red", linestyle="--", label="5% cutoff")
score_ax.set_title("Anomaly Score Distribution")
score_ax.legend()
plt.show()

# --------- Inject Fake Fraud (Ground Truth for Metrics) ---------
# Synthetic fraud rows are created by tampering with a random sample of real
# rows: consumption is multiplied by 10 and billed energy by 0.1.
#
# The engineered features must be rebuilt *in the context of the full
# dataset*.  add_features() derives monthly_change from the same customer's
# previous month and cat_dev from the category mean of the frame it is given;
# running it on the sampled rows alone would diff unrelated sampled months
# against each other and measure cat_dev against the sample's own inflated
# mean, so the injected rows would not be featurised like the rest of the data.

# Cap the sample size so a dataset with fewer than 50 rows does not make
# DataFrame.sample raise.
n_fake = min(50, len(df))

# Work on a positionally re-indexed copy so the sampled labels are guaranteed
# unique; sampling with the same seed selects the same rows as sampling df.
tampered = df.reset_index(drop=True)
fake_idx = tampered.sample(n_fake, random_state=42).index
is_fake = tampered.index.isin(fake_idx)

# Scale only the sampled rows; every other row is multiplied by 1.
tampered["consumption_kwh"] = tampered["consumption_kwh"] * np.where(is_fake, 10, 1)   # extreme high usage
tampered["billed_kwh"] = tampered["billed_kwh"] * np.where(is_fake, 0.1, 1.0)          # very low billing (under-reported)

# Recompute features with the tampered rows sitting among their real
# neighbours (the category means therefore include the tampered rows), then
# keep only the tampered rows.  add_features preserves index labels.
tampered = add_features(tampered)
fake = tampered.loc[fake_idx].copy()

# Score the injected rows; build the NaN-free input matrix once.
fake_inputs = fake[features].fillna(0)
fake["anomaly_score"] = model.decision_function(fake_inputs)
fake["anomaly_label"] = model.predict(fake_inputs)

# Assign true labels
df["true_label"] = 1   # 1 = normal
fake["true_label"] = -1 # -1 = fraud

# Merge real + fake (the untampered originals of the sampled rows stay in df
# as normal examples).
eval_df = pd.concat([df, fake], ignore_index=True)

# --------- Metrics ---------
# Compare the model's labels with the ground truth built above
# (-1 = injected fraud / predicted anomaly, 1 = normal).
y_true = eval_df["true_label"]
y_pred = eval_df["anomaly_label"]

# Pin one explicit label order for both reports.  Without it,
# classification_report infers the labels from the data, and the
# target_names would silently attach to the wrong class (or mismatch in
# count) if a label were ever absent.
label_order = [-1, 1]
label_names = ["Fraud (-1)", "Normal (1)"]

print("📊 Confusion Matrix:")
# Rows are true labels, columns are predicted labels, both in label_order.
print(confusion_matrix(y_true, y_pred, labels=label_order))

print("\n📈 Classification Report:")
print(classification_report(y_true, y_pred, labels=label_order, target_names=label_names))


✅ Model saved to C:\Users\Hp\WattAudit++\models\anomaly_model.pkl
✅ Top 50 suspicious customers saved to C:\Users\Hp\WattAudit++\data\top50_suspicious_customers.csv
