In [1]:
# train_model.py

import os
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import joblib

# --------- Paths ---------
BASE_DIR = os.path.abspath("..") 
DATA_DIR = os.path.join(BASE_DIR, "data")
MODEL_DIR = os.path.join(BASE_DIR, "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# --------- Load Data ---------
df = pd.read_csv(os.path.join(DATA_DIR, "dummy_billing_dataset.csv"), parse_dates=["month"])

# --------- Feature Engineering ---------
def add_features(df):
    df = df.sort_values(["customer_id", "month"])
    
    # Ratio of billed vs consumed
    df["ratio"] = df["billed_kwh"] / (df["consumption_kwh"] + 1)
    
    # Month-to-month change in consumption
    df["monthly_change"] = df.groupby("customer_id")["consumption_kwh"].diff().fillna(0)
    
    # Deviation from category average
    category_avg = df.groupby("consumer_category")["consumption_kwh"].transform("mean")
    df["cat_dev"] = df["consumption_kwh"] - category_avg
    
    # Gap between billed and consumed
    df["billing_gap"] = df["consumption_kwh"] - df["billed_kwh"]
    
    return df

df = add_features(df)

# Features for model
features = ["consumption_kwh", "billed_kwh", "ratio", "monthly_change", "cat_dev", "billing_gap"]

# --------- Train Models ---------
iso = IsolationForest(contamination=0.05, random_state=42)
df["iso_score"] = iso.fit_predict(df[features].fillna(0))
df["iso_value"] = iso.decision_function(df[features].fillna(0))

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
df["lof_score"] = lof.fit_predict(df[features].fillna(0))

# --------- Hybrid Scoring ---------
# Combine Isolation Forest & LOF + Rule-based flags
df["combined_score"] = df["iso_value"] + df["lof_score"]

# Rule-based detection: if billing ratio < 0.85 consistently
df["rule_flag"] = (df["ratio"] < 0.85).astype(int)

# Final anomaly score = AI + rules
df["final_score"] = df["combined_score"] - df["rule_flag"] * 2

# --------- Save Top 50 Suspicious Customers ---------
top50 = df.groupby("customer_id")["final_score"].mean().nsmallest(50).reset_index()
top50.to_csv(os.path.join(DATA_DIR, "top50_suspicious_customers.csv"), index=False)

# Save trained model (Isolation Forest only, LOF is unsupervised no save)
joblib.dump(iso, os.path.join(MODEL_DIR, "anomaly_model.pkl"))

print(f"✅ Model saved to {os.path.join(MODEL_DIR, 'anomaly_model.pkl')}")
print(f"✅ Top 50 suspicious customers saved to {os.path.join(DATA_DIR, 'top50_suspicious_customers.csv')}")


NameError: name '__file__' is not defined