In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from scipy.stats import ks_2samp

class PayrollAnomalyEngine:
    """Score payroll records with an Isolation Forest and a Local Outlier Factor.

    Typical use is ``fit_batch`` on historical payroll rows followed by
    ``predict_realtime`` on one new row. Both paths share
    ``engineer_features`` so that training and inference see the same columns.

    Input frames must contain the columns ``job_role``, ``base_salary``,
    ``overtime_pay``, ``total_pay`` and ``hours_worked``.
    """

    # Columns handed to the scaler and to both detectors, in this order.
    FEATURE_COLUMNS = ['base_salary', 'overtime_pay', 'ot_ratio', 'hourly_rate', 'role_avg_dev']

    def __init__(self, contamination=0.02, n_neighbors=5, random_state=42):
        """Create the (unfitted) scaler and detectors.

        Args:
            contamination: Expected share of outliers, passed to both detectors.
            n_neighbors: Neighbourhood size for the Local Outlier Factor. The
                default of 5 was chosen for a 6-row demo dataset; raise it for
                realistically sized training data.
            random_state: Seed for the Isolation Forest.
        """
        self.contamination = contamination
        self.n_neighbors = n_neighbors
        self.iso_forest = IsolationForest(
            contamination=self.contamination,
            random_state=random_state,
        )
        self.lof = LocalOutlierFactor(
            n_neighbors=self.n_neighbors,
            contamination=self.contamination,
            novelty=True,  # required so the fitted model can score unseen records
        )
        self.scaler = StandardScaler()
        # Scaled training matrix saved by fit_batch; None until then.
        self.baseline_distribution = None
        # {job_role: {'mean': ..., 'std': ...}} of total_pay, learned in training.
        self.role_stats = {}

    def engineer_features(self, df, training=False):
        """Derive the model features from raw payroll rows.

        The input frame is copied and never modified.

        Args:
            df: Payroll rows with the columns listed in the class docstring.
            training: When True, per-role mean/std of ``total_pay`` are
                recomputed from ``df`` and stored in ``self.role_stats``
                (NaN statistics, e.g. the std of a single-row role, become 1).
                When False, the previously stored statistics are used.

        Returns:
            A DataFrame with the columns in ``FEATURE_COLUMNS``.
        """
        df = df.copy()

        # The +1 in both denominators avoids division by zero when the base
        # salary or the hours worked are 0.
        df['ot_ratio'] = df['overtime_pay'] / (df['base_salary'] + 1)
        df['hourly_rate'] = df['total_pay'] / (df['hours_worked'] + 1)

        if training:
            stats = df.groupby('job_role')['total_pay'].agg(['mean', 'std']).fillna(1)
            self.role_stats = stats.to_dict('index')

        # Vectorised lookup of each row's role statistics; roles missing from
        # role_stats map to NaN here and are overwritten with 0.0 below.
        roles = df['job_role']
        mean_by_role = {role: stat['mean'] for role, stat in self.role_stats.items()}
        std_by_role = {role: stat['std'] for role, stat in self.role_stats.items()}
        role_mean = roles.map(mean_by_role).astype(float)
        role_std = roles.map(std_by_role).astype(float)

        deviation = (df['total_pay'] - role_mean) / (role_std + 1)
        is_known_role = roles.isin(list(self.role_stats))
        # Roles never seen in training carry no deviation signal.
        df['role_avg_dev'] = deviation.where(is_known_role, 0.0)

        return df[self.FEATURE_COLUMNS]

    def fit_batch(self, df):
        """Fit the scaler and both detectors on historical payroll rows.

        Also refreshes ``self.role_stats`` and stores the scaled training
        matrix in ``self.baseline_distribution``.

        Args:
            df: Historical payroll rows (see the class docstring for columns).
        """
        features = self.engineer_features(df, training=True)
        scaled_features = self.scaler.fit_transform(features)

        self.iso_forest.fit(scaled_features)
        self.lof.fit(scaled_features)

        self.baseline_distribution = scaled_features
        print("Batch training complete. Baseline established.")

    def predict_realtime(self, single_record_df):
        """Score one payroll record against the fitted detectors.

        Only the first row of ``single_record_df`` is scored. ``fit_batch``
        must have been called first.

        Args:
            single_record_df: One-row DataFrame with the same raw columns
                used for training.

        Returns:
            A dict with ``is_anomaly`` (True when either detector's decision
            score is negative), the two scores rounded to 4 decimals, and a
            ``reasoning`` label. The label is a fixed string and is not
            derived from the individual feature values.

        Raises:
            ValueError: If the scaled features contain NaN.
        """
        features = self.engineer_features(single_record_df, training=False)
        scaled_feature = self.scaler.transform(features)

        if np.isnan(scaled_feature).any():
            raise ValueError("NaN detected after preprocessing")

        # Plain floats keep the result dict independent of NumPy scalar types.
        if_score = float(self.iso_forest.decision_function(scaled_feature)[0])
        lof_score = float(self.lof.decision_function(scaled_feature)[0])

        is_anomaly = (if_score < 0) or (lof_score < 0)

        return {
            "is_anomaly": bool(is_anomaly),
            "iso_forest_score": round(if_score, 4),
            "lof_score": round(lof_score, 4),
            "reasoning": "High OT ratio / Role deviation" if is_anomaly else "Normal"
        }
# --- Example Usage ---

# 1. Mock payroll history, one list per column.
#    The final row carries an inflated overtime amount on purpose.
data = dict(
    job_role=['Dev', 'Dev', 'Dev', 'Manager', 'Manager', 'Dev'],
    base_salary=[5000, 5100, 4900, 8000, 8200, 5000],
    overtime_pay=[100, 150, 120, 0, 50, 4000],
    total_pay=[5100, 5250, 5020, 8000, 8250, 9000],
    hours_worked=[160, 165, 162, 160, 160, 170],
)
df_history = pd.DataFrame(data)

# 2. Build the engine with its defaults and train it on the history.
engine = PayrollAnomalyEngine()
engine.fit_batch(df_history)

# 3. Score a suspicious new record: overtime far above the base salary.
new_transaction = pd.DataFrame({
    'job_role': ['Dev'],
    'base_salary': [5000],
    'overtime_pay': [9000],
    'total_pay': [14000],
    'hours_worked': [160],
})

result = engine.predict_realtime(new_transaction)
print(f"Result: {result}")

Batch training complete. Baseline established.
Result: {'is_anomaly': True, 'iso_forest_score': -0.0132, 'lof_score': -1.0732, 'reasoning': 'High OT ratio / Role deviation'}
