# ZENVY – AI Powered Payroll
### Payroll Anomaly Detection Engine (Unsupervised)

### 1. Import Required Libraries

In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import warnings
# Suppress every warning category for the rest of the session so the notebook
# output stays clean. NOTE(review): this also hides deprecation and
# convergence warnings from pandas / scikit-learn; consider narrowing it.
warnings.filterwarnings("ignore")

### 2. Load Payroll CSV (Batch Pipeline)

In [11]:
# Load the payroll batch extract into the module-level frame `df`, which the
# later cells read. The path is relative to the current working directory.
df = pd.read_csv("payroll_anomaly_data.csv")
# Preview the first five rows to sanity-check the parsed columns.
df.head(5)


Unnamed: 0,employee_id,base_salary,overtime_hours,overtime_pay,working_days,department_id
0,1001,43973,11,2773,20,2
1,1002,38893,11,3367,22,1
2,1003,45181,5,3913,22,1
3,1004,52184,7,2196,20,4
4,1005,38126,7,4492,22,4


In [13]:
# Print column dtypes, non-null counts and memory usage for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   employee_id     500 non-null    int64
 1   base_salary     500 non-null    int64
 2   overtime_hours  500 non-null    int64
 3   overtime_pay    500 non-null    int64
 4   working_days    500 non-null    int64
 5   department_id   500 non-null    int64
dtypes: int64(6)
memory usage: 23.6 KB


### 3. Data Quality Checks

In [18]:
# Count rows that violate basic sign constraints on the raw payroll columns.
# NOTE(review): the salary check uses `<= 0`, so it also counts zero salaries
# even though the printed label says "Negative"; the two overtime checks are
# strictly `< 0`.
print("Negative salaries:", (df["base_salary"] <= 0).sum())
print("Negative overtime pay:", (df["overtime_pay"] < 0).sum())
print("Negative overtime hours:", (df["overtime_hours"] < 0).sum())

Negative salaries: 0
Negative overtime pay: 1
Negative overtime hours: 0


### 4. Feature Engineering

### (Detect Salary Manipulation & Fake Overtime)

In [22]:
def engineer_features(df):
    """Return a copy of ``df`` extended with three derived ratio columns.

    Added columns, in this order:

    * ``ot_pay_ratio`` - ``overtime_pay / (base_salary + 1)``
    * ``ot_hours_ratio`` - ``overtime_hours / (working_days + 1)``
    * ``salary_vs_dept_avg`` - relative gap between ``base_salary`` and the
      mean ``base_salary`` of the row's ``department_id`` group, divided by
      that mean plus one.

    The ``+ 1`` in each denominator keeps a zero denominator column from
    producing a division by zero. The input frame is not modified.
    """
    salary = df["base_salary"]
    # Per-row department mean, aligned to the original index.
    dept_mean = df.groupby("department_id")["base_salary"].transform("mean")

    return df.assign(
        ot_pay_ratio=df["overtime_pay"] / (salary + 1),
        ot_hours_ratio=df["overtime_hours"] / (df["working_days"] + 1),
        salary_vs_dept_avg=(salary - dept_mean) / (dept_mean + 1),
    )


### 5. Salary Anomaly Detection (Unsupervised)

In [26]:
def detect_salary_anomalies(df, contamination=0.05):
    """Fit an Isolation Forest on the salary features and label every row.

    Parameters
    ----------
    df : DataFrame containing ``base_salary`` and ``salary_vs_dept_avg``.
    contamination : value forwarded to ``IsolationForest(contamination=...)``.

    Returns
    -------
    tuple ``(preds, scores, model, scaler)`` where ``preds`` is the output of
    ``fit_predict``, ``scores`` the output of ``score_samples`` on the same
    scaled matrix, and ``model`` / ``scaler`` are the fitted estimators.
    """
    salary_columns = ["base_salary", "salary_vs_dept_avg"]

    # Missing values are replaced with 0 before standardisation.
    raw_matrix = df[salary_columns].fillna(0)

    scaler = StandardScaler()
    standardized = scaler.fit_transform(raw_matrix)

    # Fixed seed keeps the flagged set reproducible between runs.
    model = IsolationForest(random_state=42, contamination=contamination)
    labels = model.fit_predict(standardized)
    anomaly_scores = model.score_samples(standardized)

    return labels, anomaly_scores, model, scaler


### 6. Overtime Anomaly Detection

In [29]:
def detect_overtime_anomalies(df, contamination=0.05):
    """Fit an Isolation Forest on the overtime features and label every row.

    Parameters
    ----------
    df : DataFrame containing ``overtime_hours``, ``overtime_pay``,
        ``ot_pay_ratio`` and ``ot_hours_ratio``.
    contamination : value forwarded to ``IsolationForest(contamination=...)``.

    Returns
    -------
    tuple ``(preds, scores, model, scaler)`` where ``preds`` is the output of
    ``fit_predict``, ``scores`` the output of ``score_samples`` on the same
    scaled matrix, and ``model`` / ``scaler`` are the fitted estimators.
    """
    overtime_columns = [
        "overtime_hours",
        "overtime_pay",
        "ot_pay_ratio",
        "ot_hours_ratio",
    ]

    # Missing values are replaced with 0 before standardisation.
    filled = df[overtime_columns].fillna(0)

    scaler = StandardScaler()
    scaled_matrix = scaler.fit_transform(filled)

    # Fixed seed keeps the flagged set reproducible between runs.
    forest = IsolationForest(random_state=42, contamination=contamination)
    flags = forest.fit_predict(scaled_matrix)
    sample_scores = forest.score_samples(scaled_matrix)

    return flags, sample_scores, forest, scaler


### 7. Combined Anomaly Detection (Salary + Overtime)

In [32]:
def detect_combined_anomalies(df, contamination=0.05):
    """Fit one Isolation Forest over salary, overtime and department together.

    Numeric columns are median-imputed and standardised; ``department_id`` is
    one-hot encoded with the first category dropped. Both transformations are
    bundled in a single ``ColumnTransformer`` that is fitted here.

    Parameters
    ----------
    df : DataFrame containing the raw payroll columns plus the engineered
        ``ot_pay_ratio``, ``ot_hours_ratio`` and ``salary_vs_dept_avg``.
    contamination : value forwarded to ``IsolationForest(contamination=...)``.

    Returns
    -------
    tuple ``(preds, scores, model, preprocessor)`` where ``preds`` is the
    output of ``fit_predict``, ``scores`` the output of ``score_samples`` on
    the transformed matrix, and ``model`` / ``preprocessor`` are fitted.
    """
    numeric_features = [
        "base_salary",
        "overtime_hours",
        "overtime_pay",
        "working_days",
        "ot_pay_ratio",
        "ot_hours_ratio",
        "salary_vs_dept_avg",
    ]
    categorical_features = ["department_id"]

    # Numeric branch: fill gaps with the column median, then standardise.
    numeric_steps = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    # Categorical branch: one-hot encode the department identifier.
    department_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_steps, numeric_features),
            ("cat", department_encoder, categorical_features),
        ]
    )

    selected = df[numeric_features + categorical_features]
    design_matrix = preprocessor.fit_transform(selected)

    # Fixed seed keeps the flagged set reproducible between runs.
    model = IsolationForest(random_state=42, contamination=contamination)
    labels = model.fit_predict(design_matrix)
    anomaly_scores = model.score_samples(design_matrix)

    return labels, anomaly_scores, model, preprocessor


### 8. Concept Drift Detection

In [35]:
def detect_concept_drift(old_df, new_df, features=None,
                         mean_threshold=0.15, std_threshold=0.25):
    """Report features whose mean or spread moved between two data batches.

    For each feature the relative shift of the mean and of the standard
    deviation is computed against ``old_df``; a feature is reported when
    either shift exceeds its threshold.

    Parameters
    ----------
    old_df : reference DataFrame (the baseline distribution).
    new_df : DataFrame to compare against the baseline.
    features : iterable of column names to monitor. Defaults to
        ``["base_salary", "overtime_hours"]``.
    mean_threshold : relative mean shift above which drift is reported.
    std_threshold : relative standard-deviation shift above which drift is
        reported.

    Returns
    -------
    list of dicts with keys ``feature``, ``mean_shift`` and ``std_shift``,
    one entry per drifting feature, in the order of ``features``. Features
    whose statistics are NaN (for example an empty batch) are never reported
    because NaN comparisons are false.
    """
    if features is None:
        features = ["base_salary", "overtime_hours"]

    drift_report = []

    for f in features:
        old_mean = old_df[f].mean()
        old_std = old_df[f].std()

        # Normalise by the magnitude of the baseline (+1 to avoid a zero
        # denominator). abs() keeps the ratio non-negative for features with
        # a negative mean; without it such drift could never exceed the
        # threshold, and a mean of exactly -1 divided by zero.
        mean_shift = abs(new_df[f].mean() - old_mean) / (abs(old_mean) + 1)
        std_shift = abs(new_df[f].std() - old_std) / (old_std + 1)

        if mean_shift > mean_threshold or std_shift > std_threshold:
            drift_report.append({
                "feature": f,
                "mean_shift": mean_shift,
                "std_shift": std_shift
            })

    return drift_report


### 9. Real-Time Payroll Record Detection

In [38]:
def process_realtime_record(record, salary_model, ot_model,
                            salary_scaler, ot_scaler, ref_df):
    """Score a single payroll record with the fitted salary and overtime models.

    Parameters
    ----------
    record : mapping (dict or Series) with keys ``department_id``,
        ``base_salary``, ``overtime_pay``, ``overtime_hours`` and
        ``working_days``.
    salary_model, ot_model : fitted estimators exposing ``predict``; a
        prediction of ``-1`` is treated as an anomaly.
    salary_scaler, ot_scaler : fitted transformers exposing ``transform``.
        The feature order passed to them here is
        ``[base_salary, salary_vs_dept_avg]`` and
        ``[overtime_hours, overtime_pay, ot_pay_ratio, ot_hours_ratio]``;
        they must have been fitted with the same column order.
    ref_df : reference DataFrame with ``department_id`` and ``base_salary``
        used to look up the department's average salary.

    Returns
    -------
    dict with plain ``bool`` values under ``salary_anomaly`` and
    ``overtime_anomaly``.
    """
    in_department = ref_df["department_id"] == record["department_id"]
    dept_avg = ref_df.loc[in_department, "base_salary"].mean()

    # A department missing from the reference data yields NaN, which would
    # propagate into the model input. Fall back to the overall reference
    # average so the record is still compared against a meaningful baseline.
    if pd.isna(dept_avg):
        dept_avg = ref_df["base_salary"].mean()

    # Same formulas as the batch feature engineering (+1 guards the divisor).
    salary_vs_dept = (record["base_salary"] - dept_avg) / (dept_avg + 1)

    sal_X = salary_scaler.transform([[record["base_salary"], salary_vs_dept]])
    # bool() converts a NumPy boolean into a JSON-serialisable Python bool.
    salary_flag = bool(salary_model.predict(sal_X)[0] == -1)

    ot_ratio = record["overtime_pay"] / (record["base_salary"] + 1)
    ot_hours_ratio = record["overtime_hours"] / (record["working_days"] + 1)

    ot_X = ot_scaler.transform([[
        record["overtime_hours"],
        record["overtime_pay"],
        ot_ratio,
        ot_hours_ratio
    ]])

    ot_flag = bool(ot_model.predict(ot_X)[0] == -1)

    return {
        "salary_anomaly": salary_flag,
        "overtime_anomaly": ot_flag
    }


### 10. Main Execution (Batch + Real-Time)

In [41]:
def main():
    """Run the batch pipeline on the module-level ``df`` and print a summary.

    Engineers the derived features, fits the salary, overtime and combined
    detectors, stores each detector's verdict as a boolean column on the
    processed frame, and prints the number of flagged rows per detector.
    """
    processed_df = engineer_features(df)

    # Only the per-row predictions are needed here; the scores and the fitted
    # estimators returned by each detector are discarded.
    predictions = {
        "salary_anomaly": detect_salary_anomalies(processed_df)[0],
        "overtime_anomaly": detect_overtime_anomalies(processed_df)[0],
        "combined_anomaly": detect_combined_anomalies(processed_df)[0],
    }

    # A prediction of -1 marks the row as an anomaly.
    for column, preds in predictions.items():
        processed_df[column] = preds == -1

    print("Total records:", len(processed_df))
    summary = (
        ("Salary anomalies:", "salary_anomaly"),
        ("Overtime anomalies:", "overtime_anomaly"),
        ("Combined anomalies:", "combined_anomaly"),
    )
    for label, column in summary:
        print(label, processed_df[column].sum())


if __name__ == "__main__":
    main()


Total records: 500
Salary anomalies: 25
Overtime anomalies: 25
Combined anomalies: 25


### 11. Algorithm Selection Rationale

#### Isolation Forest

    - No labels required

    - Efficient for payroll-scale data

    - Naturally isolates extreme behavior

    - Robust to evolving payroll patterns

### 12. Evaluation Strategy (No Labels)

    - Fixed contamination rate (5%)

    - Review lowest anomaly scores

    - Cross-check salary-only vs OT-only vs combined

    - Human review of flagged records

### 13. Deployment Plan

    - Daily batch payroll scans

    - Real-time API for new payroll entries

    - Weekly drift monitoring

    - Monthly retraining or drift-triggered retraining

    - Audit dashboard for flagged employees

# 1️⃣ PSEUDOCODE (Intern-Level, Clear & Structured)
### Payroll Anomaly Detection Engine (Unsupervised)

# 2️⃣ ALGORITHM SELECTION RATIONALE





### Why Isolation Forest?

    - Payroll fraud data is unlabeled

    - Isolation Forest:

        - Works with unsupervised learning

        - Efficient for large payroll datasets

        - Naturally isolates rare, abnormal behavior

        - Robust to changing payroll patterns

    - Suitable for both batch and real-time inference

# 3️⃣ EVALUATION STRATEGY (NO LABELS)

### Since fraud labels are unavailable:

#### Evaluation Techniques Used

    - Fixed contamination rate (e.g., 5%)

    - Rank records by anomaly score

    - Manual audit of top anomalies

    - Compare:
        - Salary-only anomalies

        - Overtime-only anomalies

        - Combined anomalies

    - Monitor anomaly rate stability over time

#### Success Criteria

    - Stable anomaly rate

    - Business-explainable anomalies

    - Reduced false positives after retraining

# 4️⃣ DEPLOYMENT PLAN (Production-Ready)
### Architecture Overview

