In [3]:
import polars as pl
import polars.selectors as cs
import random
from datetime import datetime, timedelta

# Seed Python's global RNG so the random draws below are repeatable on a fresh
# top-to-bottom run.
random.seed(42)

# Story templates, one per role. "Expected_Events" maps an event name to a
# (min_days, max_days) window: generate_employee draws a whole-day offset from
# that inclusive window and adds it to the employee's hire date, so negative
# bounds place the event before hiring.
# NOTE: generate_employee iterates each "Expected_Events" dict in insertion
# order and makes one RNG draw per entry, so reordering or adding entries
# changes the generated data even with the same seed.
story_templates = [
    {
        "Story_ID": "S001",
        "Role": "Remote Sales Associate",
        "Expected_Events": {
            "Initial Salary Agreement": (-30, -20),
            "2FA Setup": (2, 4),
            "Employee Handbook Acknowledged": (1, 2),
            "Security Training Completed": (7, 10),
            "Manager Meeting Scheduled": (10, 20),
            "Client Contract Signed": (5, 15),
            "Client Product Activation": (10, 20),
            "Help Desk Ticket Submitted": (15, 30),
            "Expense Report Filed": (30, 60),
            "Leave Request Submitted": (50, 80),
            "Remote Access VPN Log": (2, 90)
        }
    },
    {
        "Story_ID": "S002",
        "Role": "Remote Tech Support",
        "Expected_Events": {
            "Initial Salary Agreement": (-30, -20),
            "2FA Setup": (1, 3),
            "Employee Handbook Acknowledged": (1, 2),
            "Security Training Completed": (5, 7),
            "Help Desk Ticket Submitted": (5, 30),
            "Remote Access VPN Log": (1, 30),
            "Manager Meeting Scheduled": (7, 14)
        }
    },
    {
        "Story_ID": "S003",
        "Role": "Remote HR Assistant",
        "Expected_Events": {
            "Initial Salary Agreement": (-30, -20),
            "2FA Setup": (3, 5),
            "Employee Handbook Acknowledged": (1, 3),
            "Security Training Completed": (10, 20),
            "Manager Meeting Scheduled": (5, 10),
            "Leave Request Submitted": (60, 90)
        }
    },
    {
        # The only template with a "CFO Approval" event and with explicit
        # commission windows; it defines no onboarding events (2FA, handbook,
        # security training).
        "Story_ID": "S004",
        "Role": "Remote Contractor",
        "Expected_Events": {
            "Initial Salary Agreement": (-30, -20),
            "Client Contract Signed": (0, 5),
            "Client Product Activation": (5, 10),
            "First Large Commission": (10, 20),
            "First Regular Commission": (15, 30),
            "CFO Approval": (-20, -10),
        }
    }
]

def generate_employee(i, story, base_date):
    """Build one synthetic employee record from a story template.

    Args:
        i: Sequence number, rendered as the zero-padded ID ``E{i:03d}``.
        story: Template dict with ``"Story_ID"``, ``"Role"`` and
            ``"Expected_Events"``. The latter maps an event name to a
            ``(min_days, max_days)`` window; one whole-day offset is drawn
            from that inclusive window and added to the hire date (negative
            offsets fall before hiring).
        base_date: Earliest possible hire date. The actual hire date is
            ``base_date`` plus a random 0-30 days (inclusive).

    Returns:
        A dict mapping column name to value. It always contains the identity
        fields, ``"Employee Hire Date"``, every template event, the finance
        chain (``"Finance Confirmation Document"``, ``"First Large
        Commission"``, ``"First Regular Commission"``, ``"Payroll Adjustment
        Form"``) and ``"Employee Termination Date"`` set to ``None``.

    Notes:
        Uses the module-level ``random`` generator: one draw for the hire
        date, then one draw per template event in the template's dict order.
    """
    hire_date = base_date + timedelta(days=random.randint(0, 30))
    row = {
        "Employee_ID": f"E{i:03d}",
        "Story_ID": story["Story_ID"],
        "Is_Anomaly": False,
        "Role": story["Role"],
        "Employee Type": "Remote",
        "Employee Hire Date": hire_date,
    }
    for event, (min_d, max_d) in story["Expected_Events"].items():
        row[event] = hire_date + timedelta(days=random.randint(min_d, max_d))

    # Finance chain: (event, anchor event, days after anchor). Each step is a
    # fallback used only when the template did not schedule the event itself,
    # so an explicit template window (e.g. a template's own "First Large
    # Commission" range) is kept instead of being overwritten. Later steps
    # anchor on whatever value the earlier step ended up with.
    derived_chain = (
        ("Finance Confirmation Document", "Client Product Activation", 2),
        ("First Large Commission", "Finance Confirmation Document", 10),
        ("First Regular Commission", "First Large Commission", 10),
        ("Payroll Adjustment Form", "First Regular Commission", 5),
    )
    for event, anchor, gap_days in derived_chain:
        if event not in row:
            # Templates without a product activation anchor on the hire date.
            row[event] = row.get(anchor, hire_date) + timedelta(days=gap_days)

    # Generated employees start out active; callers may set this afterwards.
    row["Employee Termination Date"] = None

    return row

# Hand-written anomalous record (the "ghost" employee); it is appended after
# the generated employees when the frame is built. All dates are fixed rather
# than drawn from the RNG. "Email Sent to HR" and "Changed Direct Deposit Info"
# appear in no story template, so only this record carries values for them.
ghost_employee = {
    "Employee_ID": "E999",
    "Story_ID": "GHOST",  # later cells identify the ghost by this Story_ID
    "Is_Anomaly": True,
    "Role": "Remote Contractor",
    "Employee Type": "Remote",
    "Employee Hire Date": datetime(2020, 1, 15),
    "Initial Salary Agreement": datetime(2019, 12, 27),  # 19 days before hire; templates use a 20-30 day lead
    "Employee Handbook Acknowledged": datetime(2020, 1, 17),
    "2FA Setup": None,  # no 2FA setup recorded
    "Security Training Completed": datetime(2020, 1, 19),
    "Manager Meeting Scheduled": datetime(2020, 1, 20),
    "Email Sent to HR": datetime(2020, 1, 21),
    "Client Contract Signed": datetime(2020, 1, 22),
    "Client Product Activation": datetime(2020, 1, 23),
    "Finance Confirmation Document": datetime(2020, 1, 25),
    "Help Desk Ticket Submitted": datetime(2020, 1, 24),
    "Expense Report Filed": datetime(2020, 2, 10),
    "Leave Request Submitted": datetime(2020, 2, 5),
    "Remote Access VPN Log": datetime(2020, 1, 18),
    "Changed Direct Deposit Info": datetime(2020, 2, 15),
    "First Large Commission": datetime(2020, 2, 17),  # two days after the direct-deposit change
    "First Regular Commission": datetime(2020, 2, 27),
    "Payroll Adjustment Form": datetime(2020, 3, 5),
    "Employee Termination Date": datetime(2020, 3, 20)  # 15 days after the payroll adjustment
}


In [4]:
# Assemble the synthetic workforce: employees E001-E049, each following a
# randomly chosen story template, hired on or after base_date.
base_date = datetime(2020, 1, 1)
employees = [generate_employee(i, random.choice(story_templates), base_date) for i in range(1, 50)]

# Terminate two generated employees 20 days after their payroll adjustment.
# The index range is taken from the list itself rather than a repeated literal,
# so it cannot drift out of step with the population built above. This runs
# before the ghost is appended, so the ghost can never be picked.
terminated = random.sample(range(len(employees)), 2)
for idx in terminated:
    employees[idx]["Employee Termination Date"] = employees[idx]["Payroll Adjustment Form"] + timedelta(days=20)

# The hand-written anomalous record always goes last.
employees.append(ghost_employee)

df = pl.DataFrame(employees)

# Events that additionally get "<event> Transaction Date" and
# "<event> Effective Date" columns.
effective_events = [
    "Leave Request Submitted",
    "Changed Direct Deposit Info",
    "Payroll Adjustment Form",
    "First Large Commission",
    "First Regular Commission",
    "Employee Termination Date",
    "Finance Confirmation Document"
]

# The derivation below only reads the base event columns and Story_ID, so the
# rows are materialised once instead of re-iterating the frame per event.
base_rows = list(df.iter_rows(named=True))

# RNG draws happen event by event, row by row, transaction offset first and
# effective offset second; keep that order to keep the seeded output stable.
dated_columns = []
for event in effective_events:
    tx_dates, eff_dates = [], []

    for row in base_rows:
        base = row.get(event)

        if base is None:
            # Event did not occur for this employee: no derived dates either.
            tx_date = eff_date = None
        elif row["Story_ID"] == "GHOST":
            # Ghost gets instant processing
            tx_date = eff_date = base
        else:
            # Regular employee: slow transaction, normal effective
            tx_date = base + timedelta(days=random.randint(5, 20))
            eff_date = base + timedelta(days=random.randint(-2, 2))

        tx_dates.append(tx_date)
        eff_dates.append(eff_date)

    dated_columns.append(pl.Series(f"{event} Transaction Date", tx_dates))
    dated_columns.append(pl.Series(f"{event} Effective Date", eff_dates))

# Attach every derived column in one pass.
df = df.with_columns(dated_columns)



In [5]:
# W-4 dates per employee. Both dates hang off the hire date: the earlier one is
# hire + 2 days and the later one follows 30 days after that. The ghost gets
# issuance first and begin second; every other employee gets the reverse.
w4_issuance, w4_begin = [], []

for record in df.iter_rows(named=True):
    earlier = record["Employee Hire Date"] + timedelta(days=2)
    later = earlier + timedelta(days=30)

    if record["Story_ID"] == "GHOST":
        issued_on, begins_on = earlier, later
    else:
        issued_on, begins_on = later, earlier

    w4_issuance.append(issued_on)
    w4_begin.append(begins_on)

df = df.with_columns([
    pl.Series("W-4 Issuance Date", w4_issuance),
    pl.Series("W-4 Begin Date", w4_begin),
])

In [6]:
def _event_with_dates(event):
    """Return an event column followed by its transaction- and effective-date columns."""
    return [event, f"{event} Transaction Date", f"{event} Effective Date"]


# Column layout applied to the frame before export, grouped by theme.
preferred_order = [
    # Identity and classification
    "Employee_ID",
    "Story_ID",
    "Is_Anomaly",
    "Role",
    "Employee Type",
    # Pre-hire paperwork and hire date
    "Initial Salary Agreement",
    "CFO Approval",
    "Employee Hire Date",
    # Onboarding and access
    "Employee Handbook Acknowledged",
    "2FA Setup",
    "Security Training Completed",
    "Manager Meeting Scheduled",
    "Email Sent to HR",
    "Remote Access VPN Log",
    # Client activity and finance confirmation
    "Client Contract Signed",
    "Client Product Activation",
    *_event_with_dates("Finance Confirmation Document"),
    # Support, expenses and leave
    "Help Desk Ticket Submitted",
    "Expense Report Filed",
    *_event_with_dates("Leave Request Submitted"),
    # Payroll-related events
    *_event_with_dates("Changed Direct Deposit Info"),
    *_event_with_dates("First Large Commission"),
    *_event_with_dates("First Regular Commission"),
    *_event_with_dates("Payroll Adjustment Form"),
    # W-4
    "W-4 Begin Date",
    "W-4 Issuance Date",
    # Termination
    *_event_with_dates("Employee Termination Date"),
]

In [7]:
# Put the columns into their presentation order; select() also fails loudly
# if any listed column is missing from the frame.
df = df.select(preferred_order)

# Narrow every temporal column to a plain calendar date in a single pass.
# strict=False is kept from the original so a value that cannot be converted
# does not abort the whole cast.
temp_cols = df.select(cs.temporal()).columns
df = df.with_columns(
    [pl.col(name).cast(pl.Date, strict=False) for name in temp_cols]
)


In [8]:
# Preview the first rows of the frame (rich display as the cell's last expression).
df.head()

Employee_ID,Story_ID,Is_Anomaly,Role,Employee Type,Initial Salary Agreement,CFO Approval,Employee Hire Date,Employee Handbook Acknowledged,2FA Setup,Security Training Completed,Manager Meeting Scheduled,Email Sent to HR,Remote Access VPN Log,Client Contract Signed,Client Product Activation,Finance Confirmation Document,Finance Confirmation Document Transaction Date,Finance Confirmation Document Effective Date,Help Desk Ticket Submitted,Expense Report Filed,Leave Request Submitted,Leave Request Submitted Transaction Date,Leave Request Submitted Effective Date,Changed Direct Deposit Info,Changed Direct Deposit Info Transaction Date,Changed Direct Deposit Info Effective Date,First Large Commission,First Large Commission Transaction Date,First Large Commission Effective Date,First Regular Commission,First Regular Commission Transaction Date,First Regular Commission Effective Date,Payroll Adjustment Form,Payroll Adjustment Form Transaction Date,Payroll Adjustment Form Effective Date,W-4 Begin Date,W-4 Issuance Date,Employee Termination Date,Employee Termination Date Transaction Date,Employee Termination Date Effective Date
str,str,bool,str,str,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date
"""E001""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-06,,2020-01-01,2020-01-02,2020-01-03,2020-01-09,2020-01-12,,2020-01-07,2020-01-16,2020-01-19,2020-01-21,2020-01-27,2020-01-20,2020-01-18,2020-02-18,2020-03-04,2020-03-11,2020-03-04,,,,2020-01-31,2020-02-18,2020-02-02,2020-02-10,2020-03-01,2020-02-11,2020-02-15,2020-02-29,2020-02-14,2020-01-03,2020-02-02,,,
"""E002""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-07,,2020-01-03,2020-01-04,2020-01-05,2020-01-11,2020-01-23,,2020-02-09,2020-01-16,2020-01-19,2020-01-21,2020-01-30,2020-01-23,2020-01-25,2020-02-16,2020-03-11,2020-03-27,2020-03-13,,,,2020-01-31,2020-02-12,2020-02-01,2020-02-10,2020-02-28,2020-02-12,2020-02-15,2020-03-04,2020-02-17,2020-01-05,2020-02-04,,,
"""E003""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-28,,2020-01-25,2020-01-27,2020-01-29,2020-02-03,2020-02-08,,2020-03-15,2020-02-01,2020-02-07,2020-02-09,2020-02-23,2020-02-07,2020-02-19,2020-02-27,2020-03-17,2020-04-03,2020-03-19,,,,2020-02-19,2020-03-02,2020-02-19,2020-02-29,2020-03-13,2020-02-29,2020-03-05,2020-03-20,2020-03-06,2020-01-27,2020-02-26,,,
"""E004""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-18,,2020-01-12,2020-01-14,2020-01-16,2020-01-19,2020-01-29,,2020-02-20,2020-01-25,2020-01-23,2020-01-25,2020-02-06,2020-01-23,2020-02-08,2020-02-13,2020-03-19,2020-04-03,2020-03-17,,,,2020-02-04,2020-02-22,2020-02-05,2020-02-14,2020-02-26,2020-02-12,2020-02-19,2020-03-09,2020-02-20,2020-01-14,2020-02-13,,,
"""E005""","""S003""",False,"""Remote HR Assistant""","""Remote""",2019-12-23,,2020-01-19,2020-01-20,2020-01-24,2020-01-29,2020-01-29,,,,,2020-01-21,2020-02-08,2020-01-23,,,2020-03-26,2020-04-03,2020-03-26,,,,2020-01-31,2020-02-05,2020-02-01,2020-02-10,2020-02-23,2020-02-11,2020-02-15,2020-02-26,2020-02-17,2020-01-21,2020-02-20,,,


In [9]:
# Persist the dataset as CSV; the path is relative to the kernel's working
# directory and an existing file of that name is overwritten.
df.write_csv("enhanced_hr_audit_dataset.csv")
# Show the first rows again as a quick visual check of what was written.
df.head()

Employee_ID,Story_ID,Is_Anomaly,Role,Employee Type,Initial Salary Agreement,CFO Approval,Employee Hire Date,Employee Handbook Acknowledged,2FA Setup,Security Training Completed,Manager Meeting Scheduled,Email Sent to HR,Remote Access VPN Log,Client Contract Signed,Client Product Activation,Finance Confirmation Document,Finance Confirmation Document Transaction Date,Finance Confirmation Document Effective Date,Help Desk Ticket Submitted,Expense Report Filed,Leave Request Submitted,Leave Request Submitted Transaction Date,Leave Request Submitted Effective Date,Changed Direct Deposit Info,Changed Direct Deposit Info Transaction Date,Changed Direct Deposit Info Effective Date,First Large Commission,First Large Commission Transaction Date,First Large Commission Effective Date,First Regular Commission,First Regular Commission Transaction Date,First Regular Commission Effective Date,Payroll Adjustment Form,Payroll Adjustment Form Transaction Date,Payroll Adjustment Form Effective Date,W-4 Begin Date,W-4 Issuance Date,Employee Termination Date,Employee Termination Date Transaction Date,Employee Termination Date Effective Date
str,str,bool,str,str,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date,date
"""E001""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-06,,2020-01-01,2020-01-02,2020-01-03,2020-01-09,2020-01-12,,2020-01-07,2020-01-16,2020-01-19,2020-01-21,2020-01-27,2020-01-20,2020-01-18,2020-02-18,2020-03-04,2020-03-11,2020-03-04,,,,2020-01-31,2020-02-18,2020-02-02,2020-02-10,2020-03-01,2020-02-11,2020-02-15,2020-02-29,2020-02-14,2020-01-03,2020-02-02,,,
"""E002""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-07,,2020-01-03,2020-01-04,2020-01-05,2020-01-11,2020-01-23,,2020-02-09,2020-01-16,2020-01-19,2020-01-21,2020-01-30,2020-01-23,2020-01-25,2020-02-16,2020-03-11,2020-03-27,2020-03-13,,,,2020-01-31,2020-02-12,2020-02-01,2020-02-10,2020-02-28,2020-02-12,2020-02-15,2020-03-04,2020-02-17,2020-01-05,2020-02-04,,,
"""E003""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-28,,2020-01-25,2020-01-27,2020-01-29,2020-02-03,2020-02-08,,2020-03-15,2020-02-01,2020-02-07,2020-02-09,2020-02-23,2020-02-07,2020-02-19,2020-02-27,2020-03-17,2020-04-03,2020-03-19,,,,2020-02-19,2020-03-02,2020-02-19,2020-02-29,2020-03-13,2020-02-29,2020-03-05,2020-03-20,2020-03-06,2020-01-27,2020-02-26,,,
"""E004""","""S001""",False,"""Remote Sales Associate""","""Remote""",2019-12-18,,2020-01-12,2020-01-14,2020-01-16,2020-01-19,2020-01-29,,2020-02-20,2020-01-25,2020-01-23,2020-01-25,2020-02-06,2020-01-23,2020-02-08,2020-02-13,2020-03-19,2020-04-03,2020-03-17,,,,2020-02-04,2020-02-22,2020-02-05,2020-02-14,2020-02-26,2020-02-12,2020-02-19,2020-03-09,2020-02-20,2020-01-14,2020-02-13,,,
"""E005""","""S003""",False,"""Remote HR Assistant""","""Remote""",2019-12-23,,2020-01-19,2020-01-20,2020-01-24,2020-01-29,2020-01-29,,,,,2020-01-21,2020-02-08,2020-01-23,,,2020-03-26,2020-04-03,2020-03-26,,,,2020-01-31,2020-02-05,2020-02-01,2020-02-10,2020-02-23,2020-02-11,2020-02-15,2020-02-26,2020-02-17,2020-01-21,2020-02-20,,,
