In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Fix the state of NumPy's global random generator so that every np.random.*
# draw in the cells below is repeatable on a fresh "Restart & Run All".
np.random.seed(seed=42)

In [None]:
num_users = 120
# Synthetic user IDs: 1000, 1001, ..., 1000 + num_users - 1.
users = 1000 + np.arange(num_users)

# Heavy-tailed user activity: one Pareto(shape=2) draw per user, rescaled in
# place so the weights add up to 1 and can be passed as the `p` argument of
# np.random.choice.
user_weights = np.random.pareto(2, num_users)
user_weights /= user_weights.sum()

In [None]:
rows = 15000
num_days = 30  # day offsets 0..29 from base_date (2024-01-01 through 2024-01-30)


def split_counts(total, fractions):
    """Split ``total`` into integer counts, one per entry of ``fractions``.

    Every count except the last is ``int(total * fraction)``; the last one
    takes whatever is left over, so the counts always add up to ``total`` even
    when ``total * fraction`` is not a whole number.

    Args:
        total: Number of items to distribute.
        fractions: Non-empty sequence of shares, expected to sum to 1.

    Returns:
        List of ints with ``len(fractions)`` entries that sums to ``total``.

    Raises:
        ValueError: If ``fractions`` is empty.
    """
    if len(fractions) == 0:
        raise ValueError("fractions must not be empty")
    counts = [int(total * share) for share in fractions[:-1]]
    counts.append(total - sum(counts))
    return counts


# Office hours more likely: 60% of rows centred on 10:00, 25% centred on
# 15:00, and the remaining 15% spread uniformly over the whole day.
# Sizing the last segment as the remainder guarantees len(hours) == rows for
# any value of `rows`, which the timestamp arithmetic below depends on.
n_work, n_afternoon, n_random = split_counts(rows, (0.6, 0.25, 0.15))

# NOTE: the three segments are concatenated in order and not shuffled, so the
# row position is correlated with the hour component it was drawn from.
hours = np.concatenate(
    [
        np.random.normal(10, 2, n_work),  # work hours
        np.random.normal(15, 2, n_afternoon),  # afternoon
        np.random.uniform(0, 24, n_random),  # random
    ]
)

# Keep hours inside 0..23 and truncate them to whole hours.
hours = np.clip(hours, 0, 23).astype(int)

# Generate random days across a 30-day window
days = np.random.randint(0, num_days, size=rows)

base_date = pd.to_datetime("2024-01-01")

# Timestamp = base date + day offset + hour of day + a random minute (0..59).
timestamps = (
    base_date
    + pd.to_timedelta(days, unit="D")
    + pd.to_timedelta(hours, unit="h")
    + pd.to_timedelta(np.random.randint(0, 60, size=rows), unit="m")
)

In [None]:
# Categorical columns, each described as {label: probability}. They are
# sampled independently for every row, in the order listed here.
# NOTE: "role" is drawn per row and is not tied to "user_id", so the same user
# can show up with different roles on different rows.
label_columns = {
    "file_accessed": {
        "HR_Records": 0.2,
        "Finance_Report": 0.25,
        "Engineering_Design": 0.35,
        "Client_Data": 0.2,
    },
    "access_type": {"read": 0.75, "write": 0.2, "delete": 0.05},
    "location": {"Office": 0.65, "Remote": 0.3, "Unknown": 0.05},
    "role": {"Employee": 0.75, "Manager": 0.15, "Admin": 0.1},
}

# user_id is sampled first (weighted by per-user activity), then the
# pre-built timestamps are attached, then each categorical column is drawn.
columns = {
    "user_id": np.random.choice(users, size=rows, p=user_weights),
    "timestamp": timestamps,
}
for column, weighted in label_columns.items():
    columns[column] = np.random.choice(
        list(weighted), rows, p=list(weighted.values())
    )

df = pd.DataFrame(columns)

In [None]:
# Failed logins: a Poisson(0.2) baseline per row, so most rows get zero.
df["failed_logins"] = np.random.poisson(0.2, rows)

# Spikes: 5% of the rows (distinct index labels, chosen per row rather than
# per user) receive an extra 5..14 failed logins on top of their baseline.
risky_indices = np.random.choice(df.index, size=int(rows * 0.05), replace=False)
spikes = np.random.randint(5, 15, size=len(risky_indices))
df.loc[risky_indices, "failed_logins"] = (
    df.loc[risky_indices, "failed_logins"] + spikes
)

df.head()

Unnamed: 0,user_id,timestamp,file_accessed,access_type,location,role,failed_logins
0,1104,2024-01-01 08:06:00,Finance_Report,read,Office,Employee,0
1,1034,2024-01-01 09:05:00,Finance_Report,read,Office,Employee,1
2,1051,2024-01-01 09:05:00,HR_Records,read,Office,Employee,0
3,1103,2024-01-01 07:21:00,Client_Data,read,Remote,Admin,0
4,1039,2024-01-01 10:07:00,Engineering_Design,write,Remote,Employee,0


In [None]:
# Persist the generated log as CSV, without the DataFrame index column.
output_path = Path("../data/access_logs.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise the export fails on a checkout that has no data directory.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("Realistic dataset regenerated.")

Realistic dataset regenerated.
