In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global random generator so every draw made through `np.random.*`
# below is repeatable from one run of the notebook to the next.
SEED = 42
np.random.seed(SEED)

In [3]:
# Number of synthetic employees; every per-employee array below has this length.
n_employees = 600

# Consecutive integer IDs: 1001, 1002, ..., 1001 + n_employees - 1.
employee_ids = np.arange(n_employees) + 1001

In [4]:
# Department -> job titles that can occur inside it.
# Insertion order matters: `dept_names` inherits it, and the department
# sampling weights used later are matched to `dept_names` by position.
departments = {
    "Engineering": [
        "Software Engineer",
        "Senior Engineer",
        "Tech Lead",
    ],
    "Sales": [
        "Sales Executive",
        "Sales Manager",
    ],
    "HR": [
        "HR Executive",
        "HR Manager",
    ],
    "Finance": [
        "Analyst",
        "Finance Manager",
    ],
    "Marketing": [
        "Marketing Executive",
        "Marketing Manager",
    ],
}

# Department labels in declaration order.
dept_names = [*departments]

In [11]:
# Sampling weight of each department, matched to `dept_names` by position
# (one entry per department, same order).
dept_weights = [0.30, 0.25, 0.15, 0.15, 0.15]

# One department label per employee, drawn with the weights above.
department_col = np.random.choice(dept_names, size=n_employees, p=dept_weights)

# For every employee, pick one job title from their own department's list
# (np.random.choice without `p` draws each option with equal probability).
job_role_col = [np.random.choice(departments[name]) for name in department_col]


In [12]:
# Total career length in whole years: integers 1..35 (the upper bound 36 is exclusive).
total_experience = np.random.randint(low=1, high=36, size=n_employees)

# Tenure at this company is drawn per employee and bounded by that employee's
# own career length: an integer in 0..experience, both ends included.
tenure_draws = [
    np.random.randint(0, career_years + 1)
    for career_years in total_experience
]
years_at_company = np.array(tenure_draws)

# Age is experience plus a random offset of 21..29, giving ages between 22 and 64.
career_start_age = np.random.randint(low=21, high=30, size=n_employees)
age = career_start_age + total_experience

In [13]:
# Reference year of the snapshot; join years are counted back from it.
current_year = 2025
# Backward-compatible alias: the name was originally misspelled. Keep it defined
# for any cell that still refers to the old spelling.
currunt_year = current_year

# The join year follows directly from tenure, so JoinDate and YearsAtCompany agree
# at year granularity.
# NOTE(review): employees with 0 years of tenure get a date anywhere in
# `current_year`, which may be later than the moment the data is "observed".
join_year = current_year - years_at_company

# Month 1..12 and day 1..28. Capping the day at 28 keeps every
# (year, month, day) combination a valid calendar date, February included.
join_month = np.random.randint(1, 13, size=n_employees)
join_day = np.random.randint(1, 29, size=n_employees)

# Assemble one timestamp per employee from the component arrays. The keys use
# the lowercase unit names documented for pandas.to_datetime.
join_date = pd.to_datetime(
    {
        "year": join_year,
        "month": join_month,
        "day": join_day,
    }
)

In [14]:
# Per-department starting salary. The salary cell adds an experience-based
# increment and random noise on top of these figures.
base_salary = {
    "Engineering": 70_000,
    "Sales": 60_000,
    "HR": 45_000,
    "Finance": 80_000,
    "Marketing": 55_000,
}

In [15]:
def _simulate_salary(dept, exp):
    """Draw one salary for an employee of `dept` with `exp` years of experience.

    The value is the department's base salary plus 4000 per year of experience
    plus Gaussian noise (mean 0, standard deviation 8000), truncated to an
    integer and never lower than 30000.
    """
    noise = np.random.normal(0, 8000)
    return max(30000, int(base_salary[dept] + exp * 4000 + noise))


# One salary per employee, generated in employee order so the sequence of
# random draws is stable.
salary = [
    _simulate_salary(dept, exp)
    for dept, exp in zip(department_col, total_experience)
]

In [16]:
# Rating scale and the probability of each level, matched by position.
# The middle rating (3) is the most likely; the extremes are the rarest.
rating_levels = [1, 2, 3, 4, 5]
rating_weights = [0.05, 0.15, 0.50, 0.20, 0.10]

# One rating per employee.
performance_rating = np.random.choice(rating_levels, size=n_employees, p=rating_weights)


In [17]:
# `salary` is built as a plain Python list. Convert it explicitly so the
# element-wise comparison against the percentile threshold is an ordinary
# array operation instead of depending on how a NumPy scalar compares to a list.
salary_values = np.asarray(salary)
low_pay_cutoff = np.percentile(salary_values, 30)

# Additive risk score: every factor that applies contributes a fixed amount.
attrition_prob = (
    (salary_values < low_pay_cutoff).astype(int) * 0.25 +  # salary below the 30th percentile
    (performance_rating <= 2).astype(int) * 0.30 +         # performance rating of 1 or 2
    (years_at_company <= 2).astype(int) * 0.25             # at most 2 years at the company
)

# The three contributions add up to 0.80 at most; the clip enforces that cap
# (and the 0 floor) explicitly.
attrition_prob = np.clip(attrition_prob, 0, 0.8)

# One uniform draw in [0, 1) per employee; a draw below the employee's
# probability marks them as having left.
attrition = np.where(
    np.random.rand(n_employees) < attrition_prob,
    "Yes",
    "No"
)

In [20]:
# Combine the generated per-employee columns into one table, one row per employee.
df = pd.DataFrame({
    "EmployeeID": employee_ids,
    "Age": age,
    "Department": department_col,
    "JobRole": job_role_col,
    "JoinDate": join_date,
    "YearsAtCompany": years_at_company,
    "TotalExperience": total_experience,
    "Salary": salary,
    "PerformanceRating": performance_rating,
    "Attrition": attrition
})

# Create the target folder if it is missing; otherwise to_csv fails on a fresh
# checkout where ../data/raw/ has not been created yet.
output_path = Path("../data/raw/employee_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write without the positional index: EmployeeID already identifies each row.
df.to_csv(
    output_path,
    index=False
)
