## HR Synthetic Data Generator in Python

### This script creates synthetic HR data across 8 related tables

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os

In [None]:
# Seed both global random number generators with a fixed value so repeated
# runs start from the same RNG state. The generator functions visible in this
# file draw only from the standard-library `random` module; NumPy's generator
# is seeded as well.
np.random.seed(123)
random.seed(123)

# Helper functions

In [None]:
def random_date(start_date, end_date):
    """Return a random date in the half-open range [start_date, end_date).

    The result is ``start_date`` shifted forward by a whole number of days,
    so ``end_date`` itself is never returned when the span is one day or more.

    Args:
        start_date: Earliest value that can be returned.
        end_date: Exclusive upper bound; must not be earlier than start_date.

    Returns:
        ``start_date`` plus 0 to ``span_days - 1`` days. When the span is
        shorter than one full day, ``start_date`` is returned unchanged.

    Raises:
        ValueError: If end_date is earlier than start_date.
    """
    span_days = (end_date - start_date).days
    if span_days < 0:
        raise ValueError("end_date must not be earlier than start_date")
    if span_days == 0:
        # random.randrange(0) raises ValueError; a zero-length span has
        # exactly one candidate, so return it directly.
        return start_date
    return start_date + timedelta(days=random.randrange(span_days))

In [None]:
def random_choice(options, size=1):
    """Pick uniformly at random from ``options``.

    With the default ``size=1`` a single element is returned (not a list).
    For any other ``size`` a list of ``size`` independent picks is returned.
    """
    if size != 1:
        picks = []
        for _ in range(size):
            picks.append(random.choice(options))
        return picks
    return random.choice(options)

In [None]:
# Generate age_group table
def generate_age_groups():
    """Build the age-group lookup table: ids 1-5 paired with age-band labels."""
    band_labels = ['18-24', '25-34', '35-44', '45-54', '55+']
    rows = [(band_id, label) for band_id, label in enumerate(band_labels, start=1)]
    return pd.DataFrame(rows, columns=['agegroupid', 'age_category'])

In [None]:
# Generate BU (Business Unit) table
def generate_bus():
    """Build the Business Unit lookup table.

    Each of the eight main departments gets between one and three units.
    Every unit is assigned a randomly chosen region and VP, plus a running
    sequence number (``regionseq``) that starts at 1 and increases by one
    per row.
    """
    region_pool = ["North America", "Europe", "Asia Pacific", "Latin America", "Middle East & Africa"]
    vp_pool = ["John Smith", "Maria Rodriguez", "Aisha Patel", "Carlos Gomez", "Sarah Johnson",
               "Wei Chen", "David Kim", "Rachel Green", "Michael Scott", "Omar Hassan"]
    departments = ["Sales", "Marketing", "Finance", "IT", "HR", "Operations", "R&D", "Customer Support"]

    rows = []
    for department in departments:
        unit_count = random.randint(1, 3)

        for unit_idx in range(unit_count):
            assigned_region = random_choice(region_pool)

            # The first unit keeps the bare department name; later units get
            # a 1-based numeric suffix ("Sales 2", "Sales 3").
            if unit_idx == 0:
                unit_name = department
            else:
                unit_name = f"{department} {unit_idx + 1}"

            assigned_vp = random_choice(vp_pool)

            rows.append({
                'bu': unit_name,
                'regionseq': len(rows) + 1,  # running 1-based row number
                'vp': assigned_vp,
                'region': assigned_region,
            })

    return pd.DataFrame(rows)

In [None]:
# Generate ethnicity table
def generate_ethnicity():
    """Build the ethnicity lookup table: one-letter code paired with its label."""
    code_to_label = {
        'A': 'Asian',
        'B': 'Black',
        'H': 'Hispanic',
        'W': 'White',
        'O': 'Other',
        'T': 'Two or More',
        'N': 'Not Specified',
    }
    return pd.DataFrame(
        list(code_to_label.items()),
        columns=['ethnicgroup', 'ethnicity'],
    )

In [None]:
# Generate fp (Full/Part Time) table
def generate_fp():
    """Build the employment-type lookup table: code paired with description."""
    codes = ['F', 'P', 'C', 'I']
    descriptions = ['Full Time', 'Part Time', 'Contractor', 'Intern']
    paired_rows = list(zip(codes, descriptions))
    return pd.DataFrame(paired_rows, columns=['fp', 'fpdescription'])

In [None]:
def generate_gender():
    """Build the gender lookup table.

    ``id`` and ``sort`` both run from 1 upward in the order the labels are
    listed.
    """
    labels = ['Male', 'Female', 'Non-Binary', 'Not Specified']
    ordinals = range(1, len(labels) + 1)
    table = {'id': ordinals, 'gender': labels, 'sort': ordinals}
    return pd.DataFrame(table)

In [None]:
# Generate paytype table
def generate_paytype():
    """Build the pay-type lookup table: ids 1-4 paired with pay-type names."""
    names = ['Hourly', 'Salary', 'Commission', 'Piece Rate']
    numbered = list(enumerate(names, start=1))
    return pd.DataFrame.from_records(numbered, columns=['paytypeid', 'paytype'])

In [None]:
# Generate separation reason table
def generate_separation_reason():
    """Build the separation-reason lookup table: ids 1-10 paired with reasons."""
    reasons = [
        "Voluntary - Career Opportunity",
        "Voluntary - Relocation",
        "Voluntary - Retirement",
        "Voluntary - Personal",
        "Voluntary - Health",
        "Involuntary - Performance",
        "Involuntary - Conduct",
        "Involuntary - Restructuring",
        "End of Contract",
        "Other",
    ]
    # Ids are assigned 1..len(reasons) in listing order.
    reason_ids = range(1, len(reasons) + 1)
    return pd.DataFrame({
        'Separationtypeid': reason_ids,
        'separationreason': reasons,
    })

In [None]:
# Generate employees table
def generate_employees(bu_data, num_employees=1000):
    """Generate the employee table with keys into the reference tables.

    Args:
        bu_data: DataFrame with a ``'bu'`` column; every employee is assigned
            one of its values at random.
        num_employees: Number of rows to generate. Zero (or a negative
            number) yields an empty frame that still carries every column.

    Returns:
        DataFrame with one row per employee and the columns listed in
        ``columns`` below. Dates are ``'%Y-%m-%d'`` strings; ``termdate`` and
        ``termreason`` are None for employees who were not terminated.
    """
    start_date = datetime(2018, 1, 1)
    end_date = datetime(2023, 12, 31)  # end of hiring window and reporting date
    report_date_str = end_date.strftime('%Y-%m-%d')

    # Passed to the DataFrame constructor so the schema exists even when no
    # rows are generated (an empty list alone would give a column-less frame).
    columns = [
        'date', 'employeeid', 'gender', 'age', 'ethnicgroup', 'fp',
        'termdate', 'isnnewhire', 'bu', 'hiredate', 'paytypeid',
        'termreason', 'agegroupid', 'tenuredays', 'tenuremonths', 'badhire',
    ]

    genders = ['Male', 'Female', 'Non-Binary', 'Not Specified']
    gender_ids = {'Male': 1, 'Female': 2, 'Non-Binary': 3, 'Not Specified': 4}

    ethnic_groups = ['A', 'B', 'H', 'W', 'O', 'T', 'N']
    fp_options = ['F', 'P', 'C', 'I']

    # Loop-invariant: build the list of business-unit names once.
    bu_names = bu_data['bu'].tolist()

    employees = []

    # NOTE: the order of the random draws below is kept fixed so that a given
    # seed keeps producing the same data.
    for i in range(1, num_employees + 1):
        gender = random_choice(genders)
        age = random.randint(18, 65)

        # Map age onto the 1-5 age-group ids (18-24, 25-34, 35-44, 45-54, 55+)
        if age <= 24:
            age_group_id = 1
        elif age <= 34:
            age_group_id = 2
        elif age <= 44:
            age_group_id = 3
        elif age <= 54:
            age_group_id = 4
        else:
            age_group_id = 5

        hire_date = random_date(start_date, end_date)

        # 25% chance of being terminated
        is_terminated = random.random() < 0.25
        term_date = None
        term_reason = None

        if is_terminated:
            term_date = random_date(hire_date, end_date)
            term_reason = random.randint(1, 10)

        # Tenure runs to the termination date, or to the reporting date for
        # employees who are still active.
        tenure_end = term_date if term_date else end_date
        tenure_days = (tenure_end - hire_date).days
        tenure_months = int(tenure_days / 30.44)  # 30.44 = average days per month

        # New hire: hired within the last 90 days of the data period
        is_new_hire = 1 if (end_date - hire_date).days <= 90 else 0

        # Bad hire: terminated within 90 days of being hired
        bad_hire = 1 if (is_terminated and (term_date - hire_date).days <= 90) else 0

        employees.append({
            'date': report_date_str,  # Current reporting date
            'employeeid': i,
            'gender': gender_ids[gender],
            'age': age,
            'ethnicgroup': random_choice(ethnic_groups),
            'fp': random_choice(fp_options),
            'termdate': term_date.strftime('%Y-%m-%d') if term_date else None,
            'isnnewhire': is_new_hire,
            'bu': random_choice(bu_names),
            'hiredate': hire_date.strftime('%Y-%m-%d'),
            'paytypeid': random.randint(1, 4),
            'termreason': term_reason,
            'agegroupid': age_group_id,
            'tenuredays': tenure_days,
            'tenuremonths': tenure_months,
            'badhire': bad_hire
        })

    return pd.DataFrame(employees, columns=columns)

In [None]:
# Main function to generate all data
def generate_hr_data(num_employees=1000, output_dir='.'):
    """Generate all HR tables, save each as a CSV file, and print a summary.

    The whole workflow lives in this single definition so that the summary
    prints and the final ``return`` belong to the function body.

    Args:
        num_employees: Number of employee rows to generate.
        output_dir: Directory that receives the CSV files; it is created if
            it does not exist.

    Returns:
        dict mapping table keys (``'age_groups'``, ``'business_units'``,
        ``'ethnicity_data'``, ``'fp_data'``, ``'gender_data'``,
        ``'paytype_data'``, ``'separation_reason_data'``,
        ``'employees_data'``) to the generated DataFrames.
    """
    # Generate reference tables
    age_groups = generate_age_groups()
    business_units = generate_bus()
    ethnicity_data = generate_ethnicity()
    fp_data = generate_fp()
    gender_data = generate_gender()
    paytype_data = generate_paytype()
    separation_reason_data = generate_separation_reason()

    # Generate employee data with relationships to reference tables
    employees_data = generate_employees(business_units, num_employees)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Save all tables as CSV files
    csv_outputs = [
        ('age_group.csv', age_groups),
        ('BU.csv', business_units),
        ('ethnicity.csv', ethnicity_data),
        ('fp.csv', fp_data),
        ('gender.csv', gender_data),
        ('paytype.csv', paytype_data),
        ('separationreason.csv', separation_reason_data),
        ('employees.csv', employees_data),
    ]
    for filename, table in csv_outputs:
        table.to_csv(os.path.join(output_dir, filename), index=False)

    # Print table summaries
    summaries = [
        ("Age Groups Table:", age_groups),
        ("\nBusiness Units Table (first 5 rows):", business_units.head()),
        ("\nEthnicity Table:", ethnicity_data),
        ("\nFull/Part Time Table:", fp_data),
        ("\nGender Table:", gender_data),
        ("\nPay Type Table:", paytype_data),
        ("\nSeparation Reason Table:", separation_reason_data),
        ("\nEmployees Table (first 5 rows):", employees_data.head()),
    ]
    for heading, table in summaries:
        print(heading)
        print(table)

    # Generate basic statistics for the employees data
    print("\nEmployee Statistics:")
    print(f"Total Employees: {len(employees_data)}")
    print(f"Terminated Employees: {employees_data['termdate'].notna().sum()}")
    print(f"New Hires: {employees_data['isnnewhire'].sum()}")
    print(f"Bad Hires: {employees_data['badhire'].sum()}")
    print(f"Average Tenure (months): {employees_data['tenuremonths'].mean():.2f}")

    # Sample analysis: distribution of employees by age group
    age_distribution = employees_data.groupby('agegroupid').size().reset_index(name='count')
    age_distribution = pd.merge(age_distribution, age_groups, on='agegroupid')

    print("\nAge Group Distribution:")
    print(age_distribution)

    # Sample analysis: termination reasons (skipped when nobody was terminated)
    has_term_reason = employees_data['termreason'].notna()
    if has_term_reason.sum() > 0:
        term_reasons = employees_data[has_term_reason].groupby('termreason').size().reset_index(name='count')
        term_reasons = pd.merge(term_reasons, separation_reason_data, left_on='termreason', right_on='Separationtypeid')

        print("\nTermination Reasons Distribution:")
        print(term_reasons[['separationreason', 'count']])

    return {
        'age_groups': age_groups,
        'business_units': business_units,
        'ethnicity_data': ethnicity_data,
        'fp_data': fp_data,
        'gender_data': gender_data,
        'paytype_data': paytype_data,
        'separation_reason_data': separation_reason_data,
        'employees_data': employees_data
    }

# Execute the data generation if this script is run directly
if __name__ == "__main__":
    # Generate 1000 employee records; the CSV files are written to the
    # 'hr_data' directory (output_dir), not the current directory.
    data = generate_hr_data(num_employees=1000, output_dir='hr_data')
    print("\nData generation complete. Files saved to 'hr_data' directory.")