In [4]:
!pip install Faker



In [5]:
from faker import Faker
fake = Faker()

In [10]:
import random
import datetime
import pandas as pd
from faker import Faker

# Initialize Faker for realistic name generation
fake = Faker()

# -----------------------------
# Global Predefined Dictionaries
# -----------------------------

# States and their cities
states_cities = {
    "California": ["Los Angeles", "San Francisco", "San Diego"],
    "Texas": ["Houston", "Dallas", "Austin"],
    "New York": ["New York City", "Buffalo", "Rochester"],
    "Florida": ["Miami", "Orlando", "Tampa"],
    "Illinois": ["Chicago", "Springfield", "Naperville"]
}

# Hire Date years (2015-2024) with custom weights
hire_years = list(range(2015, 2025))
hire_year_weights = [5, 7, 10, 12, 15, 17, 14, 10, 6, 4]

# Departments and their probabilities (in percentage)
departments = ['Sales', 'Human Resources', 'Engineering', 'Marketing', 'Finance', 'IT Support']
dept_weights = [20, 10, 25, 15, 15, 15]

# Job titles per department with (job title, weight) tuples
job_titles = {
    "Sales": [("Sales Representative", 60), ("Account Manager", 40)],
    "Human Resources": [("HR Generalist", 70), ("Recruiter", 30)],
    "Engineering": [("Software Engineer", 50), ("Senior Software Engineer", 30), ("DevOps Engineer", 20)],
    "Marketing": [("Marketing Coordinator", 50), ("Digital Marketer", 50)],
    "Finance": [("Financial Analyst", 60), ("Accountant", 40)],
    "IT Support": [("IT Support Specialist", 70), ("Systems Administrator", 30)]
}

# Mapping of job title to required education level
education_mapping = {
    "Sales Representative": "Bachelor's",
    "Account Manager": "Bachelor's",
    "HR Generalist": "Bachelor's",
    "Recruiter": "Bachelor's",
    "Software Engineer": "Bachelor's",
    "Senior Software Engineer": "Master's",
    "DevOps Engineer": "Bachelor's",
    "Marketing Coordinator": "Bachelor's",
    "Digital Marketer": "Bachelor's",
    "Financial Analyst": "Bachelor's",
    "Accountant": "Bachelor's",
    "IT Support Specialist": "Associate's",
    "Systems Administrator": "Bachelor's"
}

# Performance Rating options with weights
performance_ratings = ['Excellent', 'Good', 'Satisfactory', 'Needs Improvement']
perf_weights = [20, 50, 25, 5]

# Overtime probability (30% Yes, 70% No)
overtime_options = ['Yes', 'No']
overtime_weights = [30, 70]

# Salary ranges per job title
salary_ranges = {
    "Sales Representative": (40000, 60000),
    "Account Manager": (50000, 80000),
    "HR Generalist": (45000, 70000),
    "Recruiter": (40000, 65000),
    "Software Engineer": (70000, 120000),
    "Senior Software Engineer": (100000, 150000),
    "DevOps Engineer": (80000, 130000),
    "Marketing Coordinator": (40000, 60000),
    "Digital Marketer": (45000, 70000),
    "Financial Analyst": (50000, 85000),
    "Accountant": (45000, 75000),
    "IT Support Specialist": (35000, 55000),
    "Systems Administrator": (50000, 75000)
}

# Minimum age required at hire per job title
min_age_mapping = {
    "Sales Representative": 22,
    "Account Manager": 22,
    "HR Generalist": 22,
    "Recruiter": 22,
    "Software Engineer": 22,
    "Senior Software Engineer": 25,
    "DevOps Engineer": 22,
    "Marketing Coordinator": 22,
    "Digital Marketer": 22,
    "Financial Analyst": 22,
    "Accountant": 22,
    "IT Support Specialist": 22,
    "Systems Administrator": 22
}

# Termination probability (11.2% of employees get a termination date)
termination_probability = 0.112

# -----------------------------
# Helper Functions
# -----------------------------

def generate_employee_id(index):
    """Generate a unique Employee ID given an index."""
    return f"EMP{index:05d}"  # e.g., EMP00001

def generate_name():
    """Generate a random first and last name using Faker."""
    return fake.first_name(), fake.last_name()

def generate_gender():
    """Randomly generate gender with 46% probability for Female and 54% for Male."""
    return random.choices(['Female', 'Male'], weights=[46, 54], k=1)[0]

def generate_state_city():
    """Randomly choose a state and then a city from that state."""
    state = random.choice(list(states_cities.keys()))
    city = random.choice(states_cities[state])
    return state, city

def generate_hire_date():
    """Generate a random hire date based on custom probabilities for each year (2015-2024)."""
    year = random.choices(hire_years, weights=hire_year_weights, k=1)[0]
    month = random.randint(1, 12)
    day = random.randint(1, 28)  # Using 28 to sidestep month-length issues
    return datetime.date(year, month, day)

def generate_department():
    """Randomly select a department based on specified probabilities."""
    dept = random.choices(departments, weights=dept_weights, k=1)[0]
    return dept

def generate_job_title(department):
    """Select a job title based on the department and internal probabilities."""
    titles, weights = zip(*job_titles[department])
    return random.choices(titles, weights=weights, k=1)[0]

def get_education_level(job_title):
    """Determine the education level required for a given job title."""
    return education_mapping.get(job_title, "Bachelor's")

def generate_performance_rating():
    """Randomly select a performance rating with custom probabilities."""
    return random.choices(performance_ratings, weights=perf_weights, k=1)[0]

def generate_overtime():
    """Randomly choose whether an employee does overtime."""
    return random.choices(overtime_options, weights=overtime_weights, k=1)[0]

def generate_salary(job_title):
    """Generate salary within a specified range for the given job title."""
    low, high = salary_ranges[job_title]
    return random.randint(low, high)

def generate_birth_date(hire_date, job_title):
    """
    Generate a birth date ensuring the employee's age at hire is between the minimum and 60.
    For simplicity, we generate an age uniformly between the minimum age and 60 years.
    """
    min_age = min_age_mapping.get(job_title, 22)
    age_at_hire = random.randint(min_age, 60)
    birth_year = hire_date.year - age_at_hire
    # Randomize month and day (using 28 as max day for uniformity)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    return datetime.date(birth_year, month, day)

def generate_termination_date(hire_date):
    """
    With a probability of termination_probability, generate a termination date.
    The termination date must be at least 6 months after the hire date and occur before or on Dec 31, 2024.
    """
    if random.random() < termination_probability:
        min_term_date = hire_date + datetime.timedelta(days=183)  # 6 months later
        # Define the maximum termination date as December 31, 2024
        max_term_date = datetime.date(2024, 12, 31)
        if min_term_date >= max_term_date:
            return None  # Cannot set termination if hire_date is too late.
        # Generate a random date between min_term_date and max_term_date
        delta_days = (max_term_date - min_term_date).days
        random_days = random.randint(0, delta_days)
        return min_term_date + datetime.timedelta(days=random_days)
    else:
        return None

def calculate_adjusted_salary(base_salary, gender, education_level, birth_date, hire_date):
    """
    Calculate the adjusted salary based on gender, education level, and age at hire.
    Applying:
      - 2% bonus for females,
      - Additional multiplier: 5% for Master's, 3% for Bachelor's, 1% for Associate's,
      - 1% increase for every year of age over 40 at hire.
    """
    age_at_hire = hire_date.year - birth_date.year
    multiplier = 1.0
    if gender == 'Female':
        multiplier += 0.02  # 2% bonus for female employees
    if education_level == "Master's":
        multiplier += 0.05
    elif education_level == "Bachelor's":
        multiplier += 0.03
    elif education_level == "Associate's":
        multiplier += 0.01
    if age_at_hire > 40:
        multiplier += (age_at_hire - 40) * 0.01  # 1% for each year over 40
    return int(base_salary * multiplier)

# -----------------------------
# Main Data Generation Function
# -----------------------------

def generate_employee_record(emp_index):
    """Generate a single record for an employee."""
    employee = {}
    employee['EmployeeID'] = generate_employee_id(emp_index)
    
    # Generate names
    first_name, last_name = generate_name()
    employee['FirstName'] = first_name
    employee['LastName'] = last_name
    
    # Gender selection
    employee['Gender'] = generate_gender()
    
    # State and City
    state, city = generate_state_city()
    employee['State'] = state
    employee['City'] = city
    
    # Hire Date
    hire_date = generate_hire_date()
    employee['HireDate'] = hire_date
    
    # Department
    department = generate_department()
    employee['Department'] = department
    
    # Job Title based on department
    job_title = generate_job_title(department)
    employee['JobTitle'] = job_title
    
    # Education Level based on job title
    education_level = get_education_level(job_title)
    employee['EducationLevel'] = education_level
    
    # Performance Rating
    employee['PerformanceRating'] = generate_performance_rating()
    
    # Overtime
    employee['Overtime'] = generate_overtime()
    
    # Salary based on job title
    base_salary = generate_salary(job_title)
    employee['BaseSalary'] = base_salary
    
    # Birth Date ensuring realistic age at hire
    birth_date = generate_birth_date(hire_date, job_title)
    employee['BirthDate'] = birth_date
    
    # Termination Date (if applicable)
    termination_date = generate_termination_date(hire_date)
    employee['TerminationDate'] = termination_date
    
    # Adjusted Salary based on multipliers:
    employee['AdjustedSalary'] = calculate_adjusted_salary(base_salary, employee['Gender'], education_level, birth_date, hire_date)
    
    return employee

def generate_dataset(num_records=8950):
    """Generate the complete dataset with the specified number of records."""
    records = []
    for i in range(1, num_records + 1):
        record = generate_employee_record(i)
        records.append(record)
    return pd.DataFrame(records)

# -----------------------------
# Generate and Save the Dataset
# -----------------------------

if __name__ == "__main__":
    # Generate the dataset DataFrame
    df = generate_dataset(8950)
    
    # For demonstration, show the first five records
    print(df.head())
    


  EmployeeID FirstName  LastName  Gender     State           City    HireDate  \
0   EMP00001  Jennifer    Conrad    Male  New York      Rochester  2022-07-09   
1   EMP00002  Kimberly  Valencia    Male  New York  New York City  2017-03-26   
2   EMP00003    Isaiah    Weaver  Female   Florida        Orlando  2022-01-12   
3   EMP00004      Emma   Ramirez    Male     Texas         Dallas  2022-12-20   
4   EMP00005    Jeremy   Padilla  Female  New York  New York City  2019-09-15   

    Department                  JobTitle EducationLevel PerformanceRating  \
0      Finance                Accountant     Bachelor's      Satisfactory   
1  Engineering  Senior Software Engineer       Master's              Good   
2      Finance                Accountant     Bachelor's              Good   
3    Marketing     Marketing Coordinator     Bachelor's              Good   
4        Sales      Sales Representative     Bachelor's         Excellent   

  Overtime  BaseSalary   BirthDate TerminationDate

In [12]:
df.to_csv("human_resources_dataset.csv", index=False)

In [14]:
import os
print(os.listdir("."))

['.anaconda', '.conda', '.condarc', '.continuum', '.ipynb_checkpoints', '.ipython', '.jupyter', '.matplotlib', '.ms-ad', 'anaconda3', 'anaconda_projects', 'AppData', 'Application Data', 'Contacts', 'Cookies', 'Documents', 'Downloads', 'Favorites', 'HR Project_New', 'HR_Dashboard_Project_Shaik.ipynb', 'human_resources_dataset.csv', 'Links', 'Local Settings', 'Microsoft', 'Music', 'My Documents', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{f4ae997f-944e-11ef-8b4a-c0bfbe840d9f}.TM.blf', 'NTUSER.DAT{f4ae997f-944e-11ef-8b4a-c0bfbe840d9f}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{f4ae997f-944e-11ef-8b4a-c0bfbe840d9f}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'Pictures', 'PrintHood', 'Recent', 'Saved Games', 'Searches', 'SendTo', 'Start Menu', 'Templates', 'Untitled.ipynb', 'Untitled1.ipynb', 'Videos']
