In [3]:
import random
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Define constants
NUM_RECORDS = 300000
ROLES = [
    "Software Engineer", "Associate Consultant", "Product Engineer", "Senior Software Engineer", 
    "Consultant", "Senior Product Engineer", "Specialist", "Senior Specialist", "Manager", 
    "Senior Manager", "Senior Consultant", "Principal Consultant", "Associate Principal", 
    "Director", "Senior Director", "Vice President", "Senior Vice President", "Senior Principal", 
    "Managing Principal", "Executive Vice President", "Managing Director", "Associate Director", "Principal"
]
LINE_OF_BUSINESS = {
    "Banking and Financial Services": {
        "Digital and Analytics": ["Digital Transformation", "Data Analytics", "Artificial Intelligence"],
        "Cybersecurity": ["Security Solutions", "Risk Management", "Compliance Services"],
        "Enterprise Application Services": ["ERP Solutions", "CRM Solutions", "Enterprise Integration"]
    },
    "Insurance": {
        "Digital and Analytics": ["Digital Transformation", "Data Analytics", "Artificial Intelligence"],
        "Assurance Services": ["Quality Assurance", "Testing Services", "Reliability Engineering"],
        "Consulting": ["Strategic Consulting", "Business Process Consulting", "Technology Consulting"]
    },
    "Manufacturing": {
        "Enterprise IT Transformation": ["IT Modernization", "Application Development", "IT Infrastructure"],
        "Cloud and Infrastructure Services": ["Cloud Migration", "Cloud Management", "Infrastructure Services"],
        "Digital and Analytics": ["Digital Transformation", "Data Analytics", "Artificial Intelligence"]
    },
    "Retail and Consumer Goods": {
        "Digital and Analytics": ["Digital Transformation", "Data Analytics", "Artificial Intelligence"],
        "Enterprise Application Services": ["ERP Solutions", "CRM Solutions", "Enterprise Integration"],
        "Consulting": ["Strategic Consulting", "Business Process Consulting", "Technology Consulting"]
    },
    "Healthcare and Life Sciences": {
        "Digital and Analytics": ["Digital Transformation", "Data Analytics", "Artificial Intelligence"],
        "Assurance Services": ["Quality Assurance", "Testing Services", "Reliability Engineering"],
        "Cybersecurity": ["Security Solutions", "Risk Management", "Compliance Services"]
    },
    "Energy": {
        "Cloud and Infrastructure Services": ["Cloud Migration", "Cloud Management", "Infrastructure Services"],
        "Enterprise IT Transformation": ["IT Modernization", "Application Development", "IT Infrastructure"],
        "Consulting": ["Strategic Consulting", "Business Process Consulting", "Technology Consulting"]
    },
    "Travel, Transportation, and Hospitality (TTH)": {
        "Digital and Analytics": ["Digital Transformation", "Data Analytics", "Artificial Intelligence"],
        "Enterprise Application Services": ["ERP Solutions", "CRM Solutions", "Enterprise Integration"],
        "Cloud and Infrastructure Services": ["Cloud Migration", "Cloud Management", "Infrastructure Services"]
    },
    "Media and Entertainment": {
        "Digital and Analytics": ["Digital Transformation", "Data Analytics", "Artificial Intelligence"],
        "Consulting": ["Strategic Consulting", "Business Process Consulting", "Technology Consulting"],
        "Assurance Services": ["Quality Assurance", "Testing Services", "Reliability Engineering"]
    }
}
TURNOVER_REASONS = [
    "Better Opportunities", "Work Life Balance", "Lack of Growth", "Compensation", "Job Satisfaction", 
    "Management Issues", "Higher-Study", "Career Advancement", "Company Culture", "Health Reasons", 
    "Lack of Recognition", "Personal Reasons"
]
SHIFTS = ["General Shift", "First Shift", "Second Shift", "Night Shift"]

# Function to generate random date within the past 30 years until 30th September 2024
def generate_random_date():
    start_date = datetime.now() - timedelta(days=30*365)
    end_date = datetime(2024, 9, 30)
    return start_date + (end_date - start_date) * random.random()

# Function to calculate tenure in months
def calculate_tenure(start_date, end_date):
    if pd.isnull(end_date):
        end_date = datetime(2024, 9, 30)
    return (end_date.year - start_date.year) * 12 + end_date.month - start_date.month

# Function to generate salary range and levels
def get_salary_range_and_level(salary):
    if salary < 300000:
        return ("₹1,50,000 - ₹3,00,000 per annum","Entry Level")
    elif salary < 500000:
        return ("₹3,00,000 - ₹5,00,000 per annum","Junior Level")
    elif salary < 800000:
        return ("₹5,00,000 - ₹8,00,000 per annum","Mid Level")
    elif salary < 1200000:
        return ("₹8,00,000 - ₹12,00,000 per annum","Senior Level")
    elif salary < 1800000:
        return ("₹12,00,000 - ₹18,00,000 per annum","Lead/Managerial Level")
    else:
        return ("₹18,00,000 - ₹25,00,000+ per annum","Executive Level")

# Generate data
data = []
for i in range(NUM_RECORDS):
    employee_id = f"{random.randint(10000000, 99999999)}"
    job_start_date = generate_random_date()
    
    # Ensure job end date is not more than sep 2024
    max_end_date = datetime(2024, 9, 30)
    job_end_date = job_start_date + timedelta(days=random.randint(365, 365*10)) if random.random() > 0.7 else None
    if job_end_date and job_end_date > max_end_date:
        job_end_date = max_end_date
    
    tenure_months = calculate_tenure(job_start_date, job_end_date)
    
    distance_options = ["<2 miles", "4-5 miles", "5-10 miles", ">10 miles"]
    distance = random.choices(distance_options, weights=[1, 2, 3, 4], k=1)
    
    role = random.choice(ROLES)
    
    line_of_business = random.choice(list(LINE_OF_BUSINESS.keys()))
    delivery_unit = random.choice(list(LINE_OF_BUSINESS[line_of_business].keys()))
    practice_unit = random.choice(LINE_OF_BUSINESS[line_of_business][delivery_unit])
    
    employment_type = random.choice(["Full-time", "Part-time", "Contract"])
    
    turnover_reason = random.choices(TURNOVER_REASONS, weights=[15, 10, 10, 15, 10, 5, 5, 15, 5, 5, 5, 10], k=1)
    
    shift = random.choice(SHIFTS)
    
         # Adjust salary distribution
    salary_ranges = [
        (150000, 300000), (300000, 500000), (500000, 800000), 
        (800000, 1200000), (1200000, 1800000), (1800000, 2500000)
    ]
    salary_weights = [5, 4, 3, 2, 1, 1]
    # Get the tuple from the list
    # Get the tuple from the list
    salary_range = random.choices(salary_ranges, weights=salary_weights, k=1)
    # Unpack the tuple into lower and upper bounds
    lower_bound, upper_bound = salary_range

    # Generate a random salary within the range
    salary = round(random.uniform(lower_bound, upper_bound), 2)


    salary_range_str, salary_level = get_salary_range_and_level(salary)
    
    job_satisfaction = random.randint(1, 5)
    
    avg_salary_hike = round(random.uniform(0, 15), 2)
    avg_performance_rating = random.randint(1, 4) if avg_salary_hike > 8 else random.randint(1, 3)
    
    churn = 1 if job_end_date else 0
    
    # Adjust avg_salary_hike for churned employees
    if churn == 1:
        avg_salary_hike = round(random.uniform(0, 5), 2)
    
    data.append([
        employee_id, tenure_months, job_start_date, job_end_date, distance, role, 
        line_of_business, delivery_unit, practice_unit, employment_type, turnover_reason, 
        shift, salary, salary_range_str, salary_level, job_satisfaction, avg_salary_hike, 
        avg_performance_rating, churn
    ])

# Create DataFrame
columns = [
    "EMPLOYEE_ID", "TENURE_MONTHS", "JOB_STARTDATE", "JOB_ENDDATE", "DISTANCE", "ROLE", 
    "LINE_OF_BUSINESS", "DELIVERY_UNIT", "PRACTICE_UNIT", "EMPLOYMENT_TYPE", "TURNOVER_REASONS", 
    "SHIFT", "SALARY(INR)", "SALARY_RANGE", "SALARY_LEVELS", "JOB_SATISFACTION", 
    "AVERAGE_PERCENTAGE_SALARY_HIKE", "AVERAGE_PERFORMANCE_RATING", "CHURN"
]
df = pd.DataFrame(data, columns=columns)

# Adjust churn values to meet the desired distribution
churn_0_indices = df[df['CHURN'] == 0].index
churn_1_indices = df[df['CHURN'] == 1].index

# Ensure we do not sample more than available
churn_0_sample_size = min(len(churn_0_indices), 85030)
churn_1_sample_size = min(len(churn_1_indices), 214970)

# Randomly sample indices to adjust churn distribution
churn_0_sample = np.random.choice(churn_0_indices, size=churn_0_sample_size, replace=False)
churn_1_sample = np.random.choice(churn_1_indices, size=churn_1_sample_size, replace=False)

# Set the rest to churn = 1
df.loc[churn_0_indices.difference(churn_0_sample), 'CHURN'] = 1

# Save to CSV
df.to_csv("Employment_and_Performance_Metrics_augumented1.csv", index=False)

print("Data generation complete. Saved to 'Employment_and_Performance_Metrics_augumented1.csv'.")

ValueError: not enough values to unpack (expected 2, got 1)