In [None]:
!pip install pyarrow
!pip install faker

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker
# Module-level Faker instance with default settings; the JOB_ENDDATE column
# further down draws its random dates from it via fake.date_between(...).
# No seed is set on it in this script.
fake = Faker()

# Size of the synthetic dataset and the observation window it covers.
num_records = 300000
start_date = pd.to_datetime('1994-10-01')
end_date = pd.to_datetime('2024-09-30')

# Sequential employee IDs rendered as zero-padded 8-digit strings
# ('00000001', '00000002', ...); sequential numbering makes them unique.
employee_ids = [format(seq, '08d') for seq in range(1, num_records + 1)]

# Every first-of-month inside the window is a candidate start date; each
# employee is assigned one of them at random (sampling with replacement).
job_start_dates = np.random.choice(
    pd.date_range(start=start_date, end=end_date, freq='MS').tolist(),
    size=num_records,
)

# One uniformly distributed distance per employee, drawn from [1, 50).
distances = np.random.uniform(low=1, high=50, size=num_records)

# ---------------------------------------------------------------------------
# Lookup tables the random columns are sampled from.
# ---------------------------------------------------------------------------

# Demographic categories.
ethnicities = [
    'Asian',
    'Black',
    'Hispanic',
    'White',
    'Other',
]
marital_statuses = [
    'Single',
    'Married',
    'Divorced',
    'Widowed',
]
genders = [
    'Male',
    'Female',
    'Other',
]

# Job titles.
roles = [
    'Software Engineer',
    'Associate Consultant',
    'Product Engineer',
    'Senior Software Engineer',
    'Consultant',
    'Senior Product Engineer',
    'Specialist',
    'Senior Specialist',
    'Manager',
    'Senior Manager',
    'Senior Consultant',
    'Principal Consultant',
    'Associate Principal',
    'Director',
    'Senior Director',
    'Vice President',
    'Senior Vice President',
    'Senior Principal',
    'Managing Principal',
    'Executive Vice President',
    'Managing Director',
    'Associate Director',
    'Principal',
]

# Office locations as (continent, country, state, city, latitude, longitude).
geography = [
    ('North America', 'United States', 'Washington', 'Bellevue', 47.6101, -122.2015),
    ('North America', 'United States', 'Washington', 'Redmond', 47.6730, -122.1215),
    # Add other locations similarly...
]

# Organisational units as (line of business, delivery unit, practice unit).
line_of_business = [
    ('Banking and Financial Services', 'Digital and Analytics', 'Digital Transformation'),
    ('Banking and Financial Services', 'Cybersecurity', 'Security Solutions'),
    # Add other lines of business similarly...
]

# Contract types.
employment_types = [
    'Full-Time',
    'Internship',
]

# Reasons recorded for an employee leaving.
turnover_reasons = [
    'Better Opportunities',
    'Work Life Balance',
    'Lack of Growth',
    'Compensation',
    'Job Satisfaction',
    'Management Issues',
    'Higher-Study',
    'Career Advancement',
    'Company Culture',
    'Health Reasons',
    'Lack of Recognition',
    'Personal Reasons',
]

# Working shifts.
shifts = [
    'General Shift',
    'First Shift',
    'Second Shift',
    'Night Shift',
]

# Annual salary bands in INR, listed from lowest to highest.
salary_ranges = [
    '₹3,00,000 - ₹5,00,000 per annum',
    '₹5,00,000 - ₹8,00,000 per annum',
    '₹8,00,000 - ₹12,00,000 per annum',
    '₹12,00,000 - ₹18,00,000 per annum',
    '₹18,00,000 - ₹25,00,000 per annum',
    '₹25,00,000 - ₹40,00,000+ per annum',
]

# Generate data
#
# Columns that describe the same underlying fact are derived from a single
# random draw per employee so that every row is internally consistent.

# One location per employee: continent, country, state, city and coordinates
# all come from the same `geography` tuple.  Sampling each column separately
# would pair, for example, one city with another city's latitude/longitude.
employee_locations = [random.choice(geography) for _ in range(num_records)]

# One (line of business, delivery unit, practice unit) triple per employee,
# for the same reason.
employee_business_units = [random.choice(line_of_business) for _ in range(num_records)]

# Salaries are drawn first and the band label is looked up from the amount,
# so SALARY_RANGE always matches SALARY(INR).  The edges are the lower bounds
# of the 2nd..6th entries of `salary_ranges` (which must stay in ascending
# order with six bands); an amount exactly on an edge goes to the higher band.
salaries = np.random.randint(300000, 4000000, num_records)
salary_band_edges = [500000, 800000, 1200000, 1800000, 2500000]
salary_bands = np.asarray(salary_ranges)[np.digitize(salaries, salary_band_edges)]

# Roughly half of the employees (random.random() > 0.5) get an end date.  It
# is drawn between the employee's own start date and the end of the
# observation window, so it can never precede JOB_STARTDATE.  Employees
# without an end date keep the empty string.
window_end = pd.Timestamp(end_date).date()
job_end_dates = [
    fake.date_between(start_date=pd.Timestamp(job_start).date(), end_date=window_end)
    if random.random() > 0.5 else ''
    for job_start in job_start_dates
]

data = {
    'EMPLOYEE_ID': employee_ids,
    'JOB_STARTDATE': job_start_dates,
    'DISTANCE': distances,
    'ETHNICITY': np.random.choice(ethnicities, num_records),
    'MARITAL_STATUS': np.random.choice(marital_statuses, num_records),
    'GENDER': np.random.choice(genders, num_records),
    'ROLE': np.random.choice(roles, num_records),
    'CONTINENTS': [location[0] for location in employee_locations],
    'COUNTRY': [location[1] for location in employee_locations],
    'STATE': [location[2] for location in employee_locations],
    'CITY': [location[3] for location in employee_locations],
    'LATITUDE': [location[4] for location in employee_locations],
    'LONGITUDE': [location[5] for location in employee_locations],
    'LINE_OF_BUSINESS': [unit[0] for unit in employee_business_units],
    'DELIVERY_UNIT': [unit[1] for unit in employee_business_units],
    'PRACTICE_UNIT': [unit[2] for unit in employee_business_units],
    'EMPLOYEMENT_TYPE': np.random.choice(employment_types, num_records),
    # NOTE(review): a turnover reason is still assigned to every row, including
    # employees without a JOB_ENDDATE - confirm whether that is intended.
    'TURNOVER_REASONS': np.random.choice(turnover_reasons, num_records),
    'SHIFT': np.random.choice(shifts, num_records),
    'SALARY(INR)': salaries,
    'SALARY_RANGE': salary_bands,
    'JOB_ENDDATE': job_end_dates,
    'BIRTH_YEAR': np.random.randint(1960, 2000, num_records)
}

# Calculate additional fields
df = pd.DataFrame(data)

# CHURN is 1 when JOB_ENDDATE holds a value and 0 when it is the empty string
# (plain truthiness of the cell, as before).
df['CHURN'] = df['JOB_ENDDATE'].map(bool).astype('int64')

# Tenure in whole 30-day periods between JOB_STARTDATE and the snapshot date.
# Computed with vectorized datetime arithmetic on the whole column instead of
# a row-by-row apply; the resulting values are the same.
snapshot_date = pd.to_datetime('2024-09-30')
df['TENURE_MONTHS'] = (snapshot_date - pd.to_datetime(df['JOB_STARTDATE'])).dt.days // 30

# Age as of the snapshot year (year difference only; no birth month/day exists).
df['AGE'] = 2024 - df['BIRTH_YEAR']

# Save to CSV (without the DataFrame index column)
df.to_csv('augmented_data.csv', index=False)
