In [None]:
!pip install pyarrow
!pip install faker

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker
# One shared Faker instance for the whole script; further down it supplies the
# random JOB_ENDDATE values through fake.date_between().
fake = Faker()

# Size of the synthetic dataset and the calendar window all dates fall into.
num_records = 300_000
start_date = pd.to_datetime('1994-10-01')
end_date = pd.to_datetime('2024-09-30')

# Employee IDs: 1..num_records rendered as zero-padded 8-character strings.
employee_ids = [str(number).zfill(8) for number in range(1, num_records + 1)]

# Job start dates: each record gets a random first-of-month ('MS') date taken
# from the window above.
job_start_dates = np.random.choice(
    pd.date_range(start=start_date, end=end_date, freq='MS').to_list(),
    size=num_records,
)

# Distances: uniform floats in [1, 50).
distances = np.random.uniform(low=1, high=50, size=num_records)

# Value pools for the categorical columns; each record later draws one entry
# from every pool.
ethnicities = [
    'Asian',
    'Black',
    'Hispanic',
    'White',
    'Other',
]
marital_statuses = [
    'Single',
    'Married',
    'Divorced',
    'Widowed',
]
genders = [
    'Male',
    'Female',
    'Other',
]
roles = [
    'Software Engineer',
    'Associate Consultant',
    'Product Engineer',
    'Senior Software Engineer',
    'Consultant',
    'Senior Product Engineer',
    'Specialist',
    'Senior Specialist',
    'Manager',
    'Senior Manager',
    'Senior Consultant',
    'Principal Consultant',
    'Associate Principal',
    'Director',
    'Senior Director',
    'Vice President',
    'Senior Vice President',
    'Senior Principal',
    'Managing Principal',
    'Executive Vice President',
    'Managing Director',
    'Associate Director',
    'Principal',
]

# Geography data
# Office locations grouped by continent; each entry is
# (country, state, city, latitude, longitude).
_offices_by_continent = {
    'North America': [
        ('United States', 'Washington', 'Bellevue', 47.6101, -122.2015),
        ('United States', 'Washington', 'Redmond', 47.6730, -122.1215),
        ('United States', 'Texas', 'Dallas', 32.7767, -96.7970),
        ('United States', 'Texas', 'Houston', 29.7604, -95.3698),
        ('United States', 'New Jersey', 'Warren', 40.6301, -74.5004),
        ('United States', 'Georgia', 'Alpharetta', 34.0754, -84.2941),
        ('United States', 'Florida', 'Tampa', 27.9506, -82.4572),
        ('United States', 'Ohio', 'Cincinnati', 39.1031, -84.5120),
        ('United States', 'Arizona', 'Scottsdale', 33.4942, -111.9261),
        ('United States', 'Connecticut', 'Hartford', 41.7658, -72.6734),
        ('United States', 'Wisconsin', 'Glendale', 43.1286, -87.9243),
        ('Canada', 'Alberta', 'Calgary', 51.0447, -114.0719),
        ('Canada', 'Ontario', 'Mississauga', 43.5890, -79.6441),
        ('Mexico', 'Mexico City', 'Mexico City', 19.4326, -99.1332),
        ('Mexico', 'Jalisco', 'Guadalajara', 20.6597, -103.3496),
        ('Costa Rica', 'San Jose', 'San Jose', 9.9281, -84.0907),
    ],
    'Europe': [
        ('Belgium', 'Brussels', 'Brussels', 50.8503, 4.3517),
        ('Belgium', 'Diegem', 'Diegem', 50.8870, 4.4370),
        ('Cyprus', 'Nicosia', 'Nicosia', 35.1856, 33.3823),
        ('Czech Republic', 'Prague', 'Prague', 50.0755, 14.4378),
        ('Denmark', 'Copenhagen', 'Copenhagen', 55.6761, 12.5683),
        ('Finland', 'Espoo', 'Espoo', 60.2055, 24.6559),
        ('France', 'Puteaux', 'Puteaux', 48.8848, 2.2399),
        ('France', 'Blagnac', 'Blagnac', 43.6328, 1.3936),
        ('Germany', 'Frankfurt', 'Frankfurt', 50.1109, 8.6821),
    ],
    'Asia-Pacific': [
        ('India', 'Karnataka', 'Bangalore', 12.9716, 77.5946),
        ('India', 'Maharashtra', 'Mumbai', 19.0760, 72.8777),
        ('India', 'Maharashtra', 'Pune', 18.5204, 73.8567),
        ('India', 'Tamil Nadu', 'Chennai', 13.0827, 80.2707),
        ('India', 'Telangana', 'Hyderabad', 17.3850, 78.4867),
        ('India', 'Telangana', 'Warangal', 17.9910, 79.5346),
        ('India', 'Gujarat', 'Ahmedabad', 23.0225, 72.5714),
        ('India', 'West Bengal', 'Kolkata', 22.5726, 88.3639),
        ('India', 'Uttar Pradesh', 'Noida', 28.5355, 77.3910),
        ('India', 'Haryana', 'Gurgaon', 28.4595, 77.0266),
        ('Australia', 'New South Wales', 'Sydney', -33.8688, 151.2093),
        ('Australia', 'Victoria', 'Melbourne', -37.8136, 144.9631),
        ('Singapore', 'Singapore', 'Singapore City', 1.3521, 103.8198),
    ],
    'Middle East and Africa': [
        ('United Arab Emirates', 'Dubai', 'Dubai City', 25.2048, 55.2708),
        ('South Africa', 'Johannesburg', 'Johannesburg City', -26.2041, 28.0473),
    ],
}

# Flatten into (continent, country, state, city, latitude, longitude) tuples,
# keeping the order in which the continents and offices are listed above.
geography = [
    (continent,) + office
    for continent, offices in _offices_by_continent.items()
    for office in offices
]

# Line of Business data
# Every delivery unit offers the same three practice units wherever it appears.
_practice_units = {
    'Digital and Analytics': (
        'Digital Transformation', 'Data Analytics', 'Artificial Intelligence',
    ),
    'Cybersecurity': (
        'Security Solutions', 'Risk Management', 'Compliance Services',
    ),
    'Enterprise Application Services': (
        'ERP Solutions', 'CRM Solutions', 'Enterprise Integration',
    ),
    'Assurance Services': (
        'Quality Assurance', 'Testing Services', 'Reliability Engineering',
    ),
    'Consulting': (
        'Strategic Consulting', 'Business Process Consulting', 'Technology Consulting',
    ),
    'Enterprise IT Transformation': (
        'IT Modernization', 'Application Development', 'IT Infrastructure',
    ),
    'Cloud and Infrastructure Services': (
        'Cloud Migration', 'Cloud Management', 'Infrastructure Services',
    ),
}

# Delivery units attached to each line of business, in output order.
_delivery_units = {
    'Banking and Financial Services': (
        'Digital and Analytics', 'Cybersecurity', 'Enterprise Application Services',
    ),
    'Insurance': (
        'Digital and Analytics', 'Assurance Services', 'Consulting',
    ),
    'Manufacturing': (
        'Enterprise IT Transformation', 'Cloud and Infrastructure Services', 'Digital and Analytics',
    ),
    'Retail and Consumer Goods': (
        'Digital and Analytics', 'Enterprise Application Services', 'Consulting',
    ),
    'Healthcare and Life Sciences': (
        'Digital and Analytics', 'Assurance Services', 'Cybersecurity',
    ),
    'Energy': (
        'Cloud and Infrastructure Services', 'Enterprise IT Transformation', 'Consulting',
    ),
    'Travel, Transportation, and Hospitality (TTH)': (
        'Digital and Analytics', 'Enterprise Application Services', 'Cloud and Infrastructure Services',
    ),
    'Media and Entertainment': (
        'Digital and Analytics', 'Consulting', 'Assurance Services',
    ),
}

# Expand the two mappings into (line of business, delivery unit, practice unit)
# tuples, preserving the listing order above.
line_of_business = [
    (business, unit, practice)
    for business, units in _delivery_units.items()
    for unit in units
    for practice in _practice_units[unit]
]

# Employment types
employment_types = [
    'Full-Time',
    'Internship',
]

# Reasons an employee may give for leaving
turnover_reasons = [
    'Better Opportunities',
    'Work Life Balance',
    'Lack of Growth',
    'Compensation',
    'Job Satisfaction',
    'Management Issues',
    'Higher-Study',
    'Career Advancement',
    'Company Culture',
    'Health Reasons',
    'Lack of Recognition',
    'Personal Reasons',
]

# Work shifts
shifts = [
    'General Shift',
    'First Shift',
    'Second Shift',
    'Night Shift',
]

# Annual salary bands (INR), listed from lowest to highest
salary_ranges = [
    '₹3,00,000 - ₹5,00,000 per annum',
    '₹5,00,000 - ₹8,00,000 per annum',
    '₹8,00,000 - ₹12,00,000 per annum',
    '₹12,00,000 - ₹18,00,000 per annum',
    '₹18,00,000 - ₹25,00,000 per annum',
    '₹25,00,000 - ₹40,00,000+ per annum',
]

# Generate data
# One geography tuple and one line-of-business tuple are drawn per employee and
# then split into columns, so CONTINENTS/COUNTRY/STATE/CITY/LATITUDE/LONGITUDE
# (and LINE_OF_BUSINESS/DELIVERY_UNIT/PRACTICE_UNIT) always describe the same
# entry. Sampling every column separately would pair, e.g., a US city with an
# Indian state and unrelated coordinates.
geo_rows = random.choices(geography, k=num_records)
lob_rows = random.choices(line_of_business, k=num_records)

# SALARY_RANGE is derived from the drawn salary rather than sampled on its own,
# so the label always contains the amount. The edges are the lower bounds (in
# rupees) of the 2nd..6th labels in salary_ranges; np.digitize therefore
# returns the index (0..5) of the matching label.
salaries = np.random.randint(300000, 4000000, num_records)
salary_band_edges = [500000, 800000, 1200000, 1800000, 2500000]
salary_bands = np.asarray(salary_ranges)[np.digitize(salaries, salary_band_edges)]

# About half of the employees get an end date ('' means no end date). The date
# is drawn between the employee's own start date and end_date so that nobody
# leaves before joining.
job_end_dates = [
    fake.date_between(start_date=pd.Timestamp(started).date(), end_date=end_date)
    if random.random() > 0.5 else ''
    for started in job_start_dates
]

# NOTE(review): TURNOVER_REASONS is still filled for every row, including rows
# without a JOB_ENDDATE — confirm whether those should be blank.
data = {
    'EMPLOYEE_ID': employee_ids,
    'JOB_STARTDATE': job_start_dates,
    'DISTANCE': distances,
    'ETHNICITY': np.random.choice(ethnicities, num_records),
    'MARITAL_STATUS': np.random.choice(marital_statuses, num_records),
    'GENDER': np.random.choice(genders, num_records),
    'ROLE': np.random.choice(roles, num_records),
    'CONTINENTS': [row[0] for row in geo_rows],
    'COUNTRY': [row[1] for row in geo_rows],
    'STATE': [row[2] for row in geo_rows],
    'CITY': [row[3] for row in geo_rows],
    'LATITUDE': [row[4] for row in geo_rows],
    'LONGITUDE': [row[5] for row in geo_rows],
    'LINE_OF_BUSINESS': [row[0] for row in lob_rows],
    'DELIVERY_UNIT': [row[1] for row in lob_rows],
    'PRACTICE_UNIT': [row[2] for row in lob_rows],
    'EMPLOYEMENT_TYPE': np.random.choice(employment_types, num_records),
    'TURNOVER_REASONS': np.random.choice(turnover_reasons, num_records),
    'SHIFT': np.random.choice(shifts, num_records),
    'SALARY(INR)': salaries,
    'SALARY_RANGE': salary_bands,
    'JOB_ENDDATE': job_end_dates,
    'BIRTH_YEAR': np.random.randint(1960, 2000, num_records),
}

# Calculate additional fields
df = pd.DataFrame(data)

# CHURN is 1 when the row carries a JOB_ENDDATE and 0 when it is blank ('').
df['CHURN'] = df['JOB_ENDDATE'].map(bool).astype('int64')

# Tenure in whole 30-day "months" from JOB_STARTDATE to end_date (2024-09-30),
# computed column-wise instead of with a per-row apply over all records.
# NOTE(review): tenure runs to end_date even for rows that have a JOB_ENDDATE —
# confirm whether it should stop at the end date for those employees.
df['TENURE_MONTHS'] = (end_date - pd.to_datetime(df['JOB_STARTDATE'])).dt.days // 30

# Age as of the final year of the observation window.
df['AGE'] = end_date.year - df['BIRTH_YEAR']

# Save to CSV
df.to_csv('augmented_data1.csv', index=False)
