In [None]:
!pip install pyarrow

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Constants
num_records = 350000
start_date = datetime(1993, 1, 1)
end_date = datetime(2024, 9, 30)
roles = ["Software Engineer", "Associate Consultant", "Product Engineer", "Senior Software Engineer", "Consultant", 
         "Senior Product Engineer", "Specialist", "Senior Specialist", "Manager", "Senior Manager", 
         "Senior Consultant", "Principal Consultant", "Associate Principal", "Director", "Senior Director", 
         "Vice President", "Senior Vice President", "Senior Principal", "Managing Principal", 
         "Executive Vice President", "Managing Director", "Associate Director", "Principal"]
continents = ["North America", "Europe", "Asia-Pacific", "Middle East and Africa"]
countries = {
    "North America": ["United States", "Canada", "Mexico", "Costa Rica"],
    "Europe": ["Belgium", "Cyprus", "Czech Republic", "Denmark", "Finland", "France", "Germany"],
    "Asia-Pacific": ["India", "Australia", "Singapore"],
    "Middle East and Africa": ["United Arab Emirates", "South Africa"]
}
states_cities = {
    # North America
    ("United States", "Washington"): [("Bellevue", 47.6101, -122.2015), ("Redmond", 47.6730, -122.1215)],
    ("United States", "Texas"): [("Dallas", 32.7767, -96.7970), ("Houston", 29.7604, -95.3698)],
    ("United States", "New Jersey"): [("Warren", 40.6301, -74.5004)],
    ("United States", "Georgia"): [("Alpharetta", 34.0754, -84.2941)],
    ("United States", "Florida"): [("Tampa", 27.9506, -82.4572)],
    ("United States", "Ohio"): [("Cincinnati", 39.1031, -84.5120)],
    ("United States", "Arizona"): [("Scottsdale", 33.4942, -111.9261)],
    ("United States", "Connecticut"): [("Hartford", 41.7658, -72.6734)],
    ("United States", "Wisconsin"): [("Glendale", 43.1286, -87.9243)],
    ("Canada", None): [("Calgary", 51.0447, -114.0719), ("Mississauga", 43.5890, -79.6441)],
    ("Mexico", None): [("Mexico City", 19.4326, -99.1332), ("Guadalajara", 20.6597, -103.3496)],
    ("Costa Rica", None): [("San Jose", 9.9281, -84.0907)],
    
    # Europe
    ("Belgium", None): [("Brussels", 50.8503, 4.3517), ("Diegem", 50.8870, 4.4370)],
    ("Cyprus", None): [("Nicosia", 35.1856, 33.3823)],
    ("Czech Republic", None): [("Prague", 50.0755, 14.4378)],
    ("Denmark", None): [("Copenhagen", 55.6761, 12.5683)],
    ("Finland", None): [("Espoo", 60.2055, 24.6559)],
    ("France", None): [("Puteaux", 48.8848, 2.2399), ("Blagnac", 43.6328, 1.3936)],
    ("Germany", None): [("Frankfurt", 50.1109, 8.6821)],

    # Asia-Pacific
    ("India","Karnataka"): [("Bangalore" ,12.9716 ,77.5946 )],
    ("India","Maharashtra"): [("Mumbai" ,19.0760 ,72.8777 ),("Pune" ,18.5204 ,73.8567 )],
    ("India","Tamil Nadu"): [("Chennai" ,13.0827 ,80.2707 )],
    ("India","Telangana"): [("Hyderabad" ,17.3850 ,78.4867 ),("Warangal" ,17.9910 ,79.5346 )],
    ("India","Gujarat"): [("Ahmedabad" ,23.0225 ,72.5714 )],
    ("India","West Bengal"): [("Kolkata" ,22.5726 ,88.3639 )],
    ("India","Uttar Pradesh"): [("Noida" ,28.5355 ,77.3910 )],
    ("India","Haryana"): [("Gurgaon" ,28.4595 ,77.0266 )],
    
    ("Australia","New South Wales"): [("Sydney" ,-33.8688 ,151.2093 )],
    ("Australia","Victoria"): [("Melbourne" ,-37.8136 ,144.9631 )],
    
    # Middle East and Africa
    ("United Arab Emirates","Dubai"): [("Dubai City" ,25.2048 ,55.2708 )],
    
    # South Africa
    ("South Africa","Johannesburg"): [("Johannesburg City" ,-26.2041 ,28.0473 )]
}
line_of_business = {
    'Banking and Financial Services': {
        'Digital and Analytics': ['Digital Transformation', 'Data Analytics', 'Artificial Intelligence'],
        'Cybersecurity': ['Security Solutions', 'Risk Management', 'Compliance Services'],
        'Enterprise Application Services': ['ERP Solutions', 'CRM Solutions', 'Enterprise Integration']
    },
    'Insurance': {
        'Digital and Analytics': ['Digital Transformation', 'Data Analytics', 'Artificial Intelligence'],
        'Assurance Services': ['Quality Assurance', 'Testing Services', 'Reliability Engineering'],
        'Consulting': ['Strategic Consulting', 'Business Process Consulting', 'Technology Consulting']
    },
    'Manufacturing': {
        'Enterprise IT Transformation': ['IT Modernization', 'Application Development', 'IT Infrastructure'],
        'Cloud and Infrastructure Services': ['Cloud Migration', 'Cloud Management', 'Infrastructure Services'],
        'Digital and Analytics': ['Digital Transformation', 'Data Analytics', 'Artificial Intelligence']
    },
    'Retail and Consumer Goods': {
        'Digital and Analytics': ['Digital Transformation', 'Data Analytics', 'Artificial Intelligence'],
        'Enterprise Application Services': ['ERP Solutions', 'CRM Solutions', 'Enterprise Integration'],
        'Consulting': ['Strategic Consulting', 'Business Process Consulting', 'Technology Consulting']
    },
    'Healthcare and Life Sciences': {
        'Digital and Analytics': ['Digital Transformation', 'Data Analytics', 'Artificial Intelligence'],
        'Assurance Services': ['Quality Assurance', 'Testing Services', 'Reliability Engineering'],
        'Cybersecurity': ['Security Solutions', 'Risk Management', 'Compliance Services']
    },
    'Energy': {
        'Cloud and Infrastructure Services': ['Cloud Migration', 'Cloud Management', 'Infrastructure Services'],
        'Enterprise IT Transformation': ['IT Modernization', 'Application Development', 'IT Infrastructure'],
        'Consulting': ['Strategic Consulting', 'Business Process Consulting', 'Technology Consulting']
    },
    'Travel, Transportation, and Hospitality (TTH)': {
        'Digital and Analytics': ['Digital Transformation', 'Data Analytics', 'Artificial Intelligence'],
        'Enterprise Application Services': ['ERP Solutions', 'CRM Solutions', 'Enterprise Integration'],
        'Cloud and Infrastructure Services': ['Cloud Migration', 'Cloud Management', 'Infrastructure Services']
    },
    'Media and Entertainment': {
        'Digital and Analytics': ['Digital Transformation', 'Data Analytics', 'Artificial Intelligence'],
        'Consulting': ['Strategic Consulting', 'Business Process Consulting', 'Technology Consulting'],
        'Assurance Services': ['Quality Assurance', 'Testing Services', 'Reliability Engineering']
    }
}

# Helper functions
def generate_employee_id(n):
    return [str(i).zfill(8) for i in range(1, n+1)]

def generate_dates(start, end, n):
    date_range = pd.date_range(start, end, freq='ME')
    return np.random.choice(date_range, n)

def generate_random_choice(choices, n):
    return np.random.choice(choices, n)

def generate_geography(n):
    geographies = []
    for _ in range(n):
        continent = random.choice(continents)
        country = random.choice(countries[continent])
        state_city = random.choice(states_cities[(country, None)] if (country, None) in states_cities else states_cities[(country, random.choice(list(states_cities.keys()))[1])])
        geographies.append((continent, country, state_city[0], state_city[1], state_city[2]))
    return zip(*geographies)

# Generate data
data = {
    'EMPLOYEE_ID': generate_employee_id(num_records),
    'JOB_STARTDATE': generate_dates(start_date, end_date, num_records),
    'DISTANCE': np.random.uniform(1, 50, num_records),  # Random distance between 1 and 50 miles
    'ETHNICITY': generate_random_choice(['Asian', 'Black', 'Hispanic', 'White', 'Other'], num_records),
    'MARITAL_STATUS': generate_random_choice(['Single', 'Married', 'Divorced', 'Widowed'], num_records),
    'GENDER': generate_random_choice(['Male', 'Female', 'Other'], num_records),
    'ROLE': generate_random_choice(roles, num_records),
    'CONTINENTS': [],
    'COUNTRY': [],
    'STATE': [],
    'CITY': [],
    'LATITUDE': [],
    'LONGITUDE': [],
    'LINE_OF_BUSINESS': [],
    'DELIVERY_UNIT': [],
    'PRACTICE_UNIT': [],
    'EMPLOYEMENT_TYPE': generate_random_choice(['Full-Time', 'Internship'], num_records),
    'TURNOVER_REASONS': generate_random_choice(['Better Opportunities', 'Work Life Balance', 'Lack of Growth', 'Compensation', 'Job Satisfaction', 'Management Issues', 'Higher-Study', 'Career Advancement', 'Company Culture', 'Health Reasons', 'Lack of Recognition', 'Personal Reasons'], num_records),
    'SHIFT': generate_random_choice(['General Shift', 'First Shift', 'Second Shift', 'Night Shift'], num_records),
    'SALARY(INR)': np.random.randint(300000, 4000000, num_records),
    'SALARY_RANGE': [],
    'JOB_ENDDATE': [],
    'CHURN': []
}


In [17]:
def generate_geography(n):
    geographies = []
    for _ in range(n):
        country_list = random.choice(list(countries.values()))
        country = random.choice(country_list)
        if country == "United States":
            state = random.choice(["Washington", "Texas", "New Jersey", "Georgia", "Florida", "Ohio", "Arizona", "Connecticut", "Wisconsin"])
            state_city = random.choice(states_cities[(country, state)])
        elif country == "Australia":
            state = random.choice(["New South Wales", "Victoria"])
            state_city = random.choice(states_cities[(country, state)])
        else:
            state_city = random.choice(states_cities.get((country, None), random.choice(list(states_cities.values()))))
        geographies.append((country, state_city[0], state_city[1], state_city[2]))
    return geographies

# Generate geography data
geography_data = generate_geography(num_records)
countries, states, cities, latitudes, longitudes = zip(*geography_data)
data['COUNTRY'] = countries
data['STATE'] = states
data['CITY'] = cities
data['LATITUDE'] = latitudes
data['LONGITUDE'] = longitudes

# Generate hierarchical business data
for i in range(num_records):
    lob = random.choice(list(line_of_business.keys()))
    du = random.choice(list(line_of_business[lob].keys()))
    pu = random.choice(line_of_business[lob][du])
    data['LINE_OF_BUSINESS'].append(lob)
    data['DELIVERY_UNIT'].append(du)
    data['PRACTICE_UNIT'].append(pu)

# Generate salary range based on salary
salary_ranges = [
    '₹3,00,000 - ₹5,00,000 per annum',
    '₹5,00,000 - ₹8,00,000 per annum',
    '₹8,00,000 - ₹12,00,000 per annum',
    '₹12,00,000 - ₹18,00,000 per annum',
    '₹18,00,000 - ₹25,00,000 per annum',
    '₹25,00,000 - ₹40,00,000+ per annum'
]
for salary in data['SALARY(INR)']:
    if salary < 500000:
        data['SALARY_RANGE'].append(salary_ranges[0])
    elif salary < 800000:
        data['SALARY_RANGE'].append(salary_ranges[1])
    elif salary < 1200000:
        data['SALARY_RANGE'].append(salary_ranges[2])
    elif salary < 1800000:
        data['SALARY_RANGE'].append(salary_ranges[3])
    elif salary < 2500000:
        data['SALARY_RANGE'].append(salary_ranges[4])
    else:
        data['SALARY_RANGE'].append(salary_ranges[5])

# Generate job end date and churn
for start_date in data['JOB_STARTDATE']:
    if random.random() > 0.7:  # 30% chance of churn
        end_date = pd.to_datetime(start_date) + timedelta(days=random.randint(30, 365*5))  # Random end date within 5 years
        data['JOB_ENDDATE'].append(end_date)
        data['CHURN'].append(1)
    else:
        data['JOB_ENDDATE'].append(None)
        data['CHURN'].append(0)

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('augmented_employee_data.csv', index=False)


AttributeError: 'tuple' object has no attribute 'values'