In [7]:
%pip install faker pandas

Note: you may need to restart the kernel to use updated packages.


In [8]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
from faker import Faker

# Fixed seed so that names, e-mails, roles and scores are the same on every
# run. Fields derived from datetime.now() still move with the wall clock.
SEED = 42
random.seed(SEED)   # stdlib generator used by random.choice / randint / shuffle
Faker.seed(SEED)    # Faker is seeded through its class-level seed() method

fake = Faker()

# Dataset sizes
NUM_EMPLOYEES = 200   # total employee records (regular staff plus admins/trainers)
NUM_TRAININGS = 30    # number of training sessions to generate
NUM_SCORES = 5000     # number of (employee, training) score rows to generate

# Generate Employees
NUM_ADMINS_TRAINERS = 20                            # privileged users (Admin or Trainer)
NUM_REGULAR = NUM_EMPLOYEES - NUM_ADMINS_TRAINERS   # everyone else has Role "Employee"

REGIONS = ['Chennai', 'Mumbai', 'Delhi', 'Hyderabad', 'Kolkata', 'Bangalore']
DEPARTMENTS = ["HR", "Sale", "Designer", "Developer", "Marketing", "Analyst"]
DESIGNATIONS = ["Software Engineer", "Senior Software Engineer", "Solutions Enabler",
                "Solutions Consultant", "Principal Architect"]


def make_employee(emp_id, role, now):
    """Build one synthetic employee record.

    Args:
        emp_id: Value stored under "Emp_id".
        role: Value stored under "Role" ("Employee", "Admin" or "Trainer").
        now: Reference datetime the birth and joining dates are counted back from.

    Returns:
        dict with the keys Emp_id, name, email, password, Gender, Role, Region,
        Department, Designation, Date_of_Birth and Date_of_Joining.
    """
    return {
        "Emp_id": emp_id,
        "name": fake.name(),
        "email": fake.unique.email(),  # .unique keeps e-mail addresses distinct
        "password": fake.password(),
        "Gender": random.choice(["Male", "Female"]),
        "Role": role,
        "Region": random.choice(REGIONS),
        "Department": random.choice(DEPARTMENTS),
        "Designation": random.choice(DESIGNATIONS),
        # Born 30-50 years before `now`, joined 1-10 years before `now`.
        "Date_of_Birth": now - timedelta(days=random.randint(365 * 30, 365 * 50)),
        "Date_of_Joining": now - timedelta(days=random.randint(365, 365 * 10)),
    }


# One reference timestamp for the whole batch instead of a fresh now() per field.
generated_at = datetime.now()

employees = []

# Regular employees
for _ in range(NUM_REGULAR):
    employees.append(make_employee(len(employees) + 1, "Employee", generated_at))

# Admins or trainers. The first one is always a Trainer: with purely random
# roles all of them could come out as "Admin", and the later
# random.choice(trainers) would then fail on an empty list.
for position in range(NUM_ADMINS_TRAINERS):
    role = "Trainer" if position == 0 else random.choice(["Admin", "Trainer"])
    employees.append(make_employee(len(employees) + 1, role, generated_at))

# Split the staff list by role in one pass:
#   trainers       -> records whose Role is "Trainer"
#   employees_only -> records whose Role is "Employee"
# Any other role (e.g. "Admin") lands in neither list.
trainers = []
employees_only = []
for emp in employees:
    if emp["Role"] == "Trainer":
        trainers.append(emp)
    elif emp["Role"] == "Employee":
        employees_only.append(emp)

# Generate Trainings
MAX_TRAINING_DAYS = 30  # longest training duration, in days

trainings = []
for training_idx in range(NUM_TRAININGS):
    # Start some time in the past year.
    start_date = fake.date_between(start_date='-1y', end_date='today')
    # End within MAX_TRAINING_DAYS of the start. A '+30d' bound passed to
    # date_between is measured from today, not from start_date, which let a
    # training that started a year ago run for more than a year.
    end_date = start_date + timedelta(days=random.randint(0, MAX_TRAINING_DAYS))
    training = {
        "training_id": training_idx + 1,  # ids are 1-based
        "training_name": fake.catch_phrase(),
        "start_date": start_date,
        "end_date": end_date,
        "Trainer_id": random.choice(trainers)["Emp_id"],  # any trainer may own any training
        "domain": random.choice(["Full Stack", "Data Engineering", "Data Science"]),
    }
    trainings.append(training)

# Every (employee id, training id) combination, employee-major, then shuffled
# so that the slice below picks a random subset without repeating a pair.
unique_pairs = [
    (person["Emp_id"], session["training_id"])
    for person in employees_only
    for session in trainings
]
random.shuffle(unique_pairs)

# Keep at most NUM_SCORES pairs
selected_pairs = unique_pairs[:NUM_SCORES]


def _rating():
    """Return a random integer rating between 0 and 10, both inclusive."""
    return random.randint(0, 10)


# One score row per selected pair; score_id is the 1-based position of the pair.
scores = []
for idx, (E_id, T_id) in enumerate(selected_pairs):
    scores.append({
        "score_id": idx + 1,
        "Training_id": T_id,
        "Emp_id": E_id,
        "score": _rating(),
        "punctuality": _rating(),
        "discipline": _rating(),
        "standards": _rating(),
        "remarks": random.choice(["Very Poor", "Poor", "Average", "Good", "Excellent"]),
    })

# Create DataFrames
employees_df = pd.DataFrame(employees)
trainings_df = pd.DataFrame(trainings)
scores_df = pd.DataFrame(scores)

# Minimum average rating required for promotion
promotion_threshold = 6

# 'is_promoted' is 1 when BOTH conditions hold for a row, otherwise 0:
#   * the mean of the four rating columns is at least promotion_threshold
#   * the remark is "Good" or "Excellent"
# Computed column-wise rather than with a row-by-row apply.
rating_columns = ['score', 'punctuality', 'discipline', 'standards']
meets_threshold = scores_df[rating_columns].mean(axis=1) >= promotion_threshold
has_positive_remark = scores_df['remarks'].isin(["Good", "Excellent"])
scores_df['is_promoted'] = (meets_threshold & has_positive_remark).astype(int)

# Save to CSV files
# The target folder is defined once; change OUTPUT_DIR to write somewhere else.
OUTPUT_DIR = Path(r'C:\Users\VenkataRishitha\Training\Final Project 30-09-2024\Data Engineering\Source')
# to_csv does not create missing folders, so make sure the target exists first.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

employees_df.to_csv(OUTPUT_DIR / 'employees.csv', index=False)
trainings_df.to_csv(OUTPUT_DIR / 'trainings.csv', index=False)
scores_df.to_csv(OUTPUT_DIR / 'scores.csv', index=False)

print("Data generated and saved to CSV files.")


Data generated and saved to CSV files.
