In [6]:
import pandas as pd
import random
import faker

# Initialize Faker: one shared generator instance, used further down for
# synthetic person names (fake.name) and client company names (fake.company).
# NOTE(review): no seed is set in this cell, so generated values differ on
# every run unless seeding happens elsewhere - confirm whether the datasets
# are expected to be reproducible.
fake = faker.Faker()

# Reference vocabularies that the generators below draw random values from.
# Order is significant for seeded runs, so entries are kept in their
# original sequence; one entry per line keeps future diffs small.

# Organisational units an employee can belong to.
departments = [
    "Data Science",
    "Cloud Engineering",
    "DevOps",
    "Full Stack Dev",
    "AI Research",
    "Cybersecurity",
]

# Office / project cities.
locations = [
    "Mumbai",
    "Pune",
    "Delhi",
    "Chennai",
]

# Skill names used in employee experience text and project requirements.
skills_pool = [
    "Python",
    "NLP",
    "TensorFlow",
    "AWS",
    "Azure",
    "Kubernetes",
    "Terraform",
    "CI/CD",
    "Docker",
    "Jenkins",
    "React",
    "Node.js",
    "PostgreSQL",
    "Microservices",
    "Healthcare Analytics",
    ".Net",
    "Java",
    "Machine Learning",
    "Big Data",
    "Data Visualization",
    "Cybersecurity",
    "Blockchain",
    "IoT",
    "Agile",
    "Scrum",
    "SQL",
    "NoSQL",
    "GraphQL",
    "REST APIs",
    "Data Engineering",
    "Data Warehousing",
    "Business Intelligence",
    "Robotic Process Automation",
    "Edge Computing",
    "Quantum Computing",
    "Mobile Development",
    "UI/UX Design",
    "Network Security",
    "Penetration Testing",
    "Incident Response",
    "Cloud Security",
    "DevOps Practices",
    "CI/CD Pipelines",
    "Container Orchestration",
    "Serverless Architecture",
    "Data Lake Implementation",
    "Data Mesh Architecture",
    "Real-time Data Processing",
    "Data Governance",
    "Data Quality Management",
    "Data Privacy Compliance",
    "API Development",
    "Microservices Architecture",
    "Agile Project Management",
    "Scrum Methodology",
    "SQL Database Management",
    "NoSQL Database Management",
    "GraphQL API Development",
]

# Business domains mentioned in the employee experience sentence.
word_domains = [
    "healthcare",
    "finance",
    "retail",
    "insurance",
    "telecom",
]

# Kinds of work a client project description can refer to.
worked_on = [
    "cloud migration",
    "fraud detection",
    "web application development",
    "automation pipeline",
    "Full Stack Development",
    "data analytics",
    "machine learning model deployment",
    "cybersecurity solutions",
    "AI research",
    "IoT solutions",
    "blockchain integration",
    "big data processing",
    "mobile app development",
    "network security",
    "penetration testing",
    "incident response",
    "data visualization",
    "data engineering",
    "data warehousing",
    "business intelligence",
    "robotic process automation",
    "edge computing",
    "quantum computing",
    "UI/UX design",
    "network infrastructure",
    "API development",
    "microservices architecture",
    "agile project management",
    "scrum methodology",
    "SQL database management",
    "NoSQL database management",
    "GraphQL API development",
    "RESTful API design",
    "data governance",
    "data quality management",
    "data privacy compliance",
    "cloud security",
    "DevOps practices",
    "CI/CD pipelines",
    "container orchestration",
    "serverless architecture",
    "data lake implementation",
    "data mesh architecture",
    "real-time data processing",
]

# --- Generate Employee Master ---
# One row per synthetic employee. Each row is assembled in column order, so
# the random draws happen in the sequence: name, department, experience,
# location.
num_employees = 1000
employee_data = []
for seq in range(num_employees):
    employee_data.append(
        [
            f"E{101 + seq}",                  # IDs run E101 .. E(100 + num_employees)
            fake.name(),
            random.choice(departments),
            round(random.uniform(1, 15), 1),  # years of experience, one decimal
            None,                             # Current_Project_ID is left unassigned here
            random.choice(locations),
        ]
    )

df_emp = pd.DataFrame(
    employee_data,
    columns=[
        "Employee_ID",
        "Name",
        "Department",
        "Years_Experience",
        "Current_Project_ID",
        "Location",
    ],
)
df_emp.to_csv("../data/raw/employee_master.csv", index=False)

# --- Generate Employee Experience ---
# One free-text experience sentence per employee, keyed by Employee_ID.
experience_data = []
for emp in df_emp["Employee_ID"]:
    # random.sample draws three *distinct* skills. Three independent
    # random.choice calls could repeat a skill within the same sentence
    # (e.g. "Python, Python, and AWS"); sampling without replacement also
    # matches how the client-project skills are drawn.
    skill_a, skill_b, skill_c = random.sample(skills_pool, 3)
    exp_text = (
        f"Worked on projects involving {skill_a}, {skill_b}, and {skill_c}. "
        f"Delivered solutions for {random.choice(word_domains)} domain."
    )
    experience_data.append([emp, exp_text])

df_exp = pd.DataFrame(experience_data, columns=["Employee_ID", "Experience_Text"])

df_exp.to_csv("../data/raw/employee_experience.csv", index=False)

# --- Generate Client Projects with aligned description and skills ---
# Each project gets three distinct skills that appear both in the free-text
# description and in the Required_Skills column, so the two always agree.
num_projects = 100
status_options = ["Open", "Closed"]
project_data = []
for offset in range(num_projects):
    proj_id = f"P{301 + offset}"  # IDs run P301 .. P(300 + num_projects)
    client = fake.company()

    # Draw the skills once and reuse them for both text fields
    first_skill, second_skill, third_skill = random.sample(skills_pool, 3)
    project_kind = random.choice(worked_on)

    desc = (f"Looking for expertise in {first_skill}, {second_skill}, and {third_skill} "
            f"to work on a {project_kind} project.")
    required_skills = ", ".join([first_skill, second_skill, third_skill])

    loc = random.choice(locations)
    status = random.choice(status_options)

    project_data.append([proj_id, client, desc, required_skills, loc, status])

df_proj = pd.DataFrame(
    project_data,
    columns=[
        "Project_ID",
        "Client_Name",
        "Project_Description",
        "Required_Skills",
        "Location",
        "Status",
    ],
)
df_proj.to_csv("../data/raw/client_projects.csv", index=False)

# Final confirmation once all three CSV files have been written
print("✅ Synthetic datasets generated: employee_master.csv, employee_experience.csv, client_projects.csv")


✅ Synthetic datasets generated: employee_master.csv, employee_experience.csv, client_projects.csv
