In [11]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker
fake = Faker()

# Predefined roles with specific titles
roles_mapping = {
    'Data Scientist': ['junior data scientist', 'senior data scientist'],
    'Data Engineer': ['junior data engineer', 'senior data engineer'],
    'Software Engineer': ['junior software engineer', 'senior software engineer'],
    'Data Analyst': ['junior data analyst', 'senior data analyst']
}

# Expanded skills based on role
skills_mapping = {
    'junior data scientist': ['Python', 'SQL', 'Data Visualization'],
    'senior data scientist': ['Machine Learning', 'TensorFlow', 'Natural Language Processing'],
    'junior data engineer': ['Python', 'ETL', 'Data Warehousing'],
    'senior data engineer': ['Hadoop', 'Spark', 'Data Lakes'],
    'junior software engineer': ['JavaScript', 'HTML', 'CSS'],
    'senior software engineer': ['Microservices', 'DevOps', 'Cloud Computing'],
    'junior data analyst': ['Excel', 'Tableau', 'SQL'],
    'senior data analyst': ['Statistics', 'Predictive Analytics', 'Data Modeling']
}

# Helper function to get a random role
def get_random_role():
    role = random.choice(list(roles_mapping.keys()))
    return random.choice(roles_mapping[role])  # Randomly choose junior or senior role

# Helper function to generate a random skill based on role
def get_random_skill(role):
    available_skills = skills_mapping[role]
    return random.choice(available_skills)  # Select one random skill

# Sample employee DataFrame for generating employee IDs
def create_sample_employee_df(num_records):
    return pd.DataFrame({
        'employee_id': range(1, num_records + 1)
    })

# Generate employee IDs DataFrame
employee_df = create_sample_employee_df(500)

# Function to generate employee data
def generate_employee_data(num_records):
    employee_data = []
    
    for _ in range(num_records):
        role = get_random_role()
        skill = get_random_skill(role)
        employee_id = random.choice(employee_df['employee_id'].values)
        employee = {
            'employee_id': employee_id,
            'employee_name': fake.name(),
            'current_role': role,
            'skills': skill,  # Assign only one skill
            'dateofbirth': fake.date_of_birth(minimum_age=18, maximum_age=70),
            'dateofjoining': fake.date_between(start_date='-20y', end_date='today'),
            'email': fake.email(),
            'mobile_number': fake.phone_number(),
            'Address': fake.address().replace('\n', ', ')
        }
        employee_data.append(employee)

    return employee_data

employee_data = generate_employee_data(500)

# Convert to DataFrame
df = pd.DataFrame(employee_data)

# Save to CSV
df.to_csv('employee_data.csv', index=False)

print('Employee dataset generated and saved to employee_data.csv')


Employee dataset generated and saved to employee_data.csv


In [12]:
df.head()

Unnamed: 0,employee_id,employee_name,current_role,skills,dateofbirth,dateofjoining,email,mobile_number,Address
0,254,Brian Jackson,senior data engineer,Data Lakes,1960-03-19,2022-02-16,georgeangela@example.net,769.820.3225x835,"715 Cunningham Forges Apt. 824, West Brandonsi..."
1,219,Mark Nielsen,senior data scientist,Natural Language Processing,1985-06-05,2010-06-30,milesmary@example.org,7635554079,"31583 Eric Mill Suite 677, Riveraton, OK 07085"
2,134,David Anderson,junior data analyst,Excel,1992-01-14,2006-05-25,ajones@example.net,724.899.0763x041,"PSC 4715, Box 9334, APO AA 03780"
3,295,Tyler Allen,senior software engineer,DevOps,1965-09-24,2008-11-07,dianamontes@example.net,3927339776,"2816 Stephen Island Suite 455, Sarahborough, S..."
4,432,Trevor Torres,senior software engineer,DevOps,1972-03-03,2019-09-15,devon53@example.org,001-707-865-5738,"742 Perez Stream Apt. 654, Rodriguezville, IN ..."


In [1]:
import pandas as pd
from pymongo import MongoClient

# Replace the URI below with your actual connection string
uri = "mongodb://localhost:27017/"
client = MongoClient(uri)

# Replace 'your_database' and 'your_collection' with your actual database and collection names
db = client['Data']
collection = db['Employee']


# Query the data
data = list(collection.find())

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# If your data has an '_id' field, you might want to drop it
if '_id' in df.columns:
    df.drop(columns=['_id'], inplace=True)

# Now you can work with the DataFrame
print(df.head())


   employee_id   employee_name              current_role  \
0          254   Brian Jackson      senior data engineer   
1          219    Mark Nielsen     senior data scientist   
2          134  David Anderson       junior data analyst   
3          295     Tyler Allen  senior software engineer   
4          432   Trevor Torres  senior software engineer   

                        skills dateofbirth dateofjoining  \
0                   Data Lakes  1960-03-19    2022-02-16   
1  Natural Language Processing  1985-06-05    2010-06-30   
2                        Excel  1992-01-14    2006-05-25   
3                       DevOps  1965-09-24    2008-11-07   
4                       DevOps  1972-03-03    2019-09-15   

                      email     mobile_number  \
0  georgeangela@example.net  769.820.3225x835   
1     milesmary@example.org        7635554079   
2        ajones@example.net  724.899.0763x041   
3   dianamontes@example.net        3927339776   
4       devon53@example.org  001-

In [2]:
df.to_csv('employee_data.csv', index=False)

print('Employee dataset generated and saved to employee_data.csv')

Employee dataset generated and saved to employee_data.csv
