In [6]:
import pandas as pd
import random
from faker import Faker
from datetime import timedelta

# Initialize Faker
fake = Faker()

# Seed both random sources so a fresh "Restart Kernel -> Run All" regenerates
# the same random picks instead of a different dataset on every run.
# NOTE(review): dates drawn relative to 'today' still shift with the day the
# notebook is executed.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)

# Load existing employee and course data
employee_df = pd.read_csv('employee_data.csv')  # must contain an 'employee_id' column
course_df = pd.read_csv('course_data.csv')      # must contain a 'course_id' column

# Canned feedback sentences, grouped by sentiment.
# One sentence is picked at random from the list matching the rating band.

# Ratings of 4.0 and above
positive_feedback = [
    "Excellent course, highly recommend!",
    "Very informative and well structured.",
    "The instructor was great, learned a lot!",
    "I really enjoyed this course.",
    "The materials provided were very helpful.",
]

# Ratings from 2.5 up to (but not including) 4.0
neutral_feedback = [
    "The course was okay, not too bad.",
    "It was a decent experience overall.",
    "Average course with some useful content.",
    "Not much to say, it was a standard course.",
    "The course met my expectations.",
]

# Ratings below 2.5
negative_feedback = [
    "Not worth the time, very disappointing.",
    "The content was outdated and irrelevant.",
    "I didn't learn anything new from this course.",
    "Poorly organized, I expected better.",
    "I would not recommend this course.",
]

# Function to generate course enrollment data
def generate_course_enrollment(employee_df, course_df, num_records):
    """Build a list of synthetic course-enrollment records.

    Every record pairs a randomly chosen employee with a randomly chosen
    course and attaches randomly generated dates, progress, a rating and a
    feedback sentence whose sentiment matches the rating.

    Args:
        employee_df: DataFrame with an 'employee_id' column to sample from.
        course_df: DataFrame with a 'course_id' column to sample from.
        num_records: Number of records to create. Enrollment IDs run from 1
            to ``num_records``; zero or a negative value yields an empty list.

    Returns:
        A list of dicts with the keys 'enrollment_id', 'employee_id',
        'course_id', 'enrolled_date', 'completed_date', 'progress',
        'ratings' and 'feedback'.

    Raises:
        ValueError: If records are requested but either ID column is empty.
    """
    if num_records <= 0:
        return []

    # Pull the ID arrays once; they do not change between iterations.
    employee_ids = employee_df['employee_id'].values
    course_ids = course_df['course_id'].values
    if len(employee_ids) == 0 or len(course_ids) == 0:
        raise ValueError(
            "employee_df and course_df must each contain at least one row "
            "to generate enrollment records"
        )

    enrollment_data = []

    for enrollment_id in range(1, num_records + 1):
        employee_id = random.choice(employee_ids)  # Random employee
        course_id = random.choice(course_ids)      # Random course

        # Enrollment date within the last 2 years
        enrolled_date = fake.date_between(start_date='-2y', end_date='today')
        # Completion date falls 30 to 180 days after enrollment
        completion_days = random.randint(30, 180)
        # NOTE(review): completed_date is assigned regardless of progress and
        # can fall in the future for recent enrollments -- confirm whether it
        # should be empty for records with progress below 100.
        completed_date = enrolled_date + timedelta(days=completion_days)

        progress = random.randint(0, 100)         # Progress percentage, 0 to 100 inclusive
        ratings = round(random.uniform(1, 5), 1)  # Rating from 1.0 to 5.0, one decimal place

        # Pick a feedback sentence whose sentiment matches the rating band
        if ratings >= 4.0:
            feedback = random.choice(positive_feedback)
        elif ratings >= 2.5:
            feedback = random.choice(neutral_feedback)
        else:
            feedback = random.choice(negative_feedback)

        enrollment_data.append({
            'enrollment_id': enrollment_id,  # Sequential ID starting at 1
            'employee_id': employee_id,
            'course_id': course_id,
            'enrolled_date': enrolled_date,
            'completed_date': completed_date,
            'progress': progress,
            'ratings': ratings,
            'feedback': feedback
        })

    return enrollment_data

# Build 20,000 synthetic enrollment records from the loaded employee and course tables
enrollment_data = generate_course_enrollment(
    employee_df=employee_df,
    course_df=course_df,
    num_records=20000,
)

# Turn the list of record dicts into a table, one row per enrollment
enrollment_df = pd.DataFrame(data=enrollment_data)

# Write the table to disk, leaving out the DataFrame index
enrollment_df.to_csv(path_or_buf='course_enrollment_data.csv', index=False)

print("Course enrollment dataset generated and saved to course_enrollment_data.csv")


Course enrollment dataset generated and saved to course_enrollment_data.csv


In [7]:
# Preview the first five generated rows
enrollment_df.head(n=5)

Unnamed: 0,enrollment_id,employee_id,course_id,enrolled_date,completed_date,progress,ratings,feedback
0,1,255,5,2024-08-29,2024-10-07,59,1.7,I would not recommend this course.
1,2,391,4,2024-05-11,2024-09-26,0,4.8,The materials provided were very helpful.
2,3,384,3,2023-01-11,2023-03-13,83,3.0,The course met my expectations.
3,4,141,8,2023-07-09,2023-08-21,59,3.8,"Not much to say, it was a standard course."
4,5,163,5,2022-12-29,2023-02-25,64,3.5,"Not much to say, it was a standard course."


In [2]:
import pandas as pd
from pymongo import MongoClient

# Connection string for the MongoDB server (a local instance on the default port);
# point it at another server by editing this value.
uri = "mongodb://localhost:27017/"
client = MongoClient(uri)

# Enrollment documents are read from the 'Enroll' collection of the 'Data' database
db = client['Data']
collection = db['Enroll']


# Fetch every document in the collection into memory
data = list(collection.find())

# Build a DataFrame from the documents and discard the '_id' column when it is
# present; errors='ignore' makes the drop a no-op if the column is absent
df = pd.DataFrame(data).drop(columns=['_id'], errors='ignore')

# Show the first rows as a quick check of what was loaded
print(df.head())


   enrollment_id  employee_id  course_id enrolled_date completed_date  \
0              1          255          5    2024-08-29     2024-10-07   
1              2          391          4    2024-05-11     2024-09-26   
2              3          384          3    2023-01-11     2023-03-13   
3              4          141          8    2023-07-09     2023-08-21   
4              5          163          5    2022-12-29     2023-02-25   

   progress  ratings                                    feedback  
0        59      1.7          I would not recommend this course.  
1         0      4.8   The materials provided were very helpful.  
2        83      3.0             The course met my expectations.  
3        59      3.8  Not much to say, it was a standard course.  
4        64      3.5  Not much to say, it was a standard course.  


In [4]:
# Write the DataFrame to a CSV file, leaving out the DataFrame index
df.to_csv(path_or_buf='course_enrollment_data.csv', index=False)

print("Course enrollment dataset generated and saved to course_enrollment_data.csv")

Course enrollment dataset generated and saved to course_enrollment_data.csv
