In [1]:
import os
import pandas as pd

# Source PREP directory that holds the cleaned CSV inputs
prep_dir = r"C:\Users\SujithaaR\Documents\FinalProject -DWH and DS\DWH\PREP"

# Target REPORT directory that receives the fact and dimension tables
report_dir = r"C:\Users\SujithaaR\Documents\FinalProject -DWH and DS\DWH\REPORT"

# Create both directories if they don't exist. REPORT is the one that matters:
# every to_csv call below writes into it and fails when the folder is missing.
for directory in (prep_dir, report_dir):
    os.makedirs(directory, exist_ok=True)

# Function to load CSV files into DataFrames
def load_csv(file_name, base_dir=None):
    """Load one CSV file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, relative to the directory being read.
    base_dir : str, optional
        Directory that contains the file. When omitted, the module-level
        ``prep_dir`` is used, which matches the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    # Resolve the directory at call time so a later change to prep_dir is honoured.
    directory = prep_dir if base_dir is None else base_dir
    return pd.read_csv(os.path.join(directory, file_name))

# Read each cleaned PREP extract into its own DataFrame.
# The targets and the file names are listed in the same order, so the n-th
# file is bound to the n-th name (the engagement data comes last).
(
    dim_user,
    dim_course,
    enrollment,
    quiz_data,
    feedback_data,
    comment_data,
    fact_engagement,
) = (
    load_csv(csv_name)
    for csv_name in (
        'cleaned_employees.csv',
        'cleaned_courses.csv',
        'cleaned_enrollment.csv',
        'cleaned_quiz_results.csv',
        'cleaned_feedback.csv',
        'cleaned_comments.csv',
        'cleaned_engagement.csv',
    )
)

# Preview the first rows of every loaded DataFrame as a quick sanity check
# (the engagement data is printed last).
for loaded_frame in (
    dim_user,
    dim_course,
    enrollment,
    quiz_data,
    feedback_data,
    comment_data,
    fact_engagement,
):
    print(loaded_frame.head())



   employee_id         username                        email  isAdmin  \
0            1              Ywu  samanthaperkins@example.net     True   
1            2       Jennifer27           wevans@example.net     True   
2            3         Vedwards        matthew60@example.net    False   
3            4  Gonzalessabrina        melissa00@example.net     True   
4            5        Jessica89          tonya60@example.org    False   

             department                       team  timeSpent  gender  
0       Data Management        Infrastructure Team      86049    Male  
1  Software Development     Technical Support Team      56812  Female  
2        Cloud Services  User Experience (Ux) Team      79179    Male  
3            It Support                Devops Team      63847    Male  
4  Software Development  User Experience (Ux) Team      10197    Male  
   course_id                                 title  \
0          1   Proactive demand-driven methodology   
1          2      Dow

CREATE THE FACT TABLE

Key attributes: enrollment IDs, user IDs, course IDs, and flags indicating whether a quiz was taken, feedback was given, and comments were made.

In [2]:
# Build the fact table: enrollment rows enriched with the engagement data.
join_keys = ['userId', 'enrollmentId', 'courseId']  # must be present in both DataFrames
fact_columns = ['userId', 'enrollmentId', 'courseId',
                'enrolledAt', 'progress', 'completed',
                'totalCount', 'completedCount',
                'isQuizTaken', 'isParticipated',
                'isFeedback', 'engagement']
flag_columns = ['isQuizTaken', 'isParticipated', 'isFeedback']

# Left join keeps every enrollment record, even without a matching engagement row;
# then keep only the columns that belong in the fact table.
fact_engagement_table = enrollment.merge(
    fact_engagement,
    on=join_keys,
    how='left'
)[fact_columns].copy()

# A row without a match ends up with missing (NaN) values. Normalise the flags
# to strict booleans so that a missing flag reads as False and the columns can
# be used directly as boolean masks later on.
fact_engagement_table[flag_columns] = fact_engagement_table[flag_columns].eq(True)

# Display the resulting fact engagement table
print("Fact Engagement Table:")
print(fact_engagement_table.head())

# Save the fact engagement table to a CSV file in the report directory
fact_engagement_table.to_csv(os.path.join(report_dir, 'fact_engagement.csv'), index=False)
print("Fact engagement table created and saved as 'fact_engagement.csv' in the REPORT directory!")

Fact Engagement Table:
   userId enrollmentId  courseId  enrolledAt  progress  completed  totalCount  \
0     158     rQS0kpxX        16  2024-10-07        71      False           9   
1     130     3AStlgcU        13  2024-10-07        76      False           9   
2     172     E2UsVdTq        18  2024-10-07        14      False          10   
3     271     fh9DWDHa        13  2024-10-07        60      False          11   
4     246     tmdGzFZb        20  2024-10-07        30      False          17   

   completedCount  isQuizTaken  isParticipated  isFeedback  engagement  
0               8        False            True        True           0  
1               7         True            True       False           0  
2               8         True            True       False           0  
3              10        False            True        True           1  
4               4         True            True       False           0  
Fact engagement table created and saved as 'fact_eng

CREATE THE DIMENSION TABLES

Create and save the dimension tables

In [3]:
# Write the employee and course dimensions to the REPORT directory without the index column.
for dimension_frame, dimension_file in (
    (dim_user, 'dim_employee.csv'),
    (dim_course, 'dim_course.csv'),
):
    dimension_frame.to_csv(os.path.join(report_dir, dimension_file), index=False)


Create and save the Quiz dimension table only if a quiz was taken

In [4]:
# Enrollment IDs whose fact row flags a taken quiz. `.eq(True)` always yields a
# strict boolean mask, so a missing (NaN) flag counts as "not taken" instead of
# making the boolean indexing raise.
quiz_taken_mask = fact_engagement_table['isQuizTaken'].eq(True)
quiz_enrollment_ids = fact_engagement_table.loc[quiz_taken_mask, 'enrollmentId']

# Keep only the quiz results of those enrollments, restricted to the dimension columns.
dim_quiz = quiz_data.loc[
    quiz_data['enrollmentId'].isin(quiz_enrollment_ids),
    ['quizId', 'userId', 'courseId', 'enrollmentId', 'totalScore', 'obtainedScore', 'date'],
]

dim_quiz.to_csv(os.path.join(report_dir, 'dim_quiz.csv'), index=False)

Create and save the Feedback dimension table only if feedback was given

In [5]:
# Enrollment IDs whose fact row flags given feedback. `.eq(True)` always yields a
# strict boolean mask, so a missing (NaN) flag counts as "no feedback" instead of
# making the boolean indexing raise.
feedback_given_mask = fact_engagement_table['isFeedback'].eq(True)
feedback_enrollment_ids = fact_engagement_table.loc[feedback_given_mask, 'enrollmentId']

# Keep only the feedback rows of those enrollments, restricted to the dimension columns.
dim_feedback = feedback_data.loc[
    feedback_data['enrollmentId'].isin(feedback_enrollment_ids),
    ['feedbackId', 'courseId', 'userId', 'enrollmentId', 'overallSatisfaction',
     'contentQuality', 'instructorEffectiveness', 'comments', 'createdAt'],
]

dim_feedback.to_csv(os.path.join(report_dir, 'dim_feedback.csv'), index=False)

Create and save the Comments dimension table only if comments were made

In [6]:
# Enrollment IDs whose fact row flags participation. `.eq(True)` always yields a
# strict boolean mask, so a missing (NaN) flag counts as "did not participate"
# instead of making the boolean indexing raise.
participated_mask = fact_engagement_table['isParticipated'].eq(True)
participating_enrollment_ids = fact_engagement_table.loc[participated_mask, 'enrollmentId']

# Keep only the comments of those enrollments, restricted to the dimension columns.
dim_comments = comment_data.loc[
    comment_data['enrollmentId'].isin(participating_enrollment_ids),
    ['commentId', 'userId', 'enrollmentId', 'courseId', 'content', 'createdAt'],
]

dim_comments.to_csv(os.path.join(report_dir, 'dim_comments.csv'), index=False)