In [20]:
import os
import pandas as pd

# Source directory: the cleaned PREP-layer CSV files are read from here
prep_dir = r"C:\Users\SujithaaR\Documents\FinalProject- DWH\PREP"

# Target directory: the REPORT-layer fact and dimension CSV files are written here
report_dir = r"C:\Users\SujithaaR\Documents\FinalProject- DWH\REPORT"

# Make sure both directories exist before any file I/O happens.
# The REPORT directory is the important one: every later to_csv() call writes
# into it, and pandas does not create missing parent directories, so without
# this the first save fails on a machine where REPORT has not been created yet.
os.makedirs(prep_dir, exist_ok=True)
os.makedirs(report_dir, exist_ok=True)

# Helper that reads a single PREP-layer CSV file into a DataFrame
def load_csv(file_name):
    """Read a CSV file located in the PREP directory.

    Args:
        file_name: Name of the CSV file, joined onto the module-level
            ``prep_dir`` to form the full path.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(os.path.join(prep_dir, file_name))

# Read every cleaned PREP CSV into its own DataFrame.
# The file names are listed in the same order as the target variables.
dim_user, dim_course, fact_enrollment, quiz_data, feedback_data, comment_data = (
    load_csv(csv_name)
    for csv_name in (
        'cleaned_employees.csv',
        'cleaned_courses.csv',
        'cleaned_enrollment.csv',
        'cleaned_quiz_results.csv',
        'cleaned_feedback.csv',
        'cleaned_comments.csv',
    )
)


CREATE THE FACT TABLE

Key attributes: enrollment IDs, user IDs, course IDs, and flags indicating whether a quiz was taken, feedback was given, and comments were made.

In [21]:
# Build the fact table from the enrollment data in one pass:
#   - cast the quiz and feedback flags to bool,
#   - derive isCommented from the isParticipated column,
#   - keep only the columns that belong in the fact table, in output order.
# assign() returns a new frame, so fact_enrollment itself is left untouched.
fact_table = fact_enrollment.assign(
    isQuizTaken=fact_enrollment['isQuizTaken'].astype(bool),
    isFeedback=fact_enrollment['isFeedback'].astype(bool),
    isCommented=fact_enrollment['isParticipated'].astype(bool),
)[[
    'enrollmentId', 'userId', 'courseId', 'enrolledAt', 'progress',
    'completed', 'totalCount', 'completedCount',
    'isQuizTaken', 'isFeedback', 'isCommented',
]]

display(fact_table)

Unnamed: 0,enrollmentId,userId,courseId,enrolledAt,progress,completed,totalCount,completedCount,isQuizTaken,isFeedback,isCommented
0,rQS0kpxX,158,16,2024-10-07,71,False,9,8,False,True,True
1,3AStlgcU,130,13,2024-10-07,76,False,9,7,True,False,True
2,E2UsVdTq,172,18,2024-10-07,14,False,10,8,True,False,True
3,fh9DWDHa,271,13,2024-10-07,60,False,11,10,False,True,True
4,tmdGzFZb,246,20,2024-10-07,30,False,17,4,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
4995,BhMALCFI,226,18,2024-10-07,35,False,13,10,True,True,True
4996,bSPIrNNP,22,1,2024-10-07,45,False,15,11,True,False,False
4997,L9WIvrWd,110,8,2024-10-07,74,False,5,4,False,True,True
4998,n12qJBay,157,3,2024-10-07,94,False,6,2,True,True,True


In [22]:
# Write the fact table to the REPORT directory, leaving out the DataFrame index
fact_table.to_csv(
    path_or_buf=os.path.join(report_dir, 'fact_enrollment.csv'),
    index=False,
)

CREATE THE DIMENSION TABLES

Create and save the dimension tables

In [25]:
# Employee dimension: the user data is saved unchanged under the name dim_employee.csv
dim_user.to_csv(
    path_or_buf=os.path.join(report_dir, 'dim_employee.csv'),
    index=False,
)

# Course dimension: the course data is saved unchanged
dim_course.to_csv(
    path_or_buf=os.path.join(report_dir, 'dim_course.csv'),
    index=False,
)


Create and save the Quiz dimension table, keeping only the quiz rows whose enrollment is flagged as having taken a quiz

In [26]:
# Keep only the quiz rows whose enrollmentId appears among the fact-table
# enrollments flagged isQuizTaken, and project to the quiz dimension columns.
dim_quiz = quiz_data.loc[
    quiz_data['enrollmentId'].isin(
        fact_table.loc[fact_table['isQuizTaken'], 'enrollmentId']
    ),
    ['quizId', 'userId', 'courseId', 'enrollmentId', 'totalScore', 'obtainedScore', 'date'],
]

# Persist the quiz dimension without the DataFrame index
dim_quiz.to_csv(
    path_or_buf=os.path.join(report_dir, 'dim_quiz.csv'),
    index=False,
)

Create and save the Feedback dimension table, keeping only the feedback rows whose enrollment is flagged as having given feedback

In [27]:
# Keep only the feedback rows whose enrollmentId appears among the fact-table
# enrollments flagged isFeedback, and project to the feedback dimension columns.
dim_feedback = feedback_data.loc[
    feedback_data['enrollmentId'].isin(
        fact_table.loc[fact_table['isFeedback'], 'enrollmentId']
    ),
    [
        'feedbackId', 'courseId', 'userId', 'enrollmentId',
        'overallSatisfaction', 'contentQuality', 'instructorEffectiveness',
        'comments', 'createdAt',
    ],
]

# Persist the feedback dimension without the DataFrame index
dim_feedback.to_csv(
    path_or_buf=os.path.join(report_dir, 'dim_feedback.csv'),
    index=False,
)

Create and save the Comments dimension table, keeping only the comment rows whose enrollment is flagged as having commented

In [28]:
# Keep only the comment rows whose enrollmentId appears among the fact-table
# enrollments flagged isCommented, and project to the comment dimension columns.
dim_comments = comment_data.loc[
    comment_data['enrollmentId'].isin(
        fact_table.loc[fact_table['isCommented'], 'enrollmentId']
    ),
    ['commentId', 'userId', 'enrollmentId', 'courseId', 'content', 'createdAt'],
]

# Persist the comments dimension without the DataFrame index
dim_comments.to_csv(
    path_or_buf=os.path.join(report_dir, 'dim_comments.csv'),
    index=False,
)