In [1]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Load CSV files into DataFrames
courses_df = pd.read_csv('courses.csv')
performance_df = pd.read_csv('performance.csv')
enrollment_df = pd.read_csv('enrollment.csv')
users_df = pd.read_csv('users.csv')
learning_paths_df = pd.read_csv('learningPath.csv')

# Preview loaded data
print(courses_df.head())
print(performance_df.head())
print(enrollment_df.head())
print(users_df.head())
print(learning_paths_df.head())


   id                 title                          description  duration  \
0   1       Cloud Computing       Build modern web applications.         8   
1   2       Web Development     Learn the basics of programming.        42   
2   3  Intro to Programming     Learn the basics of programming.        31   
3   4       Cloud Computing      Dive deep into data structures.        84   
4   5       Data Structures  Introduction to cloud technologies.        72   

     difficulty  rating            domain  
0      advanced     3.3      Data Science  
1      beginner     3.7   Web Development  
2      beginner     1.0  Computer Science  
3      advanced     1.6  Computer Science  
4  intermediate     3.8   Cloud Computing  
   id  user_id  enrollment_id  score grade  progress  certificate_earned
0   1       39             15     73     D        24               False
1   2       80             92     60     A        92                True
2   3       48             44     68     D      

In [4]:
# Remove duplicates
courses_df.drop_duplicates(inplace=True)
performance_df.drop_duplicates(inplace=True)
enrollment_df.drop_duplicates(inplace=True)
users_df.drop_duplicates(inplace=True)
learning_paths_df.drop_duplicates(inplace=True)

# Handle missing values (basic example)
courses_df.fillna('Unknown', inplace=True)
performance_df.fillna(0, inplace=True)  # assuming 0 for missing scores/progress
# enrollment_df['completion_date'].fillna('Incomplete', inplace=True)

# Standardize string formats
courses_df['domain'] = courses_df['domain'].str.title()
learning_paths_df['domain'] = learning_paths_df['domain'].str.title()

# Example: Preview cleaned data
print(courses_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           100 non-null    int64  
 1   title        100 non-null    object 
 2   description  100 non-null    object 
 3   duration     100 non-null    int64  
 4   difficulty   100 non-null    object 
 5   rating       100 non-null    float64
 6   domain       100 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 5.6+ KB
None


In [5]:
# Join enrollments with users and courses
enriched_enrollments = pd.merge(enrollment_df, users_df, left_on='user_id', right_on='id', how='inner')
enriched_enrollments = pd.merge(enriched_enrollments, courses_df, left_on='course_id', right_on='id', how='inner')

# Join performance with enrollments
performance_enriched = pd.merge(performance_df, enrollment_df, left_on='enrollment_id', right_on='id', how='inner')

# Preview the joined data
print(enriched_enrollments.head())
print(performance_enriched.head())


   id_x  user_id  course_id enrollment_date completion_date  id_y  \
0     1       19         90      2024-06-26      2024-07-15    19   
1     4       38         55      2023-11-29      Incomplete    38   
2     6       14         42      2023-10-29      2023-12-16    14   
3     8       44         24      2024-08-03      Incomplete    44   
4     9       53         56      2024-06-12      Incomplete    53   

              name                       email  password  role  id  \
0         Mia Wong         miawong@example.com  1234@Bcd  user  90   
1  Frank Underwood  frankunderwood@example.com  1234@Bcd  user  55   
2       Hannah Lee       hannahlee@example.com  1234@Bcd  user  42   
3   Luke Skywalker   lukeskywalker@example.com  1234@Bcd  user  24   
4      Uma Thurman      umathurman@example.com  1234@Bcd  user  56   

                  title                                       description  \
0      Machine Learning                  Learn the basics of programming.   
1       Da

In [6]:
# Example: Calculate completion rate
total_enrollments = enriched_enrollments.shape[0]
completed_courses = enriched_enrollments[enriched_enrollments['completion_date'] != 'Incomplete'].shape[0]
completion_rate = (completed_courses / total_enrollments) * 100

# Example: Calculate certificate percentage in performance data
certificate_rate = (performance_enriched['certificate_earned'].sum() / performance_enriched.shape[0]) * 100

print(f"Completion Rate: {completion_rate:.2f}%")
print(f"Certificate Earning Rate: {certificate_rate:.2f}%")


Completion Rate: 39.58%
Certificate Earning Rate: 52.00%


# Reporting

In [None]:
%pip install matplotlib seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count completed courses per user
user_completion = enriched_enrollments.groupby('user_id').apply(lambda x: (x['completion_date'] != 'Incomplete').sum())

# Bar plot for user completion rates
plt.figure(figsize=(10, 6))
sns.barplot(x=user_completion.index, y=user_completion.values)
plt.title('Course Completion Count by User')
plt.xlabel('User ID')
plt.ylabel('Number of Completed Courses')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Merge performance with learning paths
performance_with_paths = pd.merge(performance_enriched, learning_paths_df, left_on='course_id', right_on='id')

# Calculate average score by learning path
average_score_by_path = performance_with_paths.groupby('title')['score'].mean()

# Bar plot for average score by learning path
plt.figure(figsize=(12, 6))
sns.barplot(x=average_score_by_path.index, y=average_score_by_path.values)
plt.title('Average Score by Learning Path')
plt.xlabel('Learning Path')
plt.ylabel('Average Score')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
# Count number of enrollments per learning path
path_enrollments = enriched_enrollments.groupby('title').size()

# Pie chart for learning path popularity
plt.figure(figsize=(8, 8))
plt.pie(path_enrollments, labels=path_enrollments.index, autopct='%1.1f%%', startangle=140)
plt.title('Popularity of Learning Paths (Enrollments)')
plt.tight_layout()
plt.show()


In [None]:
# Calculate average score by user
average_score_by_user = performance_enriched.groupby('user_id')['score'].mean().sort_values(ascending=False)

# Get top 5 users
top_5_users = average_score_by_user.head(5)

print("Top 5 Users by Average Score:")
print(top_5_users)

# Bar plot for top 5 users
plt.figure(figsize=(8, 6))
sns.barplot(x=top_5_users.index, y=top_5_users.values)
plt.title('Top 5 Users by Average Score')
plt.xlabel('User ID')
plt.ylabel('Average Score')
plt.tight_layout()
plt.show()


In [None]:
# Count of certificate earned vs not earned
certificate_earned_count = performance_df['certificate_earned'].value_counts()

# Pie chart for certificates earned vs not earned
plt.figure(figsize=(8, 8))
plt.pie(certificate_earned_count, labels=['Earned', 'Not Earned'], autopct='%1.1f%%', startangle=140)
plt.title('Certificate Earned vs Not Earned')
plt.tight_layout()
plt.show()


In [None]:
# Group by domain and calculate average duration
average_duration_by_domain = courses_df.groupby('domain')['duration'].mean()

# Bar plot for average duration by domain
plt.figure(figsize=(12, 6))
sns.barplot(x=average_duration_by_domain.index, y=average_duration_by_domain.values)
plt.title('Average Course Duration by Domain')
plt.xlabel('Domain')
plt.ylabel('Average Duration (hours)')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
# Scatter plot of progress vs. score
plt.figure(figsize=(10, 6))
sns.scatterplot(data=performance_df, x='progress', y='score', hue='certificate_earned')
plt.title('Course Progress vs Score')
plt.xlabel('Progress (%)')
plt.ylabel('Score')
plt.legend(title='Certificate Earned', loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Count of enrollments per course
enrollment_count = enriched_enrollments.groupby('course_id').size()

# Merge with course titles for better visualization
course_enrollment = pd.merge(enrollment_count, courses_df[['id', 'title']], left_on='course_id', right_on='id')

# Bar plot for course enrollments
plt.figure(figsize=(12, 6))
sns.barplot(x=course_enrollment['title'], y=course_enrollment[0])
plt.title('Course Popularity by Enrollments')
plt.xlabel('Course Title')
plt.ylabel('Number of Enrollments')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
# Merge performance data with course data to get difficulty level
performance_with_difficulty = pd.merge(performance_df, courses_df[['id', 'difficulty']], left_on='course_id', right_on='id')

# Calculate average score by difficulty level
average_score_by_difficulty = performance_with_difficulty.groupby('difficulty')['score'].mean()

# Bar plot for average score by difficulty level
plt.figure(figsize=(8, 6))
sns.barplot(x=average_score_by_difficulty.index, y=average_score_by_difficulty.values)
plt.title('Average Score by Course Difficulty')
plt.xlabel('Difficulty Level')
plt.ylabel('Average Score')
plt.tight_layout()
plt.show()


In [None]:
# Convert enrollment date to datetime if necessary
enriched_enrollments['enrollment_date'] = pd.to_datetime(enriched_enrollments['enrollment_date'])

# Group by month or year to analyze trends
enrollments_over_time = enriched_enrollments.groupby(enriched_enrollments['enrollment_date'].dt.to_period('M')).size()

# Line plot for enrollments over time
plt.figure(figsize=(10, 6))
enrollments_over_time.plot(kind='line')
plt.title('User Enrollments Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Enrollments')
plt.tight_layout()
plt.show()
