In [1]:
import pandas as pd


In [4]:
students_data = pd.read_csv('cleaned_students_dataset.csv')
course_activity_data = pd.read_csv('cleaned_course_activity.csv')
feedback_data = pd.read_csv('cleaned_feedback.csv')

In [5]:
print("Students Dataset Info:")
print(students_data.info())
print("\nStudents Dataset Description:")
print(students_data.describe())

Students Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Student_ID      100 non-null    object
 1   Name            100 non-null    object
 2   Age             100 non-null    int64 
 3   Gender          100 non-null    object
 4   Location        100 non-null    object
 5   Enrolment_Date  100 non-null    object
dtypes: int64(1), object(5)
memory usage: 4.8+ KB
None

Students Dataset Description:
              Age
count  100.000000
mean    25.480000
std      4.835642
min     18.000000
25%     21.000000
50%     25.000000
75%     29.000000
max     34.000000


In [10]:
# Calculate overall average completion rate
average_completion_rate = course_activity_data['Completion_Percentage'].mean()
print(f"Overall Average Completion Rate: {average_completion_rate:.2f}%")

Overall Average Completion Rate: 54.78%


In [11]:
# Identify the course with the highest and lowest average engagement time
average_engagement_time = course_activity_data.groupby('Course_ID')['Time_Spent_Minutes'].mean()

highest_engagement_course = average_engagement_time.idxmax()
lowest_engagement_course = average_engagement_time.idxmin()
highest_engagement_time = average_engagement_time.max()
lowest_engagement_time = average_engagement_time.min()

print(f"Course with Highest Engagement: {highest_engagement_course} ({highest_engagement_time:.2f} minutes)")
print(f"Course with Lowest Engagement: {lowest_engagement_course} ({lowest_engagement_time:.2f} minutes)")

Course with Highest Engagement: DM101 (102.43 minutes)
Course with Lowest Engagement: PY202 (93.90 minutes)


In [14]:
# Calculate average engagement time by age group
bins = [0, 18, 25, 35, 45, 55, 65, 100]
labels = ['<18', '18-25', '26-35', '36-45', '46-55', '56-65', '65+']
students_data['Age_Group'] = pd.cut(students_data['Age'], bins=bins, labels=labels)

merged_data = pd.merge(course_activity_data, students_data[['Student_ID', 'Age_Group']], on='Student_ID')

engagement_by_age_group = merged_data.groupby('Age_Group')['Time_Spent_Minutes'].mean()
print("\nAverage Engagement Time by Age Group:")
print(engagement_by_age_group)


Average Engagement Time by Age Group:
Age_Group
<18      107.102041
18-25     99.675958
26-35     95.362229
36-45           NaN
46-55           NaN
56-65           NaN
65+             NaN
Name: Time_Spent_Minutes, dtype: float64


  engagement_by_age_group = merged_data.groupby('Age_Group')['Time_Spent_Minutes'].mean()


In [13]:
# Calculate average feedback rating per course
average_feedback_rating = feedback_data.groupby('Course_ID')['Rating'].mean()
print("\nAverage Feedback Rating per Course:")
print(average_feedback_rating)


Average Feedback Rating per Course:
Course_ID
DM101    2.900000
PY202    3.277778
UX303    2.923077
WD404    2.789474
Name: Rating, dtype: float64


In [15]:
# Calculate correlation
merged_feedback_completion = pd.merge(feedback_data, course_activity_data[['Course_ID', 'Completion_Percentage']], on='Course_ID')

correlation = merged_feedback_completion['Completion_Percentage'].corr(merged_feedback_completion['Rating'])
print(f"\nCorrelation between Completion Rate and Feedback Rating: {correlation:.2f}")


Correlation between Completion Rate and Feedback Rating: 0.00


In [18]:
#Identify Top 3 Student Segments Based on Engagement and Satisfaction
engagement_satisfaction = course_activity_data.merge(feedback_data[['Course_ID', 'Rating']], on='Course_ID')
engagement_satisfaction = engagement_satisfaction.merge(students_data[['Student_ID']], on='Student_ID')

engagement_summary = engagement_satisfaction.groupby('Student_ID').agg({
    'Time_Spent_Minutes': 'mean',
    'Rating': 'mean'
}).reset_index()

engagement_summary['Engagement_Satisfaction_Score'] = engagement_summary['Time_Spent_Minutes'] * engagement_summary['Rating']
top_segments = engagement_summary.nlargest(3, 'Engagement_Satisfaction_Score')

print("\nTop 3 Student Segments Based on Engagement and Satisfaction:")
print(top_segments[['Student_ID', 'Time_Spent_Minutes', 'Rating', 'Engagement_Satisfaction_Score']])


Top 3 Student Segments Based on Engagement and Satisfaction:
   Student_ID  Time_Spent_Minutes    Rating  Engagement_Satisfaction_Score
74       S075          153.640000  3.000000                     460.920000
41       S042          146.850242  2.946860                     432.747089
0        S001          140.508197  3.016393                     423.828003
