In [20]:


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load dataset
file_path = "student_engagement_data_10000.csv"   # update path if needed
df = pd.read_csv(file_path)

# 1. Basic Summary Statistics
print("=== Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\n=== Summary Statistics ===")
# include="all" also summarises non-numeric (object) columns.
print(df.describe(include="all"))
print("\n=== Missing Values per Column ===")
print(df.isnull().sum())

# --- Column lookup helpers ---------------------------------------------------
# CSV headers may be written as "Student ID" / "Engagement Level" rather than
# the snake_case names used below, so columns are matched case-insensitively
# with spaces and hyphens treated like underscores.

def _normalize_name(name):
    """Return *name* lower-cased with spaces/hyphens collapsed to underscores."""
    return "_".join(str(name).replace("-", " ").lower().split())


def _find_column(frame, *candidates):
    """Return the first column of *frame* matching a candidate name, else None."""
    by_key = {_normalize_name(col): col for col in frame.columns}
    for candidate in candidates:
        match = by_key.get(_normalize_name(candidate))
        if match is not None:
            return match
    return None


def _numeric_scores(frame, column):
    """Return *column* of *frame* coerced to numbers, or None if unusable.

    Values that cannot be parsed become NaN and are ignored by the averages.
    When the column is missing or holds no numeric values at all, a message
    is printed and None is returned so the caller can skip its section.
    """
    if column is None:
        return None
    scores = pd.to_numeric(frame[column], errors="coerce")
    if scores.notna().any():
        return scores
    sample = frame[column].dropna().unique()[:3].tolist()
    print(f"\nColumn '{column}' has no numeric values (e.g. {sample}); "
          "map its labels to numbers before averaging.")
    return None


student_col = _find_column(df, 'student_id')
date_col = _find_column(df, 'date')
engagement_col = _find_column(df, 'engagement', 'engagement_level')
attention_col = _find_column(df, 'attention', 'attention_level')
mode_col = _find_column(df, 'learning_mode', 'learning_mode_prediction')

# Coerce once; both score columns may be loaded as text (object dtype).
engagement_scores = _numeric_scores(df, engagement_col)
attention_scores = _numeric_scores(df, attention_col)

# Value counts for categorical columns (extend the candidate names above if
# the dataset uses different headers)
for col in (mode_col, student_col):
    if col is not None:
        print(f"\n=== Value Counts for {col} ===")
        print(df[col].value_counts().head(10))

# 2. Daily Average Engagement and Attention
if date_col is not None and engagement_scores is not None and attention_scores is not None:
    # Parse dates so the x-axis is chronological; keep the raw values if
    # parsing would lose any entries.
    day_keys = pd.to_datetime(df[date_col], errors='coerce')
    if day_keys.isna().sum() > df[date_col].isna().sum():
        day_keys = df[date_col]

    daily_avg = (
        pd.DataFrame({'engagement': engagement_scores, 'attention': attention_scores})
        .groupby(day_keys)
        .mean()
    )

    # Plot daily engagement
    plt.figure(figsize=(10,5))
    plt.plot(daily_avg.index, daily_avg['engagement'], label="Avg Engagement", marker='o')
    plt.plot(daily_avg.index, daily_avg['attention'], label="Avg Attention", marker='s')
    plt.title("Daily Average Engagement & Attention")
    plt.xlabel("Date")
    plt.ylabel("Average Score")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print("\nSkipping daily averages: need a date column plus numeric "
          "engagement and attention columns.")

# 3. Most and Least Engaged Students
if student_col is not None and engagement_scores is not None:
    student_avg = (
        engagement_scores.groupby(df[student_col])
        .mean()
        .dropna()  # students without any numeric score cannot be ranked
        .rename('engagement')
        .sort_values(ascending=False)
    )

    # Top 10
    plt.figure(figsize=(10,5))
    student_avg.head(10).plot(kind='bar', color='green')
    plt.title("Top 10 Most Engaged Students")
    plt.xlabel("Student ID")
    plt.ylabel("Average Engagement")
    plt.tight_layout()
    plt.show()

    # Bottom 10
    plt.figure(figsize=(10,5))
    student_avg.tail(10).plot(kind='bar', color='red')
    plt.title("Bottom 10 Least Engaged Students")
    plt.xlabel("Student ID")
    plt.ylabel("Average Engagement")
    plt.tight_layout()
    plt.show()
else:
    print("\nSkipping student ranking: need a student id column and a "
          "numeric engagement column.")

# 4. Engagement by Learning Mode
if mode_col is not None and engagement_scores is not None:
    mode_avg = engagement_scores.groupby(df[mode_col]).mean().rename('engagement')

    plt.figure(figsize=(8,5))
    mode_avg.plot(kind='bar', color='skyblue')
    plt.title("Average Engagement by Learning Mode")
    plt.xlabel("Learning Mode")
    plt.ylabel("Average Engagement")
    plt.tight_layout()
    plt.show()
else:
    print("\nSkipping learning-mode comparison: need a learning mode column "
          "and a numeric engagement column.")


=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Student ID                10000 non-null  int64 
 1   Session ID                10000 non-null  int64 
 2   Date                      10000 non-null  object
 3   EEG Data (PSD Features)   10000 non-null  object
 4   Engagement Level          10000 non-null  object
 5   Attention Level           10000 non-null  object
 6   Internet Connectivity     10000 non-null  object
 7   Adaptivity Level          10000 non-null  object
 8   Learning Mode Prediction  10000 non-null  object
 9   Performance Metrics       10000 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 781.4+ KB
None

=== Summary Statistics ===
         Student ID    Session ID       Date  \
count   10000.00000  10000.000000      10000   
unique          NaN           NaN         

Index(['Student ID', 'Session ID', 'Date', 'EEG Data (PSD Features)',
       'Engagement Level', 'Attention Level', 'Internet Connectivity',
       'Adaptivity Level', 'Learning Mode Prediction', 'Performance Metrics'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Student ID                10000 non-null  int64 
 1   Session ID                10000 non-null  int64 
 2   Date                      10000 non-null  object
 3   EEG Data (PSD Features)   10000 non-null  object
 4   Engagement Level          10000 non-null  object
 5   Attention Level           10000 non-null  object
 6   Internet Connectivity     10000 non-null  object
 7   Adaptivity Level          10000 non-null  object
 8   Learning Mode Prediction  10000 non-null  object
 9   Performance Metrics       10000 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 781.4+ KB
None


Student ID                  0
Session ID                  0
Date                        0
EEG Data (PSD Features)     0
Engagement Level            0
Attention Level             0
Internet Connectivity       0
Adaptivity Level            0
Learning Mode Prediction    0
Performance Metrics         0
dtype: int64

=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Student ID                10000 non-null  int64 
 1   Session ID                10000 non-null  int64 
 2   Date                      10000 non-null  object
 3   EEG Data (PSD Features)   10000 non-null  object
 4   Engagement Level          10000 non-null  object
 5   Attention Level           10000 non-null  object
 6   Internet Connectivity     10000 non-null  object
 7   Adaptivity Level          10000 non-null  object
 8   Learning Mode Prediction  10000 non-null  object
 9   Performance Metrics       10000 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 781.4+ KB
None

=== Summary Statistics ===
         Student ID    Session ID       Date  \
count   10000.00000  10000.000000      10000   
unique          NaN           NaN         