In [None]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic cohort: identifiers, a class label, four
# 0-100 skill metrics, an engagement time, and a derived assessment score.
np.random.seed(42)  # seeds NumPy's legacy global random stream
num_students = 200

# Students are numbered from 1; ids are zero-padded to three digits (S001...).
roster_numbers = range(1, num_students + 1)
student_id = [f"S{n:03d}" for n in roster_numbers]
name = [f"Student_{n}" for n in roster_numbers]

# NOTE: every draw below advances the same global random stream, so the
# order of these calls determines the generated values - do not reorder.
class_name = np.random.choice(['Class A', 'Class B', 'Class C'], size=num_students)
comprehension = np.random.uniform(0, 100, size=num_students)
attention = np.random.uniform(0, 100, size=num_students)
focus = np.random.uniform(0, 100, size=num_students)
retention = np.random.uniform(0, 100, size=num_students)
engagement_time = np.random.uniform(10, 120, size=num_students)

# Engagement time is drawn from [10, 120); rescale it against its upper bound
# (120) to a percentage so it is comparable to the 0-100 metrics.
engagement_pct = engagement_time / 120 * 100

# Weighted blend of the five inputs (weights sum to 1.0) ...
weighted_score = (
    0.25 * comprehension
    + 0.20 * attention
    + 0.20 * focus
    + 0.20 * retention
    + 0.15 * engagement_pct
)
# ... plus Gaussian noise (mean 0, std 5), then clamped to the 0-100 range.
noisy_score = weighted_score + np.random.normal(0, 5, size=num_students)
assessment_score = np.clip(noisy_score, 0, 100)

# Column order here is the column order of the resulting frame.
student_columns = {
    'student_id': student_id,
    'name': name,
    'class': class_name,
    'comprehension': comprehension,
    'attention': attention,
    'focus': focus,
    'retention': retention,
    'engagement_time': engagement_time,
    'assessment_score': assessment_score,
}
student_df = pd.DataFrame(student_columns)


In [None]:
# Location/spread summary: one row per numeric column of student_df, with the
# describe() '50%' quantile relabelled as the median.
summary_stats = ['mean', '50%', 'std']
eda_summary = (
    student_df.describe()
    .loc[summary_stats]
    .T
    .rename(columns={'50%': 'median'})
)
print(eda_summary)

# Pairwise correlations (DataFrame.corr defaults) between the five input
# metrics and the derived assessment score.
metric_columns = [
    'comprehension',
    'attention',
    'focus',
    'retention',
    'engagement_time',
    'assessment_score',
]
corr_matrix = student_df[metric_columns].corr()
print(corr_matrix)

                       mean     median        std
comprehension     49.648728  47.012257  28.977235
attention         51.389657  53.134880  28.612934
focus             51.187715  53.518615  28.342923
retention         52.260637  53.476281  26.828951
engagement_time   65.435189  66.842956  33.108595
assessment_score  52.123636  53.059218  14.018128
                  comprehension  attention     focus  retention  \
comprehension          1.000000  -0.035162  0.028036   0.031712   
attention             -0.035162   1.000000  0.032159   0.071050   
focus                  0.028036   0.032159  1.000000   0.022036   
retention              0.031712   0.071050  0.022036   1.000000   
engagement_time       -0.022755  -0.002121 -0.057836  -0.012944   
assessment_score       0.515674   0.393194  0.472849   0.431505   

                  engagement_time  assessment_score  
comprehension           -0.022755          0.515674  
attention               -0.002121          0.393194  
focus             

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Predict the assessment score from the five input metrics.
feature_columns = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X = student_df[feature_columns]
y = student_df['assessment_score']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows only.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}, R2: {r2}')

MSE: 32.135637706917535, R2: 0.8754058147338253


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Cluster students into "learning personas" with KMeans on the standardized
# feature matrix X, choosing the number of clusters by silhouette score.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Candidate cluster counts. Keeping them in one list means the "best k" lookup
# below can never drift out of sync with the loop bounds (the previous
# `index + 2` silently depended on the range starting at 2).
k_values = list(range(2, 7))

sil_scores = []      # silhouette score per candidate, aligned with k_values
fitted_models = {}   # k -> fitted KMeans, so the winner is not fitted twice
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    sil_scores.append(silhouette_score(X_scaled, labels))
    fitted_models[k] = kmeans

print(sil_scores)

# Pick the k with the highest silhouette score (first one wins on ties).
# In the recorded run all scores were low (about 0.14-0.17) and the maximum
# was at k=5, so the resulting clusters are only weakly separated.
optimal_k = k_values[sil_scores.index(max(sil_scores))]

# Reuse the model already fitted for optimal_k: with the same data and
# random_state a refit would reproduce it, so the second fit was redundant.
kmeans = fitted_models[optimal_k]
student_df['learning_persona'] = kmeans.labels_

[np.float64(0.15006800306637522), np.float64(0.13539673885498812), np.float64(0.13672530660798277), np.float64(0.16549741704042759), np.float64(0.1618385946242514)]


In [None]:
# Dump the frame as a JSON array with one object per student (compact, no
# indentation). The following cell writes the same path again with indent=2.
student_df.to_json(path_or_buf='students.json', orient='records')


In [None]:
# Write a pretty-printed (2-space indent) JSON array of student records into
# the current working directory, then confirm on stdout.
local_json_path = "students.json"
student_df.to_json(local_json_path, orient="records", indent=2)
print("JSON saved as students.json")



JSON saved as students.json


In [None]:
import json

# Round-trip check: parse the exported file with the standard json module
# and show how many records it holds plus the first one.
with open("students.json", "r") as json_file:
    data = json.loads(json_file.read())

print(f"Loaded {len(data)} students")
print("First record example:\n", data[0])


Loaded 200 students
First record example:
 {'student_id': 'S001', 'name': 'Student_1', 'class': 'Class C', 'comprehension': 40.3836171058, 'attention': 17.5330269889, 'focus': 24.108483233, 'retention': 45.1948601299, 'engagement_time': 68.2627846521, 'assessment_score': 36.212384902, 'learning_persona': 0}


In [None]:
import os

# Export the same pretty-printed JSON into the Next.js app's public data
# folder. The path is relative, so it resolves against the current working
# directory; the folder is created if it does not exist yet.
export_dir = "../nextjs-app/public/data"
os.makedirs(export_dir, exist_ok=True)
student_df.to_json(
    "../nextjs-app/public/data/students.json",
    orient="records",
    indent=2,
)
