# Student Cognitive Skills: Analysis, Prediction & Clustering
*Generated: 2025-09-18 04:32 UTC*

This notebook:
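1. Loads a synthetic dataset of students from `students.csv`
2. Explores correlations between cognitive skills and performance
3. Trains and compares two models (linear regression and random forest) to predict `assessment_score`
4. Clusters students into learning personas
5. Saves the labelled dataset, the better-scoring model, and the clustering scaler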


In [None]:

import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the student table; the relative path is resolved against the working directory
students_path = 'students.csv'
df = pd.read_csv(students_path)

# Show the first rows as a quick sanity check on the load
df.head()


## Basic Stats

In [None]:

# Summary statistics for the whole table; include='all' asks pandas to describe
# non-numeric columns as well as numeric ones
df.describe(include='all')


## Correlations

In [None]:

# Columns that enter the pairwise correlation table
numeric_cols = [
    'comprehension',
    'attention',
    'focus',
    'retention',
    'engagement_time',
    'assessment_score',
]

# DataFrame.corr() with its default arguments; shown as the cell output
corr = df.loc[:, numeric_cols].corr()
corr


In [None]:

# Heatmap of the correlation matrix, drawn through the explicit Figure/Axes API
fig, ax = plt.subplots(figsize=(6, 4))
image = ax.imshow(corr, cmap='viridis', aspect='auto')

# One tick per column on both axes, labelled with the column name
tick_positions = range(len(numeric_cols))
ax.set_xticks(tick_positions)
ax.set_xticklabels(numeric_cols, rotation=45, ha='right')
ax.set_yticks(tick_positions)
ax.set_yticklabels(numeric_cols)

fig.colorbar(image, ax=ax, label='Correlation')
ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()


## Train/Test Split & Models

In [None]:

# Predictors and target for the regression models
feature_cols = ['comprehension','attention','focus','retention','engagement_time']
X = df[feature_cols]
y = df['assessment_score']

# Hold out 25% of the rows for evaluation; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


def _fit_and_score(model):
    """Fit `model` on the training split and evaluate it on the held-out split.

    Returns the test-set predictions, the R^2 score and the mean absolute
    error, in that order.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, r2_score(y_test, predictions), mean_absolute_error(y_test, predictions)


# Linear Regression
lin = LinearRegression()
pred_lin, r2_lin, mae_lin = _fit_and_score(lin)

# Random Forest
rf = RandomForestRegressor(n_estimators=300, random_state=42)
pred_rf, r2_rf, mae_rf = _fit_and_score(rf)

# Cell output: R^2 and MAE for the linear model, then for the forest
r2_lin, mae_lin, r2_rf, mae_rf


In [None]:

# Print the random forest's importance for each input column.
# `importances` stays None when reading the attribute fails; any error raised
# inside the try block is printed rather than propagated.
importances = None
try:
    importances = rf.feature_importances_
    name_weight_pairs = zip(X.columns, importances)
    for column, weight in name_weight_pairs:
        print(f"{column}: {weight:.3f}")
except Exception as err:
    print(err)


## Clustering into Learning Personas

In [None]:

# Columns that describe each student's learning profile. Defined once and reused
# for the scaler input and the centroid table so the two cannot drift apart.
persona_features = ['comprehension','attention','focus','retention','engagement_time']

# Choose k=3 personas for simplicity; every count-dependent step below reads this value
n_personas = 3

# Standardize features for clustering so columns on larger numeric scales do not
# dominate the k-means distances
features = df[persona_features].values
scaler = StandardScaler()
Z = scaler.fit_transform(features)

kmeans = KMeans(n_clusters=n_personas, random_state=42, n_init=10)
labels = kmeans.fit_predict(Z)

# Store the integer cluster id of every student on the main table
df['persona'] = labels

# Brief semantics (based on cluster centers): map the centers back to the
# original units so they can be read alongside the raw columns
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(centroids, columns=persona_features)

# Members per cluster id in a single pass; minlength keeps an explicit zero for
# any cluster id that received no students
centers_df['size'] = np.bincount(labels, minlength=n_personas)
centers_df


In [None]:

# Map cluster centers to human-friendly persona names by comparing each center
# with the average taken across all centers
def persona_name(row, centers=None):
    """Return the persona label for one cluster center.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'comprehension', 'attention', 'focus', 'retention'
        and 'engagement_time'.
    centers : pandas.DataFrame, optional
        Table of all cluster centers with those same columns. When omitted,
        the notebook-level `centers_df` is used, so existing
        `persona_name(row)` calls behave as before.

    Returns
    -------
    str
        'Focused Achievers', 'Engaged Strivers' or 'At-Risk (Low Engagement)'.
    """
    if centers is None:
        centers = centers_df

    skill_cols = ['comprehension','attention','focus','retention']

    # heuristic: comprehension carries double weight; attention, focus and
    # retention share the remaining weight equally
    score = 0.4*row['comprehension'] + 0.2*row['attention'] + 0.2*row['focus'] + 0.2*row['retention']

    # Baseline is the mean of every skill value across every center
    if score >= centers[skill_cols].stack().mean():
        return 'Focused Achievers'
    # Below-baseline skills: split on engagement time relative to the average center
    elif row['engagement_time'] >= centers['engagement_time'].mean():
        return 'Engaged Strivers'
    else:
        return 'At-Risk (Low Engagement)'
    
# Build a mapping cluster_id -> persona label by applying persona_name to each
# center. Row position i of centers_df is cluster id i, and the row count is
# used instead of a literal so the mapping follows whatever k was clustered.
mapping = {
    cluster_id: persona_name(centers_df.iloc[cluster_id])
    for cluster_id in range(len(centers_df))
}

mapping


In [None]:

# Attach the readable persona label by looking up each row's cluster id in `mapping`
df['persona_name'] = df['persona'].map(mapping)

# Preview the identifying columns next to the score and the assigned persona
preview_cols = ['student_id','name','class','assessment_score','persona_name']
df.loc[:, preview_cols].head(10)


## Save Artifacts

In [None]:

# Save the student table with its persona columns (model predictions are not
# stored on df, so none are written here)
df.to_csv('students_with_personas.csv', index=False)

# Save model (best of RF vs Linear): higher test R^2 wins, ties go to the forest
best_model = rf if r2_rf >= r2_lin else lin

# NOTE: `scaler` was fitted for the clustering step, while both regressors were
# fitted on the unscaled training features, so it must not be applied to inputs
# before calling best_model.predict.
for path, artifact in (('model.pkl', best_model), ('scaler.pkl', scaler)):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)

print("Artifacts saved: students_with_personas.csv, model.pkl, scaler.pkl")
