# Student Cognitive Skills & Performance Analysis

This notebook demonstrates a comprehensive machine learning workflow for analyzing student cognitive skills and predicting academic performance. We'll use synthetic data to explore correlations, build predictive models, and cluster students into learning personas.

## Dataset Structure
- **student_id**: Unique identifier for each student
- **name**: Student name
- **class**: Academic class/grade level
- **comprehension**: Reading and understanding ability (0-100)
- **attention**: Ability to focus and concentrate (0-100)
- **focus**: Sustained attention during tasks (0-100)
- **retention**: Memory and information retention (0-100)
- **assessment_score**: Academic performance score (0-100)
- **engagement_time**: Weekly study engagement in hours

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# 'seaborn-v0_8' only exists on matplotlib >= 3.6; older releases ship the same
# style sheet under the name 'seaborn', so fall back instead of raising OSError.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette("husl")
# Seed NumPy's global RNG so any unseeded random call below is reproducible.
np.random.seed(42)

## 1. Synthetic Dataset Generation

We'll create a synthetic dataset of 500 students with realistic correlations between cognitive skills and academic performance.

In [None]:
def generate_synthetic_student_data(n_students=500, random_state=42):
    """
    Generate synthetic student data with realistic correlations between cognitive skills and performance.

    All skills are derived from a shared latent "base ability" plus independent
    Gaussian noise, the assessment score is a weighted sum of the four skills
    plus noise, and weekly engagement time is derived from the assessment score
    plus noise. Every numeric column is clipped to a fixed range and rounded to
    one decimal place.

    Args:
        n_students (int): Number of student rows to generate (must be >= 0).
        random_state (int | None): Seed for the private random generator. The
            default (42) reproduces the data the function has always returned;
            ``None`` seeds from OS entropy and is therefore not reproducible.

    Returns:
        pandas.DataFrame: One row per student with the columns ``student_id``,
        ``name``, ``class``, ``comprehension``, ``attention``, ``focus``,
        ``retention``, ``assessment_score`` and ``engagement_time``.

    Raises:
        ValueError: If ``n_students`` is negative.
    """
    if n_students < 0:
        raise ValueError(f"n_students must be non-negative, got {n_students}")

    # A private legacy RandomState yields the same stream that
    # np.random.seed(random_state) would, but without reseeding the global
    # NumPy RNG as a side effect of calling this function.
    rng = np.random.RandomState(random_state)

    # Generate student IDs and names
    student_ids = range(1, n_students + 1)
    first_names = ['Alex', 'Jordan', 'Taylor', 'Casey', 'Morgan', 'Riley', 'Avery', 'Quinn', 'Sage', 'River',
                   'Emma', 'Liam', 'Olivia', 'Noah', 'Ava', 'Ethan', 'Sophia', 'Mason', 'Isabella', 'William',
                   'Mia', 'James', 'Charlotte', 'Benjamin', 'Amelia', 'Lucas', 'Harper', 'Henry', 'Evelyn', 'Alexander']
    last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez',
                  'Hernandez', 'Lopez', 'Gonzalez', 'Wilson', 'Anderson', 'Thomas', 'Taylor', 'Moore', 'Jackson', 'Martin']

    # First and last name are drawn alternately per student; the draw order is
    # part of the reproducible stream, so it must not be vectorized.
    names = [f"{rng.choice(first_names)} {rng.choice(last_names)}" for _ in range(n_students)]

    # Generate class levels
    classes = rng.choice(['9A', '9B', '10A', '10B', '11A', '11B', '12A', '12B'], n_students)

    # Shared latent ability that induces correlation between all skills
    base_ability = rng.normal(70, 15, n_students)
    base_ability = np.clip(base_ability, 30, 95)

    # Individual skills: focus leans on attention, retention leans on comprehension
    comprehension = base_ability + rng.normal(0, 8, n_students)
    attention = base_ability + rng.normal(0, 10, n_students)
    focus = 0.7 * attention + 0.3 * base_ability + rng.normal(0, 6, n_students)
    retention = 0.6 * comprehension + 0.4 * base_ability + rng.normal(0, 7, n_students)

    # Clip values to realistic ranges
    comprehension = np.clip(comprehension, 20, 100)
    attention = np.clip(attention, 15, 100)
    focus = np.clip(focus, 15, 100)
    retention = np.clip(retention, 20, 100)

    # Assessment score: weighted sum of the (clipped) skills plus noise
    assessment_score = (0.3 * comprehension + 0.25 * attention + 0.2 * focus + 0.25 * retention +
                        rng.normal(0, 8, n_students))
    assessment_score = np.clip(assessment_score, 30, 100)

    # Engagement time is derived FROM the assessment score (higher performers
    # study more), so it carries information about the target.
    base_engagement = 15 + (assessment_score - 50) * 0.2
    engagement_time = base_engagement + rng.normal(0, 5, n_students)
    engagement_time = np.clip(engagement_time, 5, 40)

    # Create DataFrame
    data = pd.DataFrame({
        'student_id': student_ids,
        'name': names,
        'class': classes,
        'comprehension': np.round(comprehension, 1),
        'attention': np.round(attention, 1),
        'focus': np.round(focus, 1),
        'retention': np.round(retention, 1),
        'assessment_score': np.round(assessment_score, 1),
        'engagement_time': np.round(engagement_time, 1)
    })

    return data

# Build the 500-student dataset and preview its first rows
df = generate_synthetic_student_data(n_students=500)
print(f"Dataset shape: {df.shape}")
print("\nFirst 10 rows:")
df.head(n=10)

## 2. Exploratory Data Analysis

Let's explore the dataset to understand the distribution of variables and identify patterns.

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print("\nDescriptive Statistics:")
df.describe()

In [None]:
# Histogram grid: one panel per cognitive skill / performance metric
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Distribution of Cognitive Skills and Performance Metrics', fontsize=16, fontweight='bold')

cognitive_vars = ['comprehension', 'attention', 'focus', 'retention', 'assessment_score', 'engagement_time']
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']

# axes.flat walks the 2x3 grid row by row, which matches the variable order above
for ax, var, color in zip(axes.flat, cognitive_vars, colors):
    ax.hist(df[var], bins=25, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(f'{var.replace("_", " ").title()} Distribution', fontweight='bold')
    ax.set_xlabel(var.replace('_', ' ').title())
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Explain what the reader should take away from the figure
for line in (
    "\n📊 Graph Justification - Distribution Analysis:",
    "These histograms show the distribution of each variable, helping us identify:",
    "• Normal vs skewed distributions",
    "• Potential outliers",
    "• Data quality and realistic value ranges",
    "• Whether transformations might be needed for modeling",
):
    print(line)

## 3. Correlation Analysis

Analyzing correlations between cognitive skills and academic performance to understand which factors most strongly predict success.

In [None]:
# Pairwise Pearson correlations between all numeric columns
numeric_cols = ['comprehension', 'attention', 'focus', 'retention', 'assessment_score', 'engagement_time']
correlation_matrix = df[numeric_cols].corr()

# Heatmap of the lower triangle only: the mask hides the upper triangle and
# the diagonal, which would just repeat information
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    fmt='.3f',
    cbar_kws={'label': 'Correlation Coefficient'},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix: Cognitive Skills vs Performance', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

for line in (
    "\n📊 Graph Justification - Correlation Heatmap:",
    "This heatmap reveals relationships between variables, showing:",
    "• Which cognitive skills are most correlated with assessment scores",
    "• Inter-correlations between cognitive abilities",
    "• Potential multicollinearity issues for modeling",
    "• Strength and direction of relationships",
):
    print(line)

# Rank every other variable by its correlation with the assessment score
score_column = correlation_matrix['assessment_score']
assessment_corr = score_column.drop('assessment_score').sort_values(ascending=False)
print("\n🎯 Strongest Predictors of Assessment Score:")
for skill, corr in assessment_corr.items():
    print(f"• {skill.replace('_', ' ').title()}: {corr:.3f}")

In [None]:
# One scatter panel per cognitive skill, each with a least-squares trend line
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Cognitive Skills vs Assessment Score Relationships', fontsize=16, fontweight='bold')

skills = ['comprehension', 'attention', 'focus', 'retention']
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']

for ax, skill, color in zip(axes.ravel(), skills, colors):
    skill_values = df[skill]
    ax.scatter(skill_values, df['assessment_score'], alpha=0.6, color=color, s=50)

    # Degree-1 polynomial fit evaluated at the observed skill values
    trend_coeffs = np.polyfit(skill_values, df['assessment_score'], 1)
    ax.plot(skill_values, np.polyval(trend_coeffs, skill_values), "r--", alpha=0.8, linewidth=2)

    ax.set_xlabel(f'{skill.replace("_", " ").title()} Score')
    ax.set_ylabel('Assessment Score')
    ax.set_title(f'{skill.replace("_", " ").title()} vs Assessment Score\n(r = {correlation_matrix.loc[skill, "assessment_score"]:.3f})')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

for line in (
    "\n📊 Graph Justification - Scatter Plots:",
    "These scatter plots with trend lines help visualize:",
    "• Linear vs non-linear relationships",
    "• Strength of individual predictors",
    "• Outliers and data patterns",
    "• Whether simple linear models will be effective",
):
    print(line)

## 4. Machine Learning Model Development

Building and comparing multiple ML models to predict assessment scores from cognitive skills and weekly engagement time.

In [None]:
# Model inputs (X) and prediction target (y)
# NOTE(review): generate_synthetic_student_data derives engagement_time from
# assessment_score, so using it as a feature leaks target information into the
# models - confirm whether it should stay in feature_cols.
feature_cols = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X, y = df[feature_cols], df['assessment_score']

# Hold out 20% of the students for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize using statistics learned from the training split only; the
# scaled copies are consumed by the models that need scaled inputs
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape[0]} students")
print(f"Test set size: {X_test.shape[0]} students")
print(f"Features used: {feature_cols}")

In [None]:
# Define and train multiple models
# (local import, matching the notebook's habit of importing where first needed)
from sklearn.pipeline import make_pipeline

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf', C=100, gamma=0.1)
}

# Train and evaluate models
results = {}      # model name -> dict of hold-out and cross-validation metrics
predictions = {}  # model name -> hold-out predictions (reused by later plots)

for name, model in models.items():
    # SVR is trained on standardized inputs; every other model (linear and
    # tree-based alike) is trained on the raw feature values.
    if name == 'SVR':
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        # Cross-validate through a scaler+model pipeline on the RAW training
        # data so the scaler is re-fit inside each fold. Scoring on
        # X_train_scaled would leak the held-out fold's mean/std into training.
        # cross_val_score clones the estimator, so the fitted model above is
        # left untouched.
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cv_estimator = model
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='r2')

    # Calculate hold-out metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2,
        'CV_R²_mean': cv_scores.mean(),
        'CV_R²_std': cv_scores.std()
    }

    predictions[name] = y_pred

    print(f"{name}:")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  MAE: {mae:.3f}")
    print(f"  R²: {r2:.3f}")
    print(f"  CV R² (mean ± std): {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
    print()

# Convert results to DataFrame (one row per model), best hold-out R² first
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('R²', ascending=False)
print("\n🏆 Model Performance Ranking:")
print(results_df.round(3))

In [None]:
# Side-by-side bar charts comparing the models on three metrics
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

# One spec per panel: which results_df column to plot and how to draw it.
# Only the cross-validation panel carries error bars (std across folds).
panel_specs = [
    dict(column='R²', title='R² Score Comparison', ylabel='R² Score',
         bar_kwargs=dict(color='skyblue', alpha=0.8, edgecolor='navy')),
    dict(column='RMSE', title='RMSE Comparison (Lower is Better)', ylabel='RMSE',
         bar_kwargs=dict(color='lightcoral', alpha=0.8, edgecolor='darkred')),
    dict(column='CV_R²_mean', title='Cross-Validation R² (with std dev)', ylabel='CV R² Score',
         bar_kwargs=dict(yerr=results_df['CV_R²_std'],
                         color='lightgreen', alpha=0.8, edgecolor='darkgreen', capsize=5)),
]

for ax, spec in zip(axes, panel_specs):
    ax.bar(results_df.index, results_df[spec['column']], **spec['bar_kwargs'])
    ax.set_title(spec['title'], fontweight='bold')
    ax.set_ylabel(spec['ylabel'])
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

for line in (
    "\n📊 Graph Justification - Model Performance Comparison:",
    "These bar charts help compare models across multiple metrics:",
    "• R² shows explained variance (higher is better)",
    "• RMSE shows prediction error magnitude (lower is better)",
    "• CV scores show model stability and generalization",
    "• Error bars indicate model consistency across folds",
):
    print(line)

In [None]:
# Predicted-vs-actual scatter for the three models ranked highest in results_df
top_models = results_df.index[:3].tolist()
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Predicted vs Actual Assessment Scores (Top 3 Models)', fontsize=16, fontweight='bold')

colors = ['#3498db', '#e74c3c', '#2ecc71']

for ax, model_name, color in zip(axes, top_models, colors):
    y_pred = predictions[model_name]

    ax.scatter(y_test, y_pred, alpha=0.6, color=color, s=50)

    # Diagonal y = x spanning the combined range of actual and predicted values
    diagonal = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
    ax.plot(diagonal, diagonal, 'r--', linewidth=2, alpha=0.8)

    ax.set_xlabel('Actual Assessment Score')
    ax.set_ylabel('Predicted Assessment Score')
    ax.set_title(f'{model_name}\nR² = {results_df.loc[model_name, "R²"]:.3f}')
    ax.grid(True, alpha=0.3)

    # Annotate with the Pearson correlation between actual and predicted scores
    corr = np.corrcoef(y_test, y_pred)[0, 1]
    annotation_box = dict(boxstyle='round', facecolor='white', alpha=0.8)
    ax.text(0.05, 0.95, f'Correlation: {corr:.3f}',
            transform=ax.transAxes, fontsize=10, bbox=annotation_box)

plt.tight_layout()
plt.show()

for line in (
    "\n📊 Graph Justification - Predicted vs Actual Plots:",
    "These scatter plots show model prediction quality:",
    "• Points closer to the diagonal line indicate better predictions",
    "• Spread around the line shows prediction variance",
    "• Systematic deviations reveal model bias",
    "• Helps identify which model makes most accurate predictions",
):
    print(line)

## 5. Feature Importance Analysis

Understanding which cognitive skills are most important for predicting academic performance.

In [None]:
# Feature importance from the Random Forest model.
# Random Forest is used here because it exposes feature_importances_; it is
# not necessarily the top-ranked model in results_df. The model is bound to
# its own name (rf_model) so re-running this cell cannot overwrite the global
# `best_model` that the deployment cell and prediction function rely on.
rf_model = models['Random Forest']
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

# Plot feature importance (bars ordered from most to least important)
plt.figure(figsize=(12, 8))
bars = plt.bar(feature_importance['feature'], feature_importance['importance'],
               color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6'],
               alpha=0.8, edgecolor='black')

plt.title('Feature Importance for Assessment Score Prediction\n(Random Forest Model)',
          fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Cognitive Skills & Engagement', fontsize=12)
plt.ylabel('Importance Score', fontsize=12)
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# Add value labels just above each bar
for bar, importance in zip(bars, feature_importance['importance']):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
             f'{importance:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n🎯 Feature Importance Ranking:")
for feature, importance in zip(feature_importance['feature'], feature_importance['importance']):
    print(f"{feature.replace('_', ' ').title()}: {importance:.3f}")

print("\n📊 Graph Justification - Feature Importance:")
print("This bar chart reveals which cognitive skills matter most:")
print("• Identifies key predictors for targeted interventions")
print("• Shows relative contribution of each skill")
print("• Helps prioritize educational focus areas")
print("• Guides feature selection for model optimization")

## 6. Student Clustering Analysis

Clustering students into learning personas based on their cognitive skill profiles.

In [None]:
from sklearn.metrics import silhouette_score

# Cluster on the four cognitive skills only, standardized to zero mean / unit variance
clustering_features = ['comprehension', 'attention', 'focus', 'retention']
X_cluster = df[clustering_features]
X_cluster_scaled = StandardScaler().fit_transform(X_cluster)

# Fit K-Means for every candidate k and record both quality measures
k_range = range(2, 11)
inertias, silhouette_scores = [], []

for n_clusters in k_range:
    candidate = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    candidate.fit(X_cluster_scaled)
    inertias.append(candidate.inertia_)
    silhouette_scores.append(silhouette_score(X_cluster_scaled, candidate.labels_))

# Left panel: elbow curve; right panel: silhouette score per k
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Optimal Number of Clusters Analysis', fontsize=16, fontweight='bold')
elbow_ax, silhouette_ax = axes

elbow_ax.plot(k_range, inertias, 'bo-', linewidth=2, markersize=8)
elbow_ax.set_title('Elbow Method for Optimal k', fontweight='bold')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia (Within-cluster sum of squares)')
elbow_ax.grid(True, alpha=0.3)

silhouette_ax.plot(k_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
silhouette_ax.set_title('Silhouette Score for Different k', fontweight='bold')
silhouette_ax.set_xlabel('Number of Clusters (k)')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The k with the highest silhouette score wins (first one on ties)
best_position = np.argmax(silhouette_scores)
optimal_k = k_range[best_position]
print(f"\n🎯 Optimal number of clusters: {optimal_k}")
print(f"Silhouette score: {max(silhouette_scores):.3f}")

for line in (
    "\n📊 Graph Justification - Cluster Optimization:",
    "These plots help determine the optimal number of student personas:",
    "• Elbow method shows diminishing returns in cluster compactness",
    "• Silhouette score measures cluster separation quality",
    "• Higher silhouette scores indicate better-defined clusters",
    "• Helps balance interpretability with statistical validity",
):
    print(line)

In [None]:
# Perform final clustering with optimal k
final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = final_kmeans.fit_predict(X_cluster_scaled)

# Add cluster labels to dataframe
df['learning_persona'] = cluster_labels

# Per-cluster mean of every skill plus the two outcome columns, and cluster sizes
cluster_summary = df.groupby('learning_persona')[clustering_features + ['assessment_score', 'engagement_time']].mean()
cluster_counts = df['learning_persona'].value_counts().sort_index()

print("\n🎭 Learning Persona Characteristics:")
print(cluster_summary.round(2))
print("\n👥 Students per Persona:")
print(cluster_counts)

# Name personas by ranking clusters on their mean assessment score.
# Tier labels run from the strongest to the weakest group.
tier_labels = ["High Performers", "Strong Achievers", "Developing Learners", "Support Needed"]
sorted_clusters = cluster_summary.sort_values('assessment_score', ascending=False)
n_found = len(sorted_clusters)

persona_mapping = {}
for rank, cluster_id in enumerate(sorted_clusters.index):
    if n_found <= len(tier_labels):
        # Spread the ranks over the whole tier range so the lowest-scoring
        # cluster is always "Support Needed" (e.g. with k=2 it would otherwise
        # be called "Strong Achievers"). With exactly 4 clusters this is the
        # identity mapping rank -> tier.
        tier = rank * (len(tier_labels) - 1) // max(n_found - 1, 1)
        persona_mapping[cluster_id] = tier_labels[tier]
    elif rank < len(tier_labels):
        # More clusters than tier labels: the top-ranked ones take the labels...
        persona_mapping[cluster_id] = tier_labels[rank]
    else:
        # ...and the remainder get a generic, id-based name.
        persona_mapping[cluster_id] = f"Persona {cluster_id}"

df['persona_name'] = df['learning_persona'].map(persona_mapping)

In [None]:
# Visualize clusters using PCA (projects the scaled skills onto 2 components)
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_cluster_scaled)

plt.figure(figsize=(14, 10))
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6']
# The k search considers up to 10 clusters but only 5 colours are hand-picked;
# extend the palette with distinct hues so colors[i] cannot raise IndexError.
if optimal_k > len(colors):
    colors = colors + list(sns.color_palette("husl", optimal_k - len(colors)))

for i in range(optimal_k):
    mask = cluster_labels == i
    # `color=` (not `c=`) so RGB tuples from the extended palette are never
    # mistaken for per-point values.
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1],
                color=colors[i], label=f'{persona_mapping[i]} (n={cluster_counts[i]})',
                alpha=0.7, s=60)

# Plot cluster centers, projected with the same PCA as the points
centers_pca = pca.transform(final_kmeans.cluster_centers_)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1],
            c='black', marker='x', s=200, linewidths=3, label='Centroids')

plt.title('Student Learning Personas\n(PCA Visualization of Cognitive Skills)',
          fontsize=16, fontweight='bold', pad=20)
plt.xlabel(f'First Principal Component ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'Second Principal Component ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\n📊 PCA Explained Variance: {pca.explained_variance_ratio_.sum():.1%}")
print("\n📊 Graph Justification - PCA Cluster Visualization:")
print("This scatter plot shows student personas in 2D space:")
print("• Each color represents a distinct learning persona")
print("• Cluster separation indicates how distinct the personas are")
print("• Centroids show the 'typical' student in each persona")
print("• PCA reduces 4D cognitive skills to 2D for visualization")

In [None]:
# Radar chart for cluster profiles.
# The grid is sized from optimal_k (2 columns, as many rows as needed) instead
# of a fixed 2x2: a fixed grid raises IndexError for more than 4 clusters and
# leaves blank polar axes for fewer. With 4 clusters the layout and figure
# size are exactly the former 2x2 / 16x16.
n_cols = 2
n_rows = max(1, -(-optimal_k // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 8 * n_rows),
                         subplot_kw=dict(projection='polar'), squeeze=False)
fig.suptitle('Learning Persona Cognitive Profiles (Radar Charts)', fontsize=16, fontweight='bold')
flat_axes = axes.ravel()

# Prepare data for radar chart: one spoke per skill, evenly spaced
skills = clustering_features
angles = np.linspace(0, 2 * np.pi, len(skills), endpoint=False).tolist()
angles += angles[:1]  # Complete the circle

# Own palette (same leading colours as the PCA plot) so this cell does not
# depend on a `colors` variable left behind by another cell; extended when
# there are more clusters than hand-picked colours.
radar_colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6']
if optimal_k > len(radar_colors):
    radar_colors = radar_colors + list(sns.color_palette("husl", optimal_k - len(radar_colors)))

for i in range(optimal_k):
    ax = flat_axes[i]

    # Mean skill values of this cluster, repeated at the end to close the polygon
    cluster_data = cluster_summary.loc[i, skills].tolist()
    cluster_data += cluster_data[:1]

    # Plot
    ax.plot(angles, cluster_data, 'o-', linewidth=2, color=radar_colors[i], alpha=0.8)
    ax.fill(angles, cluster_data, alpha=0.25, color=radar_colors[i])

    # Customize
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels([skill.replace('_', ' ').title() for skill in skills])
    ax.set_ylim(0, 100)
    ax.set_title(f'{persona_mapping[i]}\n(n={cluster_counts[i]} students)',
                 fontweight='bold', pad=20)
    ax.grid(True)

# Hide any grid cell that has no persona to show (odd cluster counts)
for unused_ax in flat_axes[optimal_k:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

print("\n📊 Graph Justification - Radar Charts:")
print("These radar charts show the cognitive profile of each persona:")
print("• Each axis represents a different cognitive skill")
print("• Larger areas indicate stronger overall cognitive abilities")
print("• Shape differences reveal unique strength/weakness patterns")
print("• Helps educators understand each persona's needs")

## 7. Persona-Based Insights and Recommendations

Generating actionable insights for each learning persona.

In [None]:
# Generate detailed persona insights
def generate_persona_insights(cluster_id, cluster_data, persona_name, count, *, data=None, features=None):
    """
    Build a summary dict (size, averages, strengths, challenges, recommendations) for one persona.

    Args:
        cluster_id: Identifier of the cluster. Accepted for call-site
            compatibility; it is not used in the computation.
        cluster_data: Mapping/Series of the cluster's mean values. Must provide
            'assessment_score', 'engagement_time' and every name in `features`.
        persona_name (str): Display name of the persona.
        count (int): Number of students in the cluster.
        data (pandas.DataFrame, optional): Population used for the overall
            skill means and the percentage. Defaults to the notebook-level `df`.
        features (list[str], optional): Skill columns to compare against the
            population mean. Defaults to the notebook-level `clustering_features`.

    Returns:
        dict: Keys 'name', 'count', 'percentage', 'avg_assessment',
        'avg_engagement', 'strengths', 'challenges', 'recommendations'.
    """
    # Fall back to the notebook globals only when the caller gives no override,
    # so existing positional calls behave exactly as before.
    if data is None:
        data = df
    if features is None:
        features = clustering_features

    total_students = len(data)
    insights = {
        'name': persona_name,
        'count': count,
        # Guard against an empty population instead of dividing by zero
        'percentage': (count / total_students) * 100 if total_students else 0.0,
        'avg_assessment': cluster_data['assessment_score'],
        'avg_engagement': cluster_data['engagement_time'],
        'strengths': [],
        'challenges': [],
        'recommendations': []
    }

    # A skill is a strength/challenge when the cluster mean sits more than
    # 5 points above/below the population mean for that skill.
    overall_means = data[features].mean()

    for skill in features:
        if cluster_data[skill] > overall_means[skill] + 5:
            insights['strengths'].append(skill.replace('_', ' ').title())
        elif cluster_data[skill] < overall_means[skill] - 5:
            insights['challenges'].append(skill.replace('_', ' ').title())

    # General recommendations, chosen by the cluster's average assessment score
    if cluster_data['assessment_score'] > 80:
        insights['recommendations'].extend([
            "Provide advanced challenges and enrichment activities",
            "Consider peer tutoring or leadership roles",
            "Encourage independent research projects"
        ])
    elif cluster_data['assessment_score'] > 65:
        insights['recommendations'].extend([
            "Maintain current learning strategies",
            "Focus on consistency and time management",
            "Provide moderate challenges to promote growth"
        ])
    else:
        insights['recommendations'].extend([
            "Provide additional support and scaffolding",
            "Focus on building foundational skills",
            "Consider one-on-one tutoring or small group instruction"
        ])

    # Skill-specific recommendations, appended in this fixed order for every
    # skill that was flagged as a challenge above
    skill_recommendations = (
        ('Attention', "Implement attention-building exercises and mindfulness practices"),
        ('Focus', "Use shorter learning sessions with frequent breaks"),
        ('Comprehension', "Provide visual aids and multiple explanation methods"),
        ('Retention', "Implement spaced repetition and memory techniques"),
    )
    for skill_label, advice in skill_recommendations:
        if skill_label in insights['challenges']:
            insights['recommendations'].append(advice)

    return insights

# Build one insights dict per cluster id, keyed by that id
persona_insights = {
    cluster_id: generate_persona_insights(
        cluster_id,
        cluster_summary.loc[cluster_id],
        persona_mapping[cluster_id],
        cluster_counts[cluster_id],
    )
    for cluster_id in range(optimal_k)
}

# Report: header, then one section per persona separated by a dashed rule
print("\n🎭 LEARNING PERSONA INSIGHTS & RECOMMENDATIONS\n")
print("=" * 60)

for insights in persona_insights.values():
    print(f"\n📊 {insights['name']}")
    print(f"   Students: {insights['count']} ({insights['percentage']:.1f}% of total)")
    print(f"   Avg Assessment Score: {insights['avg_assessment']:.1f}")
    print(f"   Avg Engagement Time: {insights['avg_engagement']:.1f} hours/week")

    # Strengths and challenges are shown only when the lists are non-empty
    if insights['strengths']:
        print(f"   💪 Strengths: {', '.join(insights['strengths'])}")

    if insights['challenges']:
        print(f"   ⚠️  Challenges: {', '.join(insights['challenges'])}")

    print(f"   🎯 Recommendations:")
    for rec in insights['recommendations']:
        print(f"      • {rec}")

    print("-" * 60)

## 8. Model Deployment Preparation

Preparing the best model for integration with the dashboard application.

In [None]:
# Collect the best model and its preprocessing components for deployment
import joblib
import json

# results_df is sorted by hold-out R² (descending), so row 0 is the winner
best_model_name = results_df.index[0]
best_model = models[best_model_name]

print(f"\n🏆 Best Model: {best_model_name}")
print(f"R² Score: {results_df.loc[best_model_name, 'R²']:.3f}")
print(f"RMSE: {results_df.loc[best_model_name, 'RMSE']:.3f}")

# The feature scaler is bundled only for SVR, the one model trained on scaled inputs
if best_model_name == 'SVR':
    deployed_scaler = scaler
else:
    deployed_scaler = None

# Everything is gathered in memory only; nothing is written to disk here
model_artifacts = dict([
    ('model', best_model),
    ('scaler', deployed_scaler),
    ('feature_columns', feature_cols),
    ('clustering_model', final_kmeans),
    ('clustering_scaler', StandardScaler().fit(X_cluster)),
    ('persona_mapping', persona_mapping),
    ('model_performance', results_df.loc[best_model_name].to_dict()),
])

# Summarize what was prepared (in a real scenario these would be saved)
print("\n💾 Model artifacts prepared for deployment:")
for artifact_line in (
    f"   • Trained {best_model_name} model",
    f"   • Feature preprocessing pipeline",
    f"   • Student clustering model",
    f"   • Persona mapping and insights",
    f"   • Performance metrics and validation results",
):
    print(artifact_line)

# Create prediction function for dashboard integration

# Fit the persona scaler ONCE, on the same data the clustering input was scaled
# with, instead of re-fitting a new StandardScaler on every prediction call.
_persona_scaler = StandardScaler().fit(X_cluster)


def predict_student_performance(comprehension, attention, focus, retention, engagement_time):
    """
    Predict student assessment score and learning persona.

    Args:
        comprehension (float): Comprehension score (0-100)
        attention (float): Attention score (0-100)
        focus (float): Focus score (0-100)
        retention (float): Retention score (0-100)
        engagement_time (float): Weekly engagement time in hours

    Returns:
        dict: 'predicted_score' (rounded to 1 decimal), 'persona_id',
        'persona_name', 'confidence' (heuristic in (0, 1], rounded to 3
        decimals), 'model_used' and 'model_r2'. Numeric values are plain
        Python int/float.
    """
    # Build named, single-row frames so columns are selected by name (immune to
    # a reordering of feature_cols) and match the feature names the scalers and
    # DataFrame-trained models were fitted with.
    inputs = {
        'comprehension': comprehension,
        'attention': attention,
        'focus': focus,
        'retention': retention,
        'engagement_time': engagement_time,
    }
    student = pd.DataFrame([inputs])
    input_data = student[feature_cols]

    # Predict assessment score (SVR was trained on standardized features)
    if best_model_name == 'SVR':
        input_scaled = scaler.transform(input_data)
        predicted_score = best_model.predict(input_scaled)[0]
    else:
        predicted_score = best_model.predict(input_data)[0]

    # Predict learning persona from the standardized cognitive skills
    cognitive_scaled = _persona_scaler.transform(student[clustering_features])
    persona_id = final_kmeans.predict(cognitive_scaled)[0]
    persona_name = persona_mapping[persona_id]

    # Heuristic confidence: 1 / (1 + distance to the assigned centroid), so a
    # point on the centroid scores 1 and the value decays with distance. This
    # is not a calibrated probability.
    distances = final_kmeans.transform(cognitive_scaled)[0]
    confidence = 1 / (1 + distances[persona_id])

    return {
        'predicted_score': round(float(predicted_score), 1),
        'persona_id': int(persona_id),
        'persona_name': persona_name,
        'confidence': round(float(confidence), 3),
        'model_used': best_model_name,
        'model_r2': round(float(results_df.loc[best_model_name, 'R²']), 3)
    }

# Smoke-test the prediction helper with one hand-picked student profile
test_prediction = predict_student_performance(
    comprehension=75,
    attention=80,
    focus=70,
    retention=85,
    engagement_time=20,
)
print("\n🧪 Test Prediction:")
print(f"   Input: Comprehension=75, Attention=80, Focus=70, Retention=85, Engagement=20h")
print(f"   Predicted Score: {test_prediction['predicted_score']}")
print(f"   Learning Persona: {test_prediction['persona_name']}")
print(f"   Confidence: {test_prediction['confidence']}")
print(f"   Model: {test_prediction['model_used']} (R² = {test_prediction['model_r2']})")

## 9. Summary and Key Findings

Comprehensive summary of the analysis results and actionable insights.

In [None]:
# Final report: headline numbers, correlations, personas and fixed recommendations
banner = "=" * 80

print("\n" + banner)
print("📊 STUDENT COGNITIVE SKILLS & PERFORMANCE ANALYSIS SUMMARY")
print(banner)

print("\n🎯 KEY FINDINGS:")
print(f"   • Dataset: {len(df)} students across {df['class'].nunique()} classes")
print(f"   • Best ML Model: {best_model_name} (R² = {results_df.loc[best_model_name, 'R²']:.3f})")
print(f"   • Model Accuracy: Can predict assessment scores with {results_df.loc[best_model_name, 'RMSE']:.1f} point RMSE")
print(f"   • Learning Personas: {optimal_k} distinct student personas identified")

# Correlation of each cognitive skill with the assessment score, strongest first
print("\n🔍 CORRELATION INSIGHTS:")
skill_correlations = df[clustering_features].corrwith(df['assessment_score'])
assessment_corr = skill_correlations.sort_values(ascending=False)
for skill, corr in assessment_corr.items():
    # Bucket by absolute correlation: > 0.7 strong, > 0.5 moderate, else weak
    magnitude = abs(corr)
    if magnitude > 0.7:
        strength = "Strong"
    elif magnitude > 0.5:
        strength = "Moderate"
    else:
        strength = "Weak"
    print(f"   • {skill.replace('_', ' ').title()}: {corr:.3f} ({strength} correlation)")

print("\n🎭 LEARNING PERSONAS:")
for insights in persona_insights.values():
    print(f"   • {insights['name']}: {insights['count']} students ({insights['percentage']:.1f}%)")
    print(f"     Average Score: {insights['avg_assessment']:.1f}, Engagement: {insights['avg_engagement']:.1f}h/week")

# The two closing sections are static text, not derived from the results above
for line in (
    "\n🚀 ACTIONABLE RECOMMENDATIONS:",
    "   • Focus on comprehension and retention skills for maximum impact",
    "   • Implement persona-specific teaching strategies",
    "   • Use ML predictions to identify at-risk students early",
    "   • Monitor engagement time as a leading indicator",
    "   • Provide targeted interventions based on cognitive profiles",
    "\n📈 MODEL DEPLOYMENT READY:",
    "   • Trained model can predict assessment scores from cognitive skills",
    "   • Clustering model identifies student learning personas",
    "   • Integration-ready functions for dashboard application",
    "   • Comprehensive validation and performance metrics included",
):
    print(line)

print("\n" + banner)
print("✅ ANALYSIS COMPLETE - Ready for Dashboard Integration")
print(banner)