# 📊 Student Cognitive Analysis & Machine Learning
This notebook demonstrates how to analyze student cognitive data, build a regression model to predict assessment scores, and perform clustering to identify learning personas.
It avoids heavier external dependencies such as scikit-learn: the regression and clustering are implemented with NumPy and pandas only (Matplotlib is imported solely for plot styling).

## 🔧 Step 1: Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Apply Matplotlib's 'seaborn-v0_8' style sheet to every figure drawn after this point.
# NOTE(review): this style name presumably only exists in newer Matplotlib
# releases (>= 3.6) — confirm against the environment this notebook targets.
plt.style.use('seaborn-v0_8')

## 📂 Step 2: Load Student Dataset

In [2]:
# Column names shared by every student record, in display order.
STUDENT_FIELDS = (
    "student_id", "name", "class",
    "comprehension", "attention", "focus", "retention",
    "assessment_score", "engagement_time",
)

# One tuple per student, laid out in STUDENT_FIELDS order.
student_rows = [
    (1, "Alice",   "A", 0.8, 0.7,  0.9,  0.85, 92, 80),
    (2, "Bob",     "A", 0.6, 0.65, 0.7,  0.6,  75, 60),
    (3, "Charlie", "B", 0.9, 0.85, 0.95, 0.88, 95, 90),
    (4, "David",   "B", 0.4, 0.5,  0.45, 0.5,  55, 40),
    (5, "Eva",     "C", 0.7, 0.75, 0.8,  0.7,  82, 70),
]

# Expand the compact rows into the list-of-dicts form the frame is built from.
students = [dict(zip(STUDENT_FIELDS, row)) for row in student_rows]

df = pd.DataFrame(students)
df

Unnamed: 0,student_id,name,class,comprehension,attention,focus,retention,assessment_score,engagement_time
0,1,Alice,A,0.8,0.7,0.9,0.85,92,80
1,2,Bob,A,0.6,0.65,0.7,0.6,75,60
2,3,Charlie,B,0.9,0.85,0.95,0.88,95,90
3,4,David,B,0.4,0.5,0.45,0.5,55,40
4,5,Eva,C,0.7,0.75,0.8,0.7,82,70


## 📈 Step 3: Linear Regression (Using Normal Equation)

In [3]:
# Feature matrix (one row per student) and the target score vector.
skill_columns = ["comprehension", "attention", "focus", "retention"]
X = df[skill_columns].values
y = df["assessment_score"].values

# Design matrix: prepend a column of ones so theta[0] acts as the intercept.
X_b = np.hstack([np.ones((X.shape[0], 1)), X])

# Normal equation, theta = pinv(X_b^T X_b) . X_b^T . y
# (pinv rather than inv so a singular Gram matrix does not raise).
gram_pinv = np.linalg.pinv(X_b.T.dot(X_b))
theta = gram_pinv.dot(X_b.T).dot(y)

intercept = theta[0]
coefficients = theta[1:]
predictions = X_b.dot(theta)

# Goodness of fit on the training data itself (there is no held-out set here).
# NOTE: with the 5-row dataset defined in this notebook and 5 fitted parameters
# (4 skills + intercept), the model can interpolate the data exactly, so a
# near-zero MSE / R^2 of 1 is expected and says nothing about generalization.
residuals = y - predictions
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
mse = np.mean(residuals ** 2)
r_squared = 1 - (ss_res / ss_tot)

for label, value in (
    ("Intercept:", intercept),
    ("Coefficients:", coefficients),
    ("MSE:", mse),
    ("R^2:", r_squared),
):
    print(label, value)

Intercept: 26.941176470565864
Coefficients: [ 33.1372549  -23.92156863  72.54901961 -11.76470588]
MSE: 3.227327675988501e-21
R^2: 1.0


## 🎯 Step 4: Prediction Results

In [4]:
# Build one summary record per student, pairing the actual score with the
# model's prediction.
#
# Rows are paired with `predictions` by POSITION (zip), not by index label:
# `predictions` is a plain NumPy array in row order, so looking it up with the
# DataFrame's index label would only be correct for a default 0..n-1 index.
assert len(predictions) == len(df), "predictions must have one entry per row of df"

prediction_results = []
for student_id, name, actual, predicted in zip(
    df["student_id"], df["name"], df["assessment_score"], predictions
):
    diff = actual - predicted
    # "accuracy" is 100 minus the absolute error in score points, floored at 0.
    accuracy = max(0, 100 - abs(diff))
    prediction_results.append({
        "student_id": student_id,
        "name": name,
        "actualScore": actual,
        "predictedScore": round(predicted, 2),
        "difference": round(diff, 2),
        "accuracy": round(accuracy, 2)
    })

pd.DataFrame(prediction_results)

Unnamed: 0,student_id,name,actualScore,predictedScore,difference,accuracy
0,1,Alice,92,92.0,0.0,100.0
1,2,Bob,75,75.0,0.0,100.0
2,3,Charlie,95,95.0,0.0,100.0
3,4,David,55,55.0,0.0,100.0
4,5,Eva,82,82.0,0.0,100.0


## 🤖 Step 5: K-Means Clustering (NumPy Implementation)

In [5]:
def kmeans(X, k, max_iters=100, seed=42):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; converted to a float array internally.
    k : int
        Number of clusters, ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Maximum number of assign/update iterations.
    seed : int or None, default 42
        Seed for the *local* random generator that picks the initial centroids
        (``k`` distinct rows of ``X``). The default reproduces the selection a
        globally seeded ``np.random.seed(42)`` would make, but without touching
        NumPy's global random state. ``None`` gives a non-deterministic start.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Index of the nearest returned centroid for every row of ``X``.
    centroids : ndarray of shape (k, n_features)
        Final centroid coordinates.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``1..n_samples``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}")

    # A private RandomState keeps the call reproducible without reseeding the
    # process-wide generator as a side effect.
    rng = np.random.RandomState(seed)
    centroids = X[rng.choice(n_samples, k, replace=False)]

    def assign(current_centroids):
        # (n_samples, 1, n_features) - (k, n_features) broadcasts to
        # (n_samples, k, n_features); the norm over the last axis gives the
        # point-to-centroid distance matrix.
        distances = np.linalg.norm(X[:, None] - current_centroids, axis=2)
        return np.argmin(distances, axis=1)

    for _ in range(max_iters):
        labels = assign(centroids)

        # Start from the previous centroids so a cluster that lost all of its
        # members keeps its position instead of turning into NaN (mean of an
        # empty slice), which would poison every later iteration.
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids

    # Recompute once so the labels always match the returned centroids, also
    # when the iteration cap was hit or max_iters == 0.
    labels = assign(centroids)
    return labels, centroids

# Cluster the students on their four cognitive-skill scores.
cluster_feature_columns = ["comprehension", "attention", "focus", "retention"]
features = df.loc[:, cluster_feature_columns].values
clusters, centroids = kmeans(features, k=3)

# Record each student's cluster id next to their data, then display the frame.
df["cluster"] = clusters
df

Unnamed: 0,student_id,name,class,comprehension,attention,focus,retention,assessment_score,engagement_time,cluster
0,1,Alice,A,0.8,0.7,0.9,0.85,92,80,2
1,2,Bob,A,0.6,0.65,0.7,0.6,75,60,0
2,3,Charlie,B,0.9,0.85,0.95,0.88,95,90,2
3,4,David,B,0.4,0.5,0.45,0.5,55,40,0
4,5,Eva,C,0.7,0.75,0.8,0.7,82,70,1


## 🔍 Step 6: Insights & Observations

In [6]:
# Display names for the regression features, in the same order as the
# coefficient vector.
feature_names = ["Comprehension", "Attention", "Focus", "Retention"]

# The skill whose coefficient has the largest magnitude is reported as the
# most influential one.
most_important_idx = np.abs(coefficients).argmax()
most_important = feature_names[most_important_idx]

# Mean of the per-student "accuracy" values computed for the prediction table.
accuracies = [entry['accuracy'] for entry in prediction_results]
avg_accuracy = np.mean(accuracies)

# (cluster id, member count) of the most populated cluster. On a tie,
# Counter.most_common returns the cluster that was encountered first.
cluster_counts = Counter(df['cluster'])
largest_group = cluster_counts.most_common(1)[0]

insights = [
    f"The regression model achieves {avg_accuracy:.1f}% average prediction accuracy with R² = {r_squared*100:.1f}%.",
    f"{most_important} is the most influential cognitive skill for predicting scores.",
    f'Clustering shows the largest group has {largest_group[1]} students in cluster {largest_group[0]}',
]

for line in insights:
    print('-', line)

- The regression model achieves 100.0% average prediction accuracy with R² = 100.0%.
- Focus is the most influential cognitive skill for predicting scores.
- Clustering shows the largest group has 2 students in cluster 2
