In [None]:


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from pathlib import Path

# Load the synthetic student dataset from the ../public directory
# (the path is relative to the current working directory).
DATA_DIR = Path('../public')
csv_path = DATA_DIR.joinpath('synthetic_students.csv')
df = pd.read_csv(csv_path)

# Force every analysis column to a numeric dtype. Values that cannot be
# parsed are coerced to NaN, and all NaNs are then replaced with 0.
# NOTE(review): zero-imputation feeds into every statistic computed from
# these columns -- confirm 0 is an acceptable stand-in for missing data.
num_cols = [
    'comprehension',
    'attention',
    'focus',
    'retention',
    'assessment_score',
    'engagement_time',
]
coerced = df[num_cols].apply(pd.to_numeric, errors='coerce')
df[num_cols] = coerced.fillna(0)

# Quick sanity check: row count and the first two records.
print('Rows:', len(df))
print(df.head(2))

# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap with the colour scale pinned to the full [-1, 1] range.
corr = df[num_cols].corr()
heat_fig, heat_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    corr,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=heat_ax,
)
heat_ax.set_title('Correlation Matrix')
heat_fig.tight_layout()
plt.show()

# One scatter panel per skill column, each plotted against assessment_score.
skill_cols = ('attention', 'focus', 'retention')
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax, col in zip(axes, skill_cols):
    # Semi-transparent markers keep overlapping points distinguishable.
    ax.scatter(df[col], df['assessment_score'], alpha=0.5)
    ax.set(xlabel=col, ylabel='assessment_score')
fig.tight_layout()
plt.show()

# Baseline model: linear regression predicting assessment_score from the
# four skill columns plus engagement_time.
feature_cols = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X = df[feature_cols]
y = df['assessment_score']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)

# Report held-out fit quality and the fitted weight of each feature.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rounded_coefs = np.round(linreg.coef_, 3)
print('R2:', round(r2, 3))
print('MAE:', round(mae, 2))
print('Coefficients:', dict(zip(X.columns, rounded_coefs)))

# Clustering: standardise the feature matrix X (the skill columns plus
# engagement_time) and partition the rows into four KMeans clusters.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
clusters = kmeans.fit_predict(X_scaled)

# Human-readable label for each cluster id.
# NOTE(review): KMeans cluster ids carry no inherent meaning, so this
# id -> name pairing presumably reflects the clusters found for one specific
# dataset and seed -- re-verify it whenever the data or parameters change.
persona_names = {
    0: 'Engaged Achiever',
    1: 'Independent Learner',
    2: 'Active but Forgetful',
    3: 'Needs Guidance',
}

# Ids without a configured name fall back to a generic "Cluster <id>" label.
df['persona'] = [persona_names.get(c, f'Cluster {c}') for c in clusters]

# Persist the persona-annotated frame for the dashboard, without writing the
# DataFrame index as a column.
out_path = DATA_DIR.joinpath('synthetic_students_with_personas.csv')
df.to_csv(out_path, index=False)
print('Wrote', out_path)

# Bar chart of how many rows were assigned to each persona label.
persona_counts = df['persona'].value_counts()
dist_fig, dist_ax = plt.subplots(figsize=(5, 3))
persona_counts.plot.bar(ax=dist_ax)
dist_ax.set_title('Persona Distribution')
dist_ax.set_ylabel('Count')
dist_fig.tight_layout()
plt.show()
