In [None]:
# Classification model (baseline)
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the student records, then add each row's mean over the three exam
# scores; the target grade is derived from this average.
score_columns = ['math score', 'reading score', 'writing score']
df = pd.read_csv('data/student_performance.csv')
df['average_score'] = df[score_columns].mean(axis='columns')
def score_to_grade(s):
    """Return the letter grade for the score *s*.

    Cutoffs are inclusive lower bounds: 85 and up is 'A', 70 and up is 'B',
    50 and up is 'C'. Anything that clears none of the cutoffs is 'D'.
    """
    # Checked from the highest cutoff down, so the first hit is the best grade.
    cutoffs = ((85, 'A'), (70, 'B'), (50, 'C'))
    for minimum, letter in cutoffs:
        if s >= minimum:
            return letter
    return 'D'
# Label every row with the letter grade implied by its average score.
df['grade'] = df['average_score'].map(score_to_grade)

# Cast whichever of the known categorical columns are present to str.
# This is only a dtype normalisation; none of these columns is used as a
# feature below.
cat_cols = ['gender','race/ethnicity','parental level of education','lunch','test preparation course']
for col in [name for name in cat_cols if name in df.columns]:
    df[col] = df[col].astype(str)

# Baseline feature matrix: just the three raw exam scores.
# NOTE: 'grade' is computed from the mean of these same three columns, so the
# target is a deterministic function of the features.
feature_columns = ['math score', 'reading score', 'writing score']
X = df.loc[:, feature_columns].copy()
y = df['grade']

# Turn the grade letters into integer class ids; the fitted encoder is kept
# so the letters can be recovered for reporting.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# Hold out 20% of the rows for evaluation. Stratifying on the encoded grade
# keeps the class proportions the same in both splits; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)

# Fit the baseline random forest (200 trees, seeded) and predict the held-out rows.
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

# Evaluation artefacts: overall accuracy, a per-class report labelled with the
# original grade letters, and the raw confusion matrix (plotted further down).
acc = accuracy_score(y_test, pred)
report = classification_report(y_test, pred, target_names=le.classes_)
cm = confusion_matrix(y_test, pred)

print('Accuracy:', acc)
print(report)

# Persist the model together with its label encoder so predictions can be
# mapped back to grade letters after loading. The output directory is created
# first: joblib.dump does not create missing parent directories, and without
# this the script fails here on a fresh checkout, after training has finished.
os.makedirs('models', exist_ok=True)
joblib.dump({'model': clf, 'label_encoder': le}, 'models/grade_classifier.pkl')

# Save the confusion matrix as an annotated heatmap under reports/.
# (matplotlib.pyplot and os are imported at the top of the file, so they are
# not re-imported here.)
os.makedirs('reports', exist_ok=True)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=le.classes_, yticklabels=le.classes_)
# confusion_matrix puts the true classes on rows and the predicted classes on
# columns; label the axes so the plot can be read without knowing that.
plt.xlabel('Predicted grade')
plt.ylabel('True grade')
plt.title('Confusion Matrix')
plt.savefig('reports/confusion_matrix.png', bbox_inches='tight')
# Close the figure so it is released and not re-rendered by later plotting code.
plt.close()
