In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Read the exam results into the working DataFrame used by every later cell.
# Later cells read the 'math score', 'reading score' and 'writing score' columns.
DATA_PATH = 'cleaned_exams.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Grade bands as (inclusive lower bound, label), highest band first.
_GRADE_BANDS = (
    (97, 'Excellent'),
    (93, 'Superior'),
    (89, 'Good'),
    (85, 'Above Average'),
    (81, 'Average'),
    (77, 'Below Average'),
    (73, 'Passing'),
)


def assign_grade(score):
    """Map a numeric exam score to its descriptive grade label.

    Each band is defined by its inclusive lower bound, so fractional scores
    are graded by the band they fall into (e.g. 96.5 -> 'Superior') instead
    of slipping through the gaps between integer ranges and being labelled
    'Failure'.

    Parameters
    ----------
    score : int or float
        Exam score, expected on a 0-100 scale.

    Returns
    -------
    str
        One of 'Excellent', 'Superior', 'Good', 'Above Average', 'Average',
        'Below Average', 'Passing' or 'Failure'. Scores below 73, scores
        above 100 and NaN all map to 'Failure'.
    """
    # `not score <= 100` is True both for out-of-range scores and for NaN
    # (every comparison with NaN is False); neither is promoted to a band.
    if not score <= 100:
        return 'Failure'

    for lower_bound, label in _GRADE_BANDS:
        if score >= lower_bound:
            return label
    return 'Failure'

# Derive a '<subject>_grade' label column from each '<subject> score' column
for subject_name in ('math', 'reading', 'writing'):
    df[f'{subject_name}_grade'] = df[f'{subject_name} score'].apply(assign_grade)

In [None]:
def _evaluate_split(model, features, y_true, class_names, *, header,
                    accuracy_label, metric_prefix, title, cmap):
    """Print summary metrics and plot a labelled confusion matrix for one split.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``, ``predict_proba`` and ``classes_``
    features : feature matrix of the split being evaluated
    y_true : encoded true labels of the split
    class_names : original label names, indexed by encoded label
    header : line printed before the metrics
    accuracy_label : text printed in front of the accuracy value
    metric_prefix : text prepended to the precision / recall / ROC-AUC labels
    title : confusion-matrix plot title
    cmap : colormap name passed to the heatmap
    """
    y_pred = model.predict(features)
    y_proba = model.predict_proba(features)

    print(header)
    print(accuracy_label, round(accuracy_score(y_true, y_pred), 4))
    # zero_division=0 yields the same value as the default for classes that are
    # never predicted / never present, without emitting a warning per call.
    print(f"{metric_prefix}Precision:",
          round(precision_score(y_true, y_pred, average='weighted', zero_division=0), 4))
    print(f"{metric_prefix}Recall:",
          round(recall_score(y_true, y_pred, average='weighted', zero_division=0), 4))
    # `labels` states explicitly which class each predict_proba column refers to.
    print(f"{metric_prefix}ROC-AUC:",
          round(roc_auc_score(y_true, y_proba, multi_class='ovr', labels=model.classes_), 4))

    # Fix the label set so the matrix always has one row/column per class, in
    # the same order as `class_names`, even if a class is missing in this split.
    encoded_labels = np.arange(len(class_names))
    matrix = confusion_matrix(y_true, y_pred, labels=encoded_labels)

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap,
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title)
    plt.show()


def process_subject(subject, data=None, random_state=42):
    """Train and evaluate a Gaussian Naive Bayes grade classifier for one subject.

    The data is split 70% / 20% / 10% into train, test and "unseen" parts
    (stratified on the grade). A 10-fold stratified cross-validation accuracy
    is printed for the training part, then the model is fitted on it and
    accuracy, weighted precision/recall, one-vs-rest ROC-AUC and a confusion
    matrix are reported for the test and unseen parts.

    Parameters
    ----------
    subject : str
        Subject prefix, e.g. 'math'. The target is the ``'<subject>_grade'``
        column.
    data : pandas.DataFrame, optional
        Frame to model. Defaults to the notebook-level ``df``.
    random_state : int, default 42
        Seed used for both splits and for the cross-validation shuffling.

    Returns
    -------
    None
        Results are printed and plotted.

    Raises
    ------
    KeyError
        If ``'<subject>_grade'`` is not a column of the frame.
    """
    frame = df if data is None else data
    grade_column = f'{subject}_grade'
    score_column = f'{subject} score'

    target = frame[grade_column]

    # The grade is computed directly from the subject's own raw score, so
    # leaving that score among the features leaks the target and makes every
    # metric meaningless. Drop it together with the grade column.
    X = frame.drop(columns=[grade_column, score_column], errors='ignore')
    X = pd.get_dummies(X, drop_first=True)  # One-hot encode categorical features

    # Encode target labels; classes_ keeps the original names for the plots
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(target)
    class_names = label_encoder.classes_

    # Split 70% training, 20% testing, 10% unseen
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=random_state, stratify=y)
    X_test, X_unseen, y_test, y_unseen = train_test_split(
        X_temp, y_temp, test_size=1/3, random_state=random_state, stratify=y_temp)

    # Scaling lives inside the pipeline so that every cross-validation fold
    # fits the scaler on its own training portion only, and the final model
    # scales test/unseen data with statistics learned from the training set.
    model = make_pipeline(StandardScaler(), GaussianNB())

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    cross_val_acc = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"Cross-validation Accuracy (Mean) for {subject}: {np.mean(cross_val_acc):.4f}")

    # Train final model on the full training part
    model.fit(X_train, y_train)

    _evaluate_split(
        model, X_test, y_test, class_names,
        header=f"\n{subject} Test Evaluation:",
        accuracy_label="Test Accuracy:",
        metric_prefix="",
        title=f"Confusion Matrix - {subject} Test Set",
        cmap='Greens',
    )

    _evaluate_split(
        model, X_unseen, y_unseen, class_names,
        header=f"\n{subject} Unseen Data Evaluation:",
        accuracy_label="Unseen Accuracy:",
        metric_prefix="Unseen ",
        title=f"Confusion Matrix - {subject} Unseen Data",
        cmap='Oranges',
    )

In [None]:
# Run the full train / evaluate workflow once per exam subject
SUBJECTS = ('math', 'reading', 'writing')
for subject in SUBJECTS:
    process_subject(subject)