In [107]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

def assign_grade(gender, race_ethnicity, parental_education, lunch, test_preparation_course):
    """Return a rule-based grade label for one student.

    Only ``parental_education`` and ``test_preparation_course`` influence the
    result; ``gender``, ``race_ethnicity`` and ``lunch`` are accepted but not
    used by any rule.

    Rules, checked in order (each also requires the prep course to be
    ``'completed'``):
        master's degree                  -> 'Superior'
        bachelor's degree                -> 'Excellent'
        associate's degree, some college -> 'Good'

    Anything else yields 'Failure'.
    """
    # Ordered from the best grade downwards; the first matching tier wins.
    grade_tiers = (
        ('Superior', ("master's degree",)),
        ('Excellent', ("bachelor's degree",)),
        ('Good', ("associate's degree", "some college")),
    )

    for grade_label, education_levels in grade_tiers:
        if parental_education in education_levels and test_preparation_course == 'completed':
            return grade_label

    # No high-performer rule matched
    return 'Failure'


In [108]:
# Load dataset
# Raw string literal: the Windows path contains "\s" and "\g", which are not
# valid escape sequences and trigger a warning in a normal string literal.
# The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider a
# project-relative location so the notebook runs on other machines.
csv_path = r"D:\student-classification-model\graded_exams.csv"

# read_csv already returns a fresh DataFrame, so no defensive copy is needed
df = pd.read_csv(csv_path)

# Features (excluding actual grade columns); drop() returns a new frame
features = df.drop(columns=['math grade', 'reading grade', 'writing grade'])

# Target variables (actual dataset grades)
target_math = df['math grade']
target_reading = df['reading grade']
target_writing = df['writing grade']

In [109]:
# Two-stage split into train / test / unseen partitions
def split_data(features, target_math, target_reading, target_writing):
    """Split the features and the three grade targets into 70/20/10 partitions.

    The data is first split 70% train / 30% held out, then the held-out part
    is split again so that one third of it (10% of the total) becomes the
    "unseen" partition and the rest (20% of the total) the test partition.
    Both splits use ``random_state=42``.

    Returns a 12-tuple ordered as
    ``(X_train, X_test, X_unseen,
       y_train_math, y_test_math, y_unseen_math,
       y_train_reading, y_test_reading, y_unseen_reading,
       y_train_writing, y_test_writing, y_unseen_writing)``.
    """
    # Stage 1: train_test_split returns [a_train, a_rest, b_train, b_rest, ...]
    stage_one = train_test_split(
        features, target_math, target_reading, target_writing,
        test_size=0.3, random_state=42,
    )
    train_parts = stage_one[0::2]   # X, math, reading, writing (70%)
    held_out = stage_one[1::2]      # X, math, reading, writing (30%)

    # Stage 2: carve the held-out 30% into test (2/3) and unseen (1/3)
    stage_two = train_test_split(*held_out, test_size=1/3, random_state=42)
    test_parts = stage_two[0::2]
    unseen_parts = stage_two[1::2]

    # Interleave so each array's train/test/unseen pieces sit side by side
    return tuple(
        piece
        for trio in zip(train_parts, test_parts, unseen_parts)
        for piece in trio
    )

# Run the split and unpack the twelve resulting partitions
(
    X_train, X_test, X_unseen,
    y_train_math, y_test_math, y_unseen_math,
    y_train_reading, y_test_reading, y_unseen_reading,
    y_train_writing, y_test_writing, y_unseen_writing,
) = split_data(features, target_math, target_reading, target_writing)

In [110]:
# Apply the rule-based classifier to X_test only.
# assign_grade() receives no subject-specific input, so its output is the same
# for math, reading and writing. Compute the predictions once instead of
# running the identical row-wise apply three times.
y_pred_test = X_test.apply(
    lambda row: assign_grade(
        row["gender"],
        row["race/ethnicity"],
        row["parental level of education"],
        row["lunch"],
        row["test preparation course"],
    ),
    axis=1,
)

# Independent copies so later per-subject changes cannot affect each other
y_pred_math = y_pred_test.copy()
y_pred_reading = y_pred_test.copy()
y_pred_writing = y_pred_test.copy()

# Evaluation function
def evaluate_rule_based(y_true, y_pred, target_name):
    """Print accuracy and a per-class report for the rule-based predictions.

    Parameters
    ----------
    y_true : array-like
        Actual grade labels.
    y_pred : array-like
        Grade labels produced by the rule-based classifier.
    target_name : str
        Subject name used in the printed heading (e.g. "Math").

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"{target_name} Grade Prediction (Rule-Based Classifier)")
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    # Some classes are never predicted, which makes their precision undefined.
    # zero_division=0 reports those entries as 0.0 without emitting a warning
    # for each affected metric.
    print(classification_report(y_true, y_pred, zero_division=0))
    print()

# Score the rule-based predictions against the ACTUAL grades, subject by subject
for subject_name, actual_grades, predicted_grades in (
    ("Math", y_test_math, y_pred_math),
    ("Reading", y_test_reading, y_pred_reading),
    ("Writing", y_test_writing, y_pred_writing),
):
    evaluate_rule_based(actual_grades, predicted_grades, subject_name)

Math Grade Prediction (Rule-Based Classifier)
Accuracy: 0.52
               precision    recall  f1-score   support

Above Average       0.00      0.00      0.00         7
      Average       0.00      0.00      0.00        13
Below Average       0.00      0.00      0.00        19
    Excellent       0.12      0.17      0.14         6
      Failure       0.66      0.81      0.73       121
         Good       0.10      0.44      0.16         9
      Passing       0.00      0.00      0.00        19
     Superior       0.00      0.00      0.00         6

     accuracy                           0.52       200
    macro avg       0.11      0.18      0.13       200
 weighted avg       0.41      0.52      0.45       200


Reading Grade Prediction (Rule-Based Classifier)
Accuracy: 0.46
               precision    recall  f1-score   support

Above Average       0.00      0.00      0.00        19
      Average       0.00      0.00      0.00        20
Below Average       0.00      0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# NOTE(review): `sns`, `plt` and `shuffled_data` are not defined in any cell
# visible here — confirm that an earlier cell imports/creates them.

# Seaborn theme used for the panels below
sns.set_style("whitegrid")

# One row of three scatter panels
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (x column, y column, hue column, title, x label, y label) for each panel
panel_specs = [
    ("math score", "reading score", "math grade",
     "Math vs. Reading Scores", "Math Score", "Reading Score"),
    ("math score", "writing score", "math grade",
     "Math vs. Writing Scores", "Math Score", "Writing Score"),
    ("reading score", "writing score", "reading grade",
     "Reading vs. Writing Scores", "Reading Score", "Writing Score"),
]

for ax, (x_col, y_col, hue_col, title, x_label, y_label) in zip(axes, panel_specs):
    sns.scatterplot(x=shuffled_data[x_col], y=shuffled_data[y_col],
                    hue=shuffled_data[hue_col], palette="coolwarm", ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

# Tidy the spacing between panels and render the figure
plt.tight_layout()
plt.show()

In [None]:
# Function to evaluate unseen data
def evaluate_on_unseen(y_true, y_pred, subject):
    """Print metrics and draw a confusion-matrix heatmap for one subject.

    Parameters
    ----------
    y_true : array-like
        Actual grade labels.
    y_pred : array-like
        Predicted grade labels.
    subject : str
        Subject name used in the printed heading and the plot title.

    Returns
    -------
    None
        Metrics are printed and the heatmap is shown via ``plt.show()``.
    """
    # NOTE(review): `plt` and `sns` are not imported in any cell visible
    # here — confirm that an earlier cell imports them.
    print(f"\n{subject} Evaluation on Unseen Data")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    # zero_division=0: classes that are never predicted are reported as 0.0
    # without a warning per undefined metric.
    print(classification_report(y_true, y_pred, zero_division=0))

    # Confusion Matrix
    # Use every label seen in either the truth or the predictions. With labels
    # taken from y_true alone, samples predicted as a class that is absent
    # from y_true would be left out of the matrix.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Confusion Matrix for {subject}")
    plt.show()

# Apply the rule-based classifier to the held-out "unseen" split.
# assign_grade() takes (gender, race_ethnicity, parental_education, lunch,
# test_preparation_course); pass all five in that order. The predictions go
# into their own variable so the actual y_unseen_* grades from the split are
# not overwritten.
y_pred_unseen = X_unseen.apply(
    lambda row: assign_grade(
        row["gender"],
        row["race/ethnicity"],
        row["parental level of education"],
        row["lunch"],
        row["test preparation course"],
    ),
    axis=1,
)

# Evaluate on unseen dataset: actual grades vs. rule-based predictions.
# The classifier has no subject-specific input, so the same predictions are
# compared against each subject's actual grades.
evaluate_on_unseen(y_unseen_math, y_pred_unseen, "Math")
evaluate_on_unseen(y_unseen_reading, y_pred_unseen, "Reading")
evaluate_on_unseen(y_unseen_writing, y_pred_unseen, "Writing")