In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlxtend.frequent_patterns import apriori, association_rules

# Fix the file path to use proper escape characters
csv_path = r"graded_exams.csv"  # Using raw string to avoid escape sequence issues
df = pd.read_csv(csv_path)

# Ensure that we don't modify original data
df = df.copy()

In [13]:
# Features (excluding actual grade columns)
features = df.drop(columns=['math grade', 'reading grade', 'writing grade'])

# Target variables
target_math = df['math grade']
target_reading = df['reading grade']
target_writing = df['writing grade']

# Reset indices to avoid potential index mismatches
features.reset_index(drop=True, inplace=True)
target_math.reset_index(drop=True, inplace=True)
target_reading.reset_index(drop=True, inplace=True)
target_writing.reset_index(drop=True, inplace=True)

In [14]:
def split_data(features, target_math, target_reading, target_writing):
    # First split: Training (70%) and Temp (30%)
    X_train, X_temp, y_train_math, y_temp_math, y_train_reading, y_temp_reading, y_train_writing, y_temp_writing = train_test_split(
        features, target_math, target_reading, target_writing, test_size=0.3, random_state=42
    )

    # Second split: Testing (20%) and Unseen (10%) from Temp (30%)
    X_test, X_unseen, y_test_math, y_unseen_math, y_test_reading, y_unseen_reading, y_test_writing, y_unseen_writing = train_test_split(
        X_temp, y_temp_math, y_temp_reading, y_temp_writing, test_size=1/3, random_state=42
    )

    return (
        X_train, X_test, X_unseen,
        y_train_math, y_test_math, y_unseen_math,
        y_train_reading, y_test_reading, y_unseen_reading,
        y_train_writing, y_test_writing, y_unseen_writing
    )

# Apply split
X_train, X_test, X_unseen, y_train_math, y_test_math, y_unseen_math, y_train_reading, y_test_reading, y_unseen_reading, y_train_writing, y_test_writing, y_unseen_writing = split_data(features, target_math, target_reading, target_writing)

In [None]:
from collections import Counter

def analyze_feature_combinations(X_data, y_data, features_to_analyze, min_samples=10):
    """
    This function enhances a rule-based classifier by identifying common feature 
    combinations that lead to specific grade outcomes. Similar to how **Apriori** is used in 
    association rule mining to discover frequent itemsets and generate rules from transaction data, 
    `analyze_feature_combinations` also identifies patterns, but with a focus on classification.

    The function works by grouping the data based on selected features and determining the most 
    frequently occurring grade for each feature combination. This process helps uncover patterns 
    that strongly correlate with specific grades, much like how Apriori identifies frequent itemsets. 
    By associating these feature combinations with the most common grade, the function builds a 
    structured lookup table that aids in classification. This lookup table acts like a decision rule 
    system for the classifier, mapping feature combinations to the most common grade observed for each 
    combination, along with the distribution of grades.

    While **Apriori** generates itemsets and association rules, which are typically used for uncovering 
    relationships in transactional data, `analyze_feature_combinations` creates decision rules in a 
    classification context. The key difference is that **Apriori** is focused on finding item associations 
    in the dataset, whereas `analyze_feature_combinations` is specifically designed to improve a 
    **rule-based classifier** by associating feature combinations with outcome predictions, like grades. 
    This enhances the classifier's ability to make predictions by forming structured, human-readable 
    decision rules that can be easily interpreted and applied.

    In summary, while both methods identify patterns in data, **Apriori** is used for association rule mining 
    and itemset discovery, and `analyze_feature_combinations` enhances rule-based classification by creating 
    a structured decision-making framework based on feature combinations.
    """
    results = {}

    # Create a copy of X_data and add the target variable
    data = X_data.copy()
    data['grade'] = y_data  # Attach the target variable for analysis

    # Group dataset by the selected features
    grouped = data.groupby(features_to_analyze)

    for name, group in grouped:
        # Ensure only meaningful samples are considered
        if len(group) >= min_samples:
            # Count occurrences of each grade
            grade_counts = Counter(group['grade'])
            
            # Identify the most frequently occurring grade
            if grade_counts:
                most_common_grade = grade_counts.most_common(1)[0][0]
                
                # Store the results, ensuring consistency in key format
                if isinstance(name, tuple):
                    results[name] = (most_common_grade, grade_counts)
                else:
                    results[(name,)] = (most_common_grade, grade_counts)
    
    return results


In [16]:
class AdvancedRuleClassifier:
    def __init__(self, subject):
        self.subject = subject
        self.rules = []
        self.default_grade = None
        
    def build(self, X_train, y_train):
        """
        Build a hierarchy of rules based on the training data
        """
        # First level: Parental education + test prep
        education_test_prep_rules = analyze_feature_combinations(
            X_train, y_train, 
            ['parental level of education', 'test preparation course'],
            min_samples=10
        )
        
        # Second level: Parental education + test prep + gender
        education_test_prep_gender_rules = analyze_feature_combinations(
            X_train, y_train, 
            ['parental level of education', 'test preparation course', 'gender'],
            min_samples=8
        )
        
        # Third level: Parental education + test prep + lunch
        education_test_prep_lunch_rules = analyze_feature_combinations(
            X_train, y_train, 
            ['parental level of education', 'test preparation course', 'lunch'],
            min_samples=8
        )
        
        # Fourth level: Parental education + test prep + race/ethnicity
        education_test_prep_race_rules = analyze_feature_combinations(
            X_train, y_train, 
            ['parental level of education', 'test preparation course', 'race/ethnicity'],
            min_samples=5
        )
        
        # Store rules in order of specificity (most specific first)
        self.rules = [
            (education_test_prep_race_rules, ['parental level of education', 'test preparation course', 'race/ethnicity']),
            (education_test_prep_lunch_rules, ['parental level of education', 'test preparation course', 'lunch']),
            (education_test_prep_gender_rules, ['parental level of education', 'test preparation course', 'gender']),
            (education_test_prep_rules, ['parental level of education', 'test preparation course'])
        ]
        
    def predict(self, X):
        """
        Predict grades using the rule hierarchy
        """
        predictions = []
        
        for _, instance in X.iterrows():
            prediction = None
            
            # Try each rule level
            for rule_set, features in self.rules:
                if prediction:
                    break
                
                feature_values = tuple(instance[feature] for feature in features)
                
                if feature_values in rule_set:
                    prediction = rule_set[feature_values][0]
            
            # Use default grade if no rules match
            if not prediction:
                prediction = self.default_grade
                
            predictions.append(prediction)
        
        return predictions

In [17]:
# Train advanced rule-based classifiers for each subject
math_classifier = AdvancedRuleClassifier("Math")
math_classifier.build(X_train, y_train_math)

reading_classifier = AdvancedRuleClassifier("Reading")
reading_classifier.build(X_train, y_train_reading)

writing_classifier = AdvancedRuleClassifier("Writing")
writing_classifier.build(X_train, y_train_writing)

In [18]:
# Make predictions
y_pred_math = math_classifier.predict(X_test)
y_pred_reading = reading_classifier.predict(X_test)
y_pred_writing = writing_classifier.predict(X_test)

In [19]:
def evaluate_classifier(y_true, y_pred, subject):
    print(f"Evaluation for {subject}:")

    # Ensure y_true and y_pred are Pandas Series with the same dtype
    y_pred = pd.Series(y_pred).astype(str).reset_index(drop=True)
    y_true = pd.Series(y_true).astype(str).reset_index(drop=True)

    # Fill NaN values
    y_true = y_true.fillna("Unknown")
    y_pred = y_pred.fillna("Unknown")

    # Ensure predictions contain only known classes
    valid_classes = set(y_true.unique())
    y_pred = y_pred.apply(lambda x: x if x in valid_classes else "Unknown")

    # Compute accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.2f}\n")

    # Print classification report
    print(classification_report(y_true, y_pred, zero_division=0))

# Evaluate the classifiers
evaluate_classifier(y_test_math, y_pred_math, "Math")
evaluate_classifier(y_test_reading, y_pred_reading, "Reading")
evaluate_classifier(y_test_writing, y_pred_writing, "Writing")


Evaluation for Math:
Accuracy: 0.00

               precision    recall  f1-score   support

Above Average       0.00      0.00      0.00       7.0
      Average       0.00      0.00      0.00      13.0
Below Average       0.00      0.00      0.00      19.0
    Excellent       0.00      0.00      0.00       6.0
      Failure       0.00      0.00      0.00     121.0
         Good       0.00      0.00      0.00       9.0
      Passing       0.00      0.00      0.00      19.0
     Superior       0.00      0.00      0.00       6.0
      Unknown       0.00      0.00      0.00       0.0

     accuracy                           0.00     200.0
    macro avg       0.00      0.00      0.00     200.0
 weighted avg       0.00      0.00      0.00     200.0

Evaluation for Reading:
Accuracy: 0.00

               precision    recall  f1-score   support

Above Average       0.00      0.00      0.00      19.0
      Average       0.00      0.00      0.00      20.0
Below Average       0.00      0.00     

In [None]:
def plot_confusion_matrix(y_true, y_pred, subject):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=set(y_true), yticklabels=set(y_true))
    
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title(f"Confusion Matrix for {subject}")
    plt.show()

# Confusion matrices for each subject
plot_confusion_matrix(y_test_math, y_pred_math, "Math")
plot_confusion_matrix(y_test_reading, y_pred_reading, "Reading")
plot_confusion_matrix(y_test_writing, y_pred_writing, "Writing")