In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from IPython.display import display

In [None]:
# Read graded_exams.csv (path is relative to the notebook's working directory)
csv_path = 'graded_exams.csv'
dataframe = pd.read_csv(csv_path)

In [None]:
# Preview the dataset as a rendered table; head() returns the first 5 rows by default
preview_rows = dataframe.head()
display(preview_rows)

In [None]:
# Column groups used throughout the analysis:
#   X_columns - student characteristics (rule antecedent candidates)
#   Y_columns - grade categories per subject (rule consequent candidates)
X_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
Y_columns = [
    'math grade',
    'reading grade',
    'writing grade',
]

In [None]:
# Mining thresholds: an itemset must occur in at least MIN_SUPPORT of the rows,
# and a rule must reach at least MIN_CONFIDENCE to be generated.
MIN_SUPPORT = 0.05
MIN_CONFIDENCE = 0.80

# One-hot encode X (characteristics) and Y (grade categories).
# dtype=bool is requested explicitly so the encoding is boolean on every pandas
# version (releases before 2.0 default to uint8, and mlxtend's apriori warns
# about non-bool input).
X_encoded = pd.get_dummies(dataframe[X_columns], dtype=bool)
Y_encoded = pd.get_dummies(dataframe[Y_columns], dtype=bool)

# Combine X and Y into a single DataFrame for Apriori
dataframe_encoded = pd.concat([X_encoded, Y_encoded], axis=1)

# Apply Apriori algorithm; use_colnames=True makes itemsets carry the dummy
# column names instead of positional column indices.
frequent_itemsets = apriori(dataframe_encoded, min_support=MIN_SUPPORT, use_colnames=True)

# Generate every rule that meets the confidence threshold. At this stage X and Y
# items may appear on either side of a rule; restricting to X → Y is done by the
# filtering step that follows.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=MIN_CONFIDENCE)

In [None]:
# Keep only rules of the form X → Y: every antecedent item must be one of the
# characteristic dummy columns and every consequent item one of the grade dummy columns.
X_labels = set(X_encoded.columns)
Y_labels = set(Y_encoded.columns)

antecedent_only_X = rules['antecedents'].apply(lambda items: X_labels.issuperset(items))
consequent_only_Y = rules['consequents'].apply(lambda items: Y_labels.issuperset(items))

rules_filtered = rules[antecedent_only_X & consequent_only_Y]

In [None]:
# Rank the X → Y rules from highest to lowest lift
rules_sorted = rules_filtered.sort_values("lift", ascending=False)

# Show the ten top-ranked rules with their key metrics
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
display(rules_sorted[summary_columns].head(10))

In [None]:
# Keep the rules whose consequent includes a failure in at least one subject.
# The items of each consequent frozenset are inspected directly rather than
# matching against the frozenset's string repr. astype(bool) keeps the mask a
# boolean indexer even when rules_sorted is empty (an empty apply() result has
# object dtype, which pandas would not treat as a boolean mask).
is_failure_rule = (
    rules_sorted['consequents']
    .apply(lambda consequent: any("Failure" in item for item in consequent))
    .astype(bool)
)
failure_rules = rules_sorted[is_failure_rule]

# Build a readable label per consequent by joining its items, so the chart does
# not show the raw "frozenset({...})" repr. Items are sorted so the same set
# always produces the same label.
failure_labels = failure_rules['consequents'].apply(
    lambda consequent: ", ".join(sorted(consequent))
)

# Group by failure category and sum lift values (higher lift means stronger relationship)
failure_lift_sums = failure_rules.groupby(failure_labels)['lift'].sum()

# Convert to percentages (each category's share of the total summed lift)
failure_percentages = (failure_lift_sums / failure_lift_sums.sum()) * 100

# Define colors (matplotlib cycles through them if there are more than three wedges)
colors = ['lightcoral', 'lightblue', 'lightgreen']

if failure_percentages.empty:
    # No rule has a failure consequent, so a pie chart would be blank and misleading.
    print("No association rules with a failure consequent were found; skipping the pie chart.")
else:
    # Create pie chart
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(
        failure_percentages,
        labels=failure_percentages.index,
        autopct='%1.1f%%',
        colors=colors,
        startangle=140,
        wedgeprops={'edgecolor': 'white'},
    )

    # Title
    ax.set_title("Failure Distribution Based on Characteristics with Strongest Relationships")

    # Show chart
    plt.show()