<a href="https://colab.research.google.com/github/VladimirBoshnjakovski/explainable-ai-thesis-code/blob/main/04_whitebox_RIPER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Install required libraries (if not already present)
!pip install pandas scikit-learn matplotlib seaborn

# Load core libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset from correct public URL
url = "https://raw.githubusercontent.com/nmiuddin/UCI-Heart-Disease-Dataset/master/data/heart-disease-UCI.csv"
df = pd.read_csv(url)

# Report the raw dimensions. A bare `df.head()` at this point would not be
# rendered (only the last expression of a cell is displayed); the preview is
# produced by the `df.head(10)` that ends this cell.
print(f"Shape: {df.shape}")

# Replace the terse source column names with descriptive, human-readable ones.
df = df.rename(columns={
    'age': 'Age',
    'sex': 'Sex (0=Female, 1=Male)',
    'cp': 'Chest Pain Type (4 Categories)',
    'trestbps': 'Resting Blood Pressure (mm Hg)',
    'chol': 'Serum Cholesterol (mg/dL)',
    'fbs': 'Fasting Blood Sugar > 120 mg/dL (1=Yes)',
    'restecg': 'Resting Electrocardiographic Results',
    'thalach': 'Maximum Heart Rate Achieved',
    'exang': 'Exercise-Induced Angina (1=Yes)',
    'oldpeak': 'ST Depression Induced by Exercise',
    'slope': 'Slope of the Peak Exercise ST Segment',
    'ca': 'Number of Major Vessels Colored by Fluoroscopy',
    'thal': 'Type of Thalassemia',
    'target': 'Presence of Heart Disease (1=Yes)'
})

# Remove rows where the number of vessels equals 4.
# The explicit .copy() makes the filtered frame independent of the frame it
# was sliced from, so the column assignments that follow operate on a frame
# of their own rather than on a potential view.
df = df[df['Number of Major Vessels Colored by Fluoroscopy'] != 4].copy()

# Columns that hold coded labels rather than measurements; these are stored
# with the pandas 'category' dtype.
categorical_columns = [
    'Sex (0=Female, 1=Male)',
    'Chest Pain Type (4 Categories)',
    'Fasting Blood Sugar > 120 mg/dL (1=Yes)',
    'Resting Electrocardiographic Results',
    'Exercise-Induced Angina (1=Yes)',
    'Slope of the Peak Exercise ST Segment',
    'Number of Major Vessels Colored by Fluoroscopy',
    'Type of Thalassemia',
    'Presence of Heart Disease (1=Yes)'
]

# Cast all of them in a single pass.
df = df.astype({name: 'category' for name in categorical_columns})

# List every column with its dtype, flagging the categorical ones.
for col in df.columns:
    dtype = df[col].dtype
    cat_flag = " (categorical)" if isinstance(dtype, pd.CategoricalDtype) else ""
    print(f"{col}: {dtype}{cat_flag}")

# Manually one-hot encode the categorical variables into human-readable
# 0/1 dummy columns.
#
# Layout: {source column: {new dummy column: integer code that switches it on}}.
# Dict insertion order determines the order in which the dummy columns are
# appended to the DataFrame.
dummy_specs = {
    # Sex (0 = Female, 1 = Male)
    'Sex (0=Female, 1=Male)': {
        'Sex: Female': 0,
        'Sex: Male': 1,
    },
    # Chest Pain Type: 0 = Typical Angina, 1 = Atypical Angina, 2 = Non-anginal pain, 3 = Asymptomatic
    'Chest Pain Type (4 Categories)': {
        'Chest Pain: Typical Angina': 0,
        'Chest Pain: Atypical Angina': 1,
        'Chest Pain: Non-Anginal': 2,
        'Chest Pain: Asymptomatic': 3,
    },
    # Fasting Blood Sugar > 120 mg/dL (1 = Yes, 0 = No)
    'Fasting Blood Sugar > 120 mg/dL (1=Yes)': {
        'Fasting Blood Sugar: Yes': 1,
        'Fasting Blood Sugar: No': 0,
    },
    # Resting ECG: 0 = Normal, 1 = ST Abnormality, 2 = Left Ventricular Hypertrophy
    'Resting Electrocardiographic Results': {
        'Resting ECG: Normal': 0,
        'Resting ECG: ST Abnormality': 1,
        'Resting ECG: Left Ventricular Hypertrophy': 2,
    },
    # Exercise-Induced Angina (1 = Yes, 0 = No)
    'Exercise-Induced Angina (1=Yes)': {
        'Exercise-Induced Angina: Yes': 1,
        'Exercise-Induced Angina: No': 0,
    },
    # Slope of the Peak Exercise ST Segment: 0 = Upsloping, 1 = Flat, 2 = Downsloping
    'Slope of the Peak Exercise ST Segment': {
        'ST Slope: Upsloping': 0,
        'ST Slope: Flat': 1,
        'ST Slope: Downsloping': 2,
    },
    # Thalassemia Type: 0 = Normal, 1 = Fixed Defect, 2 = Reversible Defect
    # NOTE(review): the saved preview of this cell contains rows whose three
    # Thalassemia dummies are all 0, which suggests this column holds a code
    # outside 0-2. Confirm the code-to-label mapping against the dataset
    # documentation before relying on these labels.
    'Type of Thalassemia': {
        'Thalassemia: Normal': 0,
        'Thalassemia: Fixed Defect': 1,
        'Thalassemia: Reversible Defect': 2,
    },
    # Number of Major Vessels Colored by Fluoroscopy: 0 to 3 (value 4 already removed)
    'Number of Major Vessels Colored by Fluoroscopy': {
        'Fluoroscopy: 0 Vessels': 0,
        'Fluoroscopy: 1 Vessel': 1,
        'Fluoroscopy: 2 Vessels': 2,
        'Fluoroscopy: 3 Vessels': 3,
    },
}

for source_col, code_by_dummy in dummy_specs.items():
    for dummy_col, code in code_by_dummy.items():
        # Vectorized equality test; every dummy becomes a plain integer 0/1 column.
        df[dummy_col] = (df[source_col] == code).astype(int)

    # Rows carrying a code that no dummy covers end up with 0 in every dummy
    # of the group. Report such codes instead of dropping them silently.
    unmapped_codes = set(df[source_col].dropna().unique()) - set(code_by_dummy.values())
    if unmapped_codes:
        print(f"WARNING: '{source_col}' has codes without a dummy column: {sorted(unmapped_codes)}")

# The coded source columns are no longer needed once their dummy columns exist.
encoded_source_columns = [
    'Sex (0=Female, 1=Male)',
    'Chest Pain Type (4 Categories)',
    'Fasting Blood Sugar > 120 mg/dL (1=Yes)',
    'Resting Electrocardiographic Results',
    'Exercise-Induced Angina (1=Yes)',
    'Slope of the Peak Exercise ST Segment',
    'Type of Thalassemia',
    'Number of Major Vessels Colored by Fluoroscopy'
]
df = df.drop(columns=encoded_source_columns)

# Collect every column whose distinct values are all within {0, 1} and store
# those columns with the 'category' dtype (optional).
binary_values = {0, 1}
dummy_cols = []
for name in df.columns:
    if set(df[name].unique()).issubset(binary_values):
        dummy_cols.append(name)
df[dummy_cols] = df[dummy_cols].astype('category')

# Show the first ten rows of the encoded DataFrame
df.head(10)


Shape: (303, 14)
Age: int64
Sex (0=Female, 1=Male): category (categorical)
Chest Pain Type (4 Categories): category (categorical)
Resting Blood Pressure (mm Hg): int64
Serum Cholesterol (mg/dL): int64
Fasting Blood Sugar > 120 mg/dL (1=Yes): category (categorical)
Resting Electrocardiographic Results: category (categorical)
Maximum Heart Rate Achieved: int64
Exercise-Induced Angina (1=Yes): category (categorical)
ST Depression Induced by Exercise: float64
Slope of the Peak Exercise ST Segment: category (categorical)
Number of Major Vessels Colored by Fluoroscopy: category (categorical)
Type of Thalassemia: category (categorical)
Presence of Heart Disease (1=Yes): category (categorical)


Unnamed: 0,Age,Resting Blood Pressure (mm Hg),Serum Cholesterol (mg/dL),Maximum Heart Rate Achieved,ST Depression Induced by Exercise,Presence of Heart Disease (1=Yes),Sex: Female,Sex: Male,Chest Pain: Typical Angina,Chest Pain: Atypical Angina,...,ST Slope: Upsloping,ST Slope: Flat,ST Slope: Downsloping,Thalassemia: Normal,Thalassemia: Fixed Defect,Thalassemia: Reversible Defect,Fluoroscopy: 0 Vessels,Fluoroscopy: 1 Vessel,Fluoroscopy: 2 Vessels,Fluoroscopy: 3 Vessels
0,63,145,233,150,2.3,1,0,1,0,0,...,1,0,0,0,1,0,1,0,0,0
1,37,130,250,187,3.5,1,0,1,0,0,...,1,0,0,0,0,1,1,0,0,0
2,41,130,204,172,1.4,1,1,0,0,1,...,0,0,1,0,0,1,1,0,0,0
3,56,120,236,178,0.8,1,0,1,0,1,...,0,0,1,0,0,1,1,0,0,0
4,57,120,354,163,0.6,1,1,0,1,0,...,0,0,1,0,0,1,1,0,0,0
5,57,140,192,148,0.4,1,0,1,1,0,...,0,1,0,0,1,0,1,0,0,0
6,56,140,294,153,1.3,1,1,0,0,1,...,0,1,0,0,0,1,1,0,0,0
7,44,120,263,173,0.0,1,0,1,0,1,...,0,0,1,0,0,0,1,0,0,0
8,52,172,199,162,0.5,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
9,57,150,168,174,1.6,1,0,1,0,0,...,0,0,1,0,0,1,1,0,0,0


In [None]:
!pip install wittgenstein

import wittgenstein as lw
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Single seed shared by the data split and the rule learner so that a
# Restart & Run All reproduces the same rules and metrics.
RANDOM_STATE = 42

# Define features and target
X = df.drop(columns=['Presence of Heart Disease (1=Yes)'])
y = df['Presence of Heart Disease (1=Yes)'].astype(int)

# Split into training and testing sets (30% held out for evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Train RIPPER model.
# - random_state: without it the learner is unseeded and the learned ruleset
#   can change from run to run.
# - pos_class=1: states explicitly that the rules describe the
#   "heart disease present" class instead of relying on inference.
ripper = lw.RIPPER(random_state=RANDOM_STATE)
ripper.fit(X_train, y_train, pos_class=1)

# Predict and evaluate
y_pred = ripper.predict(X_test)

print(f"✅ RIPPER Accuracy: {accuracy_score(y_test, y_pred)}\n")

print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

print("🧮 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n📜 Learned RIPPER Rules:")
print(ripper.ruleset_)




Collecting wittgenstein
  Downloading wittgenstein-0.3.4-py3-none-any.whl.metadata (11 kB)
Downloading wittgenstein-0.3.4-py3-none-any.whl (110 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.6/110.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wittgenstein
Successfully installed wittgenstein-0.3.4
✅ RIPPER Accuracy: 0.7111111111111111

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.89      0.71        36
           1       0.89      0.59      0.71        54

    accuracy                           0.71        90
   macro avg       0.74      0.74      0.71        90
weighted avg       0.77      0.71      0.71        90

🧮 Confusion Matrix:
[[32  4]
 [22 32]]

📜 Learned RIPPER Rules:
[[Thalassemia:ReversibleDefect=1^Fluoroscopy:0Vessels=1^STSlope:Flat=0] V [Exercise-InducedAngina:Yes=0^Fluoroscopy:0Vessels=1^Sex:Female=1] V [STDepressionInducedbyExercise=0.1-0.4]]


In [None]:
def find_matching_rule_safe(ruleset, instance_df):
    """Return the first rule of ``ruleset`` that covers ``instance_df``.

    Parameters
    ----------
    ruleset : object
        Object exposing a ``rules`` iterable; each rule must provide a
        ``covers(df)`` method.
    instance_df : pandas.DataFrame
        The instance(s) to look up, normally a single-row frame.

    Returns
    -------
    rule or str
        The first rule whose coverage of ``instance_df`` is non-empty, or the
        string ``"No rule matched (default rule used)"`` when none covers it.

    Notes
    -----
    A rule counts as matching when ``covers`` returns at least one row. The
    match must not depend on the *values* inside the covered rows: a covered
    instance whose first column happens to be 0 is still covered.
    """

    def _has_coverage(covered):
        # Expected case: ``covers`` returns the covered rows as a DataFrame,
        # so any remaining row means the rule applies.
        if isinstance(covered, pd.DataFrame):
            return len(covered) > 0
        # Fallback for a mask-like result (Series / array / list of booleans).
        return bool(np.asarray(covered).any())

    # Compare on plain objects rather than on the stored dtypes
    # (presumably so category-typed columns compare by value -- confirm).
    instance_df = instance_df.astype(object)

    # NOTE(review): conditions on discretized numeric features (e.g. a range
    # such as "0.1-0.4") may not match raw numeric values here -- confirm how
    # the rule's ``covers`` treats unbinned data.
    for rule in ruleset.rules:
        if _has_coverage(rule.covers(instance_df)):
            return rule
    return "No rule matched (default rule used)"


In [None]:
def explain_rule_readably(rule):
    """Render a learned rule as an ``IF ... THEN Heart Disease = Yes`` sentence.

    Parameters
    ----------
    rule : object or str
        Either an object exposing ``conds`` (each condition having ``feature``
        and ``val`` attributes) or the string returned when no rule matched.

    Returns
    -------
    str
        The string itself when ``rule`` is a string; otherwise the readable
        sentence built from the rule's conditions.

    Notes
    -----
    Dummy features are named ``"<group>: <label>"``. The condition value
    decides the wording:

    * value 1 -> ``"<group> = <label>"``
    * value 0 -> the opposite label for Yes/No and Female/Male groups,
      otherwise ``"<group> != <label>"``
    * any other value -> ``"<group>: <label> = <value>"``

    Features without a colon are rendered as ``"<feature> = <value>"``.
    """
    if isinstance(rule, str):
        return rule

    # Two-level groups: a 0 on one label is the same statement as the other label.
    complements = {"Yes": "No", "No": "Yes", "Female": "Male", "Male": "Female"}
    on_flags = ("1", "1.0", "True")
    off_flags = ("0", "0.0", "False")

    readable_conditions = []
    for cond in rule.conds:
        feature = str(cond.feature)
        value = cond.val

        if ":" not in feature:
            readable_conditions.append(f"{feature} = {value}")
            continue

        # Split on the first colon only, so labels may themselves contain ':'.
        base, label = (part.strip() for part in feature.split(":", 1))
        if base.startswith("Fluoroscopy"):
            # Separate the count from the word when the name has no space.
            label = label.replace("Vessel", " Vessel")
        label = " ".join(label.split())  # collapse repeated whitespace

        flag = str(value).strip()
        if flag in on_flags:
            readable_conditions.append(f"{base} = {label}")
        elif flag in off_flags:
            if label in complements:
                readable_conditions.append(f"{base} = {complements[label]}")
            else:
                readable_conditions.append(f"{base} != {label}")
        else:
            readable_conditions.append(f"{base}: {label} = {value}")

    return "IF " + " AND ".join(readable_conditions) + " THEN Heart Disease = Yes"


In [None]:
# Walk through the test set and report the first instance that is covered by
# an explicit rule rather than by the default.
for idx in range(len(X_test)):
    instance = X_test.iloc[[idx]]  # list indexer keeps a one-row DataFrame
    true_label = y_test.iloc[idx]
    predicted = ripper.predict(instance)[0]
    matched_rule = find_matching_rule_safe(ripper.ruleset_, instance)

    if matched_rule != "No rule matched (default rule used)":
        # One print call; sep="\n" puts each item on its own line.
        print(
            f"\n✅ Found instance with non-default rule at index #{idx}",
            f"✅ True label: {true_label}",
            f"🤖 Predicted:  {predicted}",
            "📜 Matched rule:",
            matched_rule,
            sep="\n",
        )
        break



✅ Found instance with non-default rule at index #0
✅ True label: 1
🤖 Predicted:  True
📜 Matched rule:
[Thalassemia:ReversibleDefect=1^Fluoroscopy:0Vessels=1^STSlope:Flat=0]


In [None]:
# Same scan as before, but the matched rule is shown both in its raw form and
# as a readable sentence.
for idx in range(len(X_test)):
    instance = X_test.iloc[[idx]]  # list indexer keeps a one-row DataFrame
    true_label = y_test.iloc[idx]
    predicted = ripper.predict(instance)[0]
    matched_rule = find_matching_rule_safe(ripper.ruleset_, instance)

    if matched_rule != "No rule matched (default rule used)":
        # Header and raw rule first ...
        print(
            f"\n✅ Found instance with non-default rule at index #{idx}",
            f"✅ True label: {true_label}",
            f"🤖 Predicted:  {predicted}",
            "📜 Matched rule (raw):",
            matched_rule,
            sep="\n",
        )
        # ... then the readable rendering, computed only after the raw output.
        print("\n📜 Matched rule (readable):")
        print(explain_rule_readably(matched_rule))
        break



✅ Found instance with non-default rule at index #0
✅ True label: 1
🤖 Predicted:  True
📜 Matched rule (raw):
[Thalassemia:ReversibleDefect=1^Fluoroscopy:0Vessels=1^STSlope:Flat=0]

📜 Matched rule (readable):
IF Thalassemia = Reversible Defect AND Fluoroscopy =  0  Vessels AND ST Slope = Flat THEN Heart Disease = Yes
