In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"]
data = pd.read_csv(url, names=columns)

# Preprocess the dataset
# Replace missing values with the median of the column
data = data.replace('?', pd.NA)
data = data.apply(pd.to_numeric, errors='coerce')
data = data.fillna(data.median())

# Binarize the target variable: presence (1) or absence (0) of heart disease
data['num'] = data['num'].apply(lambda x: 1 if x > 0 else 0)

# Split the data into features and target
X = data.drop('num', axis=1)
y = data['num']

# Encode categorical variables
label_encoder = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_encoder.fit_transform(X[column])

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [5]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np

def compute_mutual_info(X, y):
    """Compute mutual information between features and the target variable."""
    mutual_info = mutual_info_classif(X, y.astype(int))  # Ensure y is treated as discrete
    return mutual_info

def create_tan_structure(X, y):
    """Create the TAN structure based on mutual information."""
    n_features = X.shape[1]
    mutual_info = compute_mutual_info(X, y)

    # Create the adjacency matrix for the TAN structure
    adj_matrix = np.zeros((n_features, n_features))

    # Compute mutual information between features
    for i in range(n_features):
        for j in range(i + 1, n_features):
            adj_matrix[i, j] = mutual_info_classif(X[:, i].reshape(-1, 1), X[:, j].astype(int))  # Treat X[:, j] as discrete
            adj_matrix[j, i] = adj_matrix[i, j]

    # Select the maximum spanning tree using Prim's algorithm
    selected = [False] * n_features
    selected[0] = True
    edges = []

    for _ in range(n_features - 1):
        max_weight = -1
        max_edge = (-1, -1)
        for i in range(n_features):
            if selected[i]:
                for j in range(n_features):
                    if not selected[j] and adj_matrix[i, j] > max_weight:
                        max_weight = adj_matrix[i, j]
                        max_edge = (i, j)
        edges.append(max_edge)
        selected[max_edge[1]] = True

    return edges

tan_edges = create_tan_structure(X_train, y_train)
print("TAN structure edges:", tan_edges)


TAN structure edges: [(0, 7), (7, 6), (0, 3), (3, 2), (2, 8), (2, 12), (2, 4), (0, 11), (8, 9), (12, 1), (9, 10), (6, 5)]


In [6]:
from collections import defaultdict

def compute_conditional_probabilities(X, y, edges):
    """Compute conditional probabilities for the TAN structure."""
    n_features = X.shape[1]
    conditional_probs = defaultdict(dict)

    for (i, j) in edges:
        for val_i in [0, 1]:
            for val_j in [0, 1]:
                subset = X[(X[:, i] == val_i) & (X[:, j] == val_j)]
                conditional_probs[(i, j)][(val_i, val_j)] = len(subset) / len(X)

    return conditional_probs

conditional_probs = compute_conditional_probabilities(X_train, y_train, tan_edges)
print("Conditional probabilities:", conditional_probs)


Conditional probabilities: defaultdict(<class 'dict'>, {(0, 7): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (7, 6): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (0, 3): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (3, 2): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (2, 8): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (2, 12): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (2, 4): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (0, 11): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (8, 9): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (12, 1): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (9, 10): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}, (6, 5): {(0, 0): 0.0, (0, 1): 0.0, (1, 0): 0.0, (1, 1): 0.0}})


In [7]:
def infer_heart_disease(X_test, conditional_probs, tan_edges):
    """Infer the presence of heart disease using the TAN structure."""
    predictions = []

    for x in X_test:
        probs = {0: 1.0, 1: 1.0}

        for (i, j) in tan_edges:
            val_i = x[i]
            val_j = x[j]
            probs[0] *= conditional_probs[(i, j)].get((val_i, val_j), 1e-6)
            probs[1] *= conditional_probs[(i, j)].get((val_i, val_j), 1e-6)

        if probs[1] > probs[0]:
            predictions.append(1)
        else:
            predictions.append(0)

    return predictions

predictions = infer_heart_disease(X_test, conditional_probs, tan_edges)

# Evaluate the performance
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 47.54%
