In [2]:
import numpy as np
import pandas as pd
from data import load_heart_failure_dataset

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix, auc, roc_curve, f1_score


In [8]:
class DecisionTreeRegressor:
    """Minimal regression tree that picks splits by weighted child variance.

    The fitted tree is kept in ``tree_`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``. A leaf is the
    mean of the targets that reached it. Rows with
    ``x[feature_index] < threshold`` are routed left, every other row right.

    Parameters
    ----------
    max_depth : int, default=3
        Depth at which recursion stops and a leaf is emitted.
    """

    def __init__(self, max_depth=3):
        self.max_depth = max_depth
        self.tree_ = None  # set by fit(); None means "not fitted yet"

    def _variance(self, y):
        """Return ``np.var(y)`` (population variance)."""
        return np.var(y)

    def _split(self, X, y, feature_index, threshold):
        """Partition rows into those strictly below ``threshold`` and the rest.

        Returns ``(X_left, y_left, X_right, y_right)``.
        """
        left_mask = X[:, feature_index] < threshold
        right_mask = ~left_mask
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

    def _best_split(self, X, y):
        """Exhaustively search every feature and every observed value.

        Returns ``(feature_index, threshold)`` minimising the size-weighted
        variance of the two children, or ``(None, None)`` when no candidate
        leaves both children non-empty.
        """
        best_feat, best_thresh, best_var = None, None, float("inf")
        n_features = X.shape[1]

        for feat in range(n_features):
            # Slice the column once per feature; only y has to be partitioned
            # to score a candidate, so X is never copied inside the loop.
            column = X[:, feat]
            for t in np.unique(column):
                left_mask = column < t
                y_left, y_right = y[left_mask], y[~left_mask]
                # The smallest observed value always yields an empty left side.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                var = (len(y_left) * self._variance(y_left)
                       + len(y_right) * self._variance(y_right)) / len(y)
                if var < best_var:
                    best_feat, best_thresh, best_var = feat, t, var
        return best_feat, best_thresh

    # --- recursive build ---
    def _build(self, X, y, depth=0):
        """Recursively grow the tree; returns a leaf value or a split tuple."""
        if depth == self.max_depth or len(X) == 0:
            return np.mean(y)

        feat, thresh = self._best_split(X, y)
        if feat is None:
            # No split separates the rows (e.g. all feature values identical).
            return np.mean(y)

        X_left, y_left, X_right, y_right = self._split(X, y, feat, thresh)
        left_branch = self._build(X_left, y_left, depth + 1)
        right_branch = self._build(X_right, y_right, depth + 1)

        return (feat, thresh, left_branch, right_branch)

    # --- fit & predict ---
    def fit(self, X, y):
        """Grow the tree on ``X`` (2-D, rows are samples) and targets ``y``.

        Inputs are passed through ``np.asarray`` so nested lists and
        DataFrames work as well as NumPy arrays. Returns ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree_ = self._build(X, y)
        return self

    def _predict_one(self, x, tree):
        """Walk ``tree`` for a single row ``x`` and return the leaf value."""
        # Anything that is not a split tuple is a leaf value.
        if not isinstance(tree, tuple):
            return tree
        feat, thresh, left, right = tree

        if x[feat] < thresh:
            return self._predict_one(x, left)
        else:
            return self._predict_one(x, right)

    def predict(self, X):
        """Return one prediction per row of ``X`` as a NumPy array.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.tree_ is None:
            raise ValueError("DecisionTreeRegressor.predict called before fit")
        X = np.asarray(X)
        return np.array([self._predict_one(x, self.tree_) for x in X])

In [9]:
class MiniXGBoost:
    """Small gradient-boosting ensemble built from ``DecisionTreeRegressor``.

    Every round fits one tree to the current residuals and adds its
    prediction, scaled by ``learning_rate``, to a raw score that starts at
    zero. For ``task="classification"`` the raw score is squashed with a
    sigmoid and the residual is ``y - sigmoid(score)``; for
    ``task="regression"`` the residual is ``y - score``.

    Parameters
    ----------
    n_estimators : int, default=5
        Number of boosting rounds (trees).
    learning_rate : float, default=0.1
        Factor applied to each tree's contribution.
    max_depth : int, default=3
        Depth limit passed to every tree.
    task : {"regression", "classification"}, default="regression"
        Selects the residual and the output transform.

    Raises
    ------
    ValueError
        If ``task`` is not one of the two supported values.
    """

    _TASKS = ("regression", "classification")

    def __init__(self, n_estimators=5, learning_rate=0.1, max_depth=3, task="regression"):
        # A misspelled task used to fall through to regression silently.
        if task not in self._TASKS:
            raise ValueError(f"task must be one of {self._TASKS}, got {task!r}")
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.task = task

    def _sigmoid(self, x):
        """Logistic function ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

    def fit(self, X, y):
        """Fit ``n_estimators`` trees sequentially on the residuals.

        Any trees left over from a previous ``fit`` are discarded first, so
        refitting yields the same ensemble as fitting a fresh instance.
        Returns ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from scratch: without this, a second fit() would stack new
        # trees on top of the old ones and predict() would sum both sets.
        self.trees = []
        y_pred = np.zeros(y.shape, dtype=float)

        for _ in range(self.n_estimators):
            if self.task == "classification":
                y_pred_proba = self._sigmoid(y_pred)
                residuals = y - y_pred_proba
            else:
                residuals = y - y_pred

            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)

            update = tree.predict(X)
            y_pred += self.learning_rate * update
            self.trees.append(tree)

        return self

    def predict(self, X, as_proba=False):
        """Sum the scaled tree outputs for each row of ``X``.

        Regression returns the raw sum. Classification returns
        ``sigmoid(sum)`` when ``as_proba`` is true, otherwise 0/1 labels
        obtained by thresholding that probability at 0.5.
        """
        X = np.asarray(X)
        y_pred = np.zeros(X.shape[0])
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)

        if self.task == "classification":
            y_pred = self._sigmoid(y_pred)
            if not as_proba:
                y_pred = (y_pred >= 0.5).astype(int)

        return y_pred


In [3]:
def load_data():
    """Fetch the heart-failure data through ``load_heart_failure_dataset``.

    The loader is called with the dataset id
    ``"tan5577/heart-failure-dataset"`` and the name ``"HeartDisease"``; the
    three objects it yields are handed back in the same order.

    Returns
    -------
    tuple
        ``(df, X, y)`` exactly as produced by the loader. Presumably the full
        frame, the feature columns and the target column; confirm against
        ``data.load_heart_failure_dataset``.
    """
    frame, features, target = load_heart_failure_dataset(
        "tan5577/heart-failure-dataset",
        "HeartDisease",
    )
    return frame, features, target

In [4]:
# Unpack the three objects returned by load_data() into notebook-level names
# (presumably full frame, features, target — confirm against the loader).
df, X, y = load_data()

In [5]:
# Keep only the five columns used as model inputs. This rebinds X, so the
# full feature set is only recoverable by calling load_data() again.
X = X[["ChestPainType","MaxHR", "ExerciseAngina", "ST_Slope", "Oldpeak"]]

In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Self-contained training run: the load, feature selection and split below
# repeat the earlier cells so this cell does not depend on their state.
df, X, y = load_data()

X = X[["ChestPainType","MaxHR", "ExerciseAngina", "ST_Slope", "Oldpeak"]]

# 200 boosting rounds of depth-5 trees, trained as a binary classifier.
model = MiniXGBoost(n_estimators=200, learning_rate=0.1, max_depth=5, task="classification")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The model indexes its inputs as NumPy arrays, hence the .values conversions.
model.fit(X_train.values, y_train.values)
# Probabilities feed ROC-AUC and the threshold sweep; labels use the model's
# built-in 0.5 cut-off.
y_test_probs = model.predict(X_test.values, as_proba=True)
y_test_labels = model.predict(X_test.values, as_proba=False)

print("Accuracy:", accuracy_score(y_test, y_test_labels))
print("ROC-AUC:", roc_auc_score(y_test, y_test_probs))

# Sweep 100 evenly spaced cut-offs in [0, 1]: binarise the test probabilities
# once per cut-off, then score every resulting label vector.
# NOTE(review): the cut-off is selected on the same test split it is scored
# on, so the reported "best" accuracy is optimistic — consider a validation
# split for the selection.
thresholds = np.linspace(0, 1, 100)
thresholded_preds = [y_test_probs >= cutoff for cutoff in thresholds]
accuracies = [accuracy_score(y_test, preds) for preds in thresholded_preds]
precisions = [
    precision_score(y_test, preds, zero_division=0) for preds in thresholded_preds
]
recalls = [recall_score(y_test, preds) for preds in thresholded_preds]

# argmax returns the first maximum, i.e. the lowest cut-off among ties.
best_idx = np.argmax(accuracies)
best_t = thresholds[best_idx]
best_acc = np.max(accuracies)
f1 = f1_score(y_test, thresholded_preds[best_idx])
print("Best threshold:", best_t)
print("Best accuracy:", best_acc)
print("F1 score:", f1)

# Plot accuracy, precision and recall against the decision threshold and mark
# the accuracy-maximising threshold with a vertical red line.
plt.figure(figsize=(8,5))
plt.plot(thresholds, accuracies, label='Accuracy', lw=2)
plt.plot(thresholds, precisions, label='Precision', linestyle='--')
plt.plot(thresholds, recalls, label='Recall', linestyle=':')
plt.axvline(best_t, color='r', linestyle='--', label=f'Best Threshold = {best_t:.2f}')
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Metrics vs. Decision Threshold")
plt.legend()
plt.grid(alpha=0.3)
plt.show()


# Default tick labels for the confusion-matrix heatmap, in the order
# scikit-learn sorts the class labels.
# NOTE(review): assumes class 0 means "normal" and class 1 "abnormal" — confirm.
target_names = ["normal", "abnormal"]
def plot_confusion_matrix(Y_true, Y_pred, class_names=None):
    """Draw the confusion matrix of ``Y_pred`` against ``Y_true`` as a heatmap.

    Parameters
    ----------
    Y_true, Y_pred : array-like
        True and predicted labels, forwarded to
        ``sklearn.metrics.confusion_matrix``.
    class_names : sequence of str, optional
        Tick labels for both axes. Defaults to the module-level
        ``target_names`` so existing two-argument calls behave as before.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is returned, so a
        call on the last line of a cell prints no extra repr.
    """
    if class_names is None:
        class_names = target_names

    cm = confusion_matrix(Y_true, Y_pred)

    # Explicit Axes instead of the implicit pyplot "current figure".
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)

    # Set plot labels and title
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.set_title('Confusion Matrix Heatmap')
    plt.show()

def plot_roc_auc_curve(y_true, y_prob):
    """Plot the ROC curve for scores ``y_prob`` against labels ``y_true``.

    The area under the curve is computed from the same points and shown in
    the legend. A dashed grey diagonal marks the chance line.

    Parameters
    ----------
    y_true : array-like
        True binary labels, forwarded to ``sklearn.metrics.roc_curve``.
    y_prob : array-like
        Scores or probabilities for the positive class.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # The third return value (the thresholds) is not needed here; binding it
    # to a local named `thresholds` only shadowed the notebook-level array.
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    # Explicit Axes instead of the implicit pyplot "current figure".
    _, ax = plt.subplots()
    ax.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
    ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc='lower right')
    plt.show()

# Confusion matrix for the test labels predicted at the model's 0.5 cut-off.
plot_confusion_matrix(y_test, y_test_labels)