In [3]:
import numpy as np
import pandas as pd

In [4]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class Decision_Tree:
    def __init__(self, max_depth=10, min_samples_split=5):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def entropy(self, y):
        counts = np.bincount(y.ravel())
        probs = counts[counts > 0] / len(y)
        return -np.sum(probs * np.log2(probs))

    def information_gain(self, X_column, y, threshold):
        parent_entropy = self.entropy(y)

        left_idx = X_column < threshold
        right_idx = ~left_idx

        if left_idx.sum() == 0 or right_idx.sum() == 0:
            return 0

        n = len(y)
        n_left = left_idx.sum()
        n_right = right_idx.sum()

        left_entropy = self.entropy(y[left_idx])
        right_entropy = self.entropy(y[right_idx])

        child_entropy = (n_left/n) * left_entropy + (n_right/n) * right_entropy

        return parent_entropy - child_entropy

    def best_split(self, X, y, indices):
        best_gain = -1
        best_feature = None
        best_threshold = None

        X_sub = X[indices]
        y_sub = y[indices]

        n_features = X.shape[1]

        for feature in range(n_features):
            values = X_sub[:, feature]
            thresholds = np.unique(values)

            for threshold in thresholds:
                gain = self.information_gain(values, y_sub, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold, best_gain

    def buildTree(self, X, y, indices, depth):
        y_sub = y[indices]

        if depth >= self.max_depth or len(indices) < self.min_samples_split or len(np.unique(y_sub)) == 1:
            return Node(value=self.majority_vote(y_sub))

        feature, threshold, gain = self.best_split(X, y, indices)

        if gain <= 0:
            return Node(value=self.majority_vote(y_sub))

        X_sub = X[indices]

        left_idx = indices[X_sub[:, feature] < threshold]
        right_idx = indices[X_sub[:, feature] >= threshold]

        left_child = self.buildTree(X, y, left_idx, depth+1)
        right_child = self.buildTree(X, y, right_idx, depth+1)

        return Node(feature=feature, threshold=threshold,
                    left=left_child, right=right_child)

    def majority_vote(self, y):
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X, y):
        indices = np.arange(len(X))
        self.root = self.buildTree(X, y, indices, 0)

    def predict_one(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] < node.threshold:
            return self.predict_one(x, node.left)
        else:
            return self.predict_one(x, node.right)

    def predict(self, X):
        return np.array([self.predict_one(x, self.root) for x in X])

In [None]:
df = pd.read_csv("..\synthetic_lifestyle_disease_transformed.csv")

target = df.iloc[:,1].values
features = df.iloc[:,1:-1].values

x_train = features[:400]
y_train = target[:400]
x_test = features[400:]
y_test = target[400:]

FileNotFoundError: [Errno 2] No such file or directory: '../synthetic_lifestyle_disease_transformed.csv'

In [112]:
model = Decision_Tree(max_depth=10)
model.fit(x_train,y_train)

In [113]:
y_pred = model.predict(x_test)
mse = np.mean((y_pred - y_test) ** 2)
print("accuracy: ",(1 - mse) * 100)

accuracy:  61.31087847064178


In [103]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=15)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
mse = np.mean((y_pred - y_test) ** 2)
print("accuracy: ",(1 - mse) * 100)

accuracy:  59.717796995903505
