# In [2]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    Each internal node is stored as a tuple
    ``(feature_index, threshold, left_subtree, right_subtree)``; samples with
    ``sample[feature_index] <= threshold`` go left, all others go right.
    A leaf is stored as the bare class label.

    Features must be numeric (thresholds are midpoints between sorted unique
    values) and labels must be non-negative integers (majority voting uses
    ``np.bincount``).
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of split levels. ``None`` (the default)
                means the tree grows until every leaf is pure or no split
                yields a positive information gain.
        """
        self.max_depth = max_depth
        self.tree = None

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in *y* (smallest label wins ties)."""
        return np.argmax(np.bincount(y))

    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array *y*."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # Every probability comes from an observed label, so it is strictly
        # positive and log2 is always defined. No epsilon is needed, and
        # adding one would make the entropy of a pure node slightly negative.
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, X, y, feature_index, threshold):
        """Return the entropy reduction from splitting on one feature.

        Args:
            X: 2-D feature array, one row per sample.
            y: Label array aligned with the rows of *X*.
            feature_index: Column of *X* to split on.
            threshold: Rows with ``X[:, feature_index] <= threshold`` form the
                left partition; the rest form the right partition.

        Returns:
            Parent entropy minus the size-weighted entropy of both partitions.
        """
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask

        n = len(y)
        weighted_child_entropy = (
            (np.sum(left_mask) / n) * self.entropy(y[left_mask])
            + (np.sum(right_mask) / n) * self.entropy(y[right_mask])
        )
        return self.entropy(y) - weighted_child_entropy

    def find_best_split(self, X, y):
        """Search every feature and candidate threshold for the best split.

        Returns:
            ``(feature_index, threshold)`` of the split with the highest
            strictly positive information gain, or ``(None, None)`` when no
            split improves on the unsplit node.
        """
        best_gain = 0
        best_feature_index = None
        best_threshold = None

        for feature_index in range(X.shape[1]):
            unique_values = np.unique(X[:, feature_index])
            # Midpoints between consecutive sorted unique values guarantee
            # that both partitions are non-empty.
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2

            for threshold in thresholds:
                gain = self.information_gain(X, y, feature_index, threshold)

                # Strict comparison: on ties the first split found is kept.
                if gain > best_gain:
                    best_gain = gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def build_tree(self, X, y, depth):
        """Recursively grow the tree for the samples in *X* / *y*.

        Args:
            X: 2-D feature array for the samples reaching this node.
            y: Labels aligned with the rows of *X*.
            depth: Remaining number of split levels, or ``None`` for no limit.

        Returns:
            A class label (leaf) or a tuple
            ``(feature_index, threshold, left_subtree, right_subtree)``.
        """
        # Stop when the depth budget is exhausted or the node is pure.
        if depth == 0 or len(np.unique(y)) == 1:
            return self._majority_label(y)

        best_feature_index, best_threshold = self.find_best_split(X, y)

        # No split has positive gain: fall back to a majority-vote leaf.
        if best_feature_index is None:
            return self._majority_label(y)

        left_mask = X[:, best_feature_index] <= best_threshold
        right_mask = ~left_mask

        # ``None`` means "unlimited" and must be passed down unchanged;
        # subtracting from it would raise a TypeError.
        child_depth = None if depth is None else depth - 1

        left_subtree = self.build_tree(X[left_mask], y[left_mask], child_depth)
        right_subtree = self.build_tree(X[right_mask], y[right_mask], child_depth)

        return (best_feature_index, best_threshold, left_subtree, right_subtree)

    def fit(self, X, y):
        """Grow the tree from training data and store it in ``self.tree``.

        Args:
            X: 2-D array-like of numeric features, one row per sample.
            y: 1-D array-like of non-negative integer class labels.

        Raises:
            ValueError: If *y* is empty.
        """
        # Accept lists and other array-likes; the splitting code relies on
        # NumPy boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")

        self.tree = self.build_tree(X, y, self.max_depth)

    def predict_sample(self, sample, tree):
        """Return the label predicted for one *sample* by the (sub)tree *tree*."""
        # Anything that is not a tuple is a leaf holding the class label.
        if not isinstance(tree, tuple):
            return tree

        feature_index, threshold, left_subtree, right_subtree = tree

        if sample[feature_index] <= threshold:
            return self.predict_sample(sample, left_subtree)
        return self.predict_sample(sample, right_subtree)

    def predict(self, X):
        """Return an array with the predicted label for every row of *X*.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit().")
        return np.array([self.predict_sample(sample, self.tree) for sample in np.asarray(X)])

# Read the play-golf dataset from disk.
df = pd.read_csv("playgolf_data.csv")

# Replace every categorical column with integer codes. The single encoder is
# refit on each column in turn, so once the loop ends it only remembers the
# classes of the last column ('PlayGolf').
label_encoder = LabelEncoder()
for column in ('Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayGolf'):
    df[column] = label_encoder.fit_transform(df[column])

# Separate the target column from the feature matrix.
y = df['PlayGolf'].values
X = df.drop('PlayGolf', axis=1).values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the hand-written decision tree, limited to three levels of splits.
tree = DecisionTree(max_depth=3)
tree.fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


# Output: Accuracy: 0.6666666666666666
