In [1]:
import pandas as pd
import numpy as np
import kagglehub

In [2]:
# Download the "tareqjoy/trainplaytennis" dataset through kagglehub. The
# returned value is used below as the directory prefix for PlayTennis.csv.
path = kagglehub.dataset_download("tareqjoy/trainplaytennis")

Downloading from https://www.kaggle.com/api/v1/datasets/download/tareqjoy/trainplaytennis?dataset_version_number=1...


100%|██████████| 314/314 [00:00<00:00, 540kB/s]

Extracting files...





In [67]:
# Training rows come from the CSV inside the downloaded dataset directory.
train_dataset = pd.read_csv(f"{path}/PlayTennis.csv")

# Hand-written evaluation rows: each tuple is paired with the column names
# to form one record (same records, same order as a literal list of dicts).
data = [
    dict(zip(("Outlook", "Temperature", "Humidity", "Wind", "Play Tennis"), values))
    for values in (
        ("Overcast", "Mild", "Normal", "Weak", "Yes"),
        ("Sunny", "Mild", "Normal", "Weak", "Yes"),
        ("Rain", "Mild", "Normal", "Weak", "Yes"),
        ("Rain", "Cool", "High", "Strong", "No"),
    )
]
test_dataset = pd.DataFrame(data)

print(train_dataset.head())

    Outlook Temperature Humidity    Wind Play Tennis
0     Sunny         Hot     High    Weak          No
1     Sunny         Hot     High  Strong          No
2  Overcast         Hot     High    Weak         Yes
3      Rain        Mild     High    Weak         Yes
4      Rain        Cool   Normal    Weak         Yes


In [71]:
# Name of the label column. The entropy, tree-building and evaluation
# functions in this notebook read it as a global.
target_column = 'Play Tennis'

In [80]:
def calculate_entropy(data, target=None):
    """Return the Shannon entropy (in bits) of the class labels in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows whose label column is measured.
    target : str, optional
        Name of the label column. When omitted, the notebook-level
        ``target_column`` is used, so existing one-argument calls behave
        as before.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes that occur in ``data``.
        The result is ``0`` when ``data`` has no rows.
    """
    label_column = target_column if target is None else target
    total_count = len(data)
    class_counts = data[label_column].value_counts()
    entropy = 0

    for count in class_counts:
        # A categorical label column lists unused categories with a count of
        # 0, and 0 * log2(0) evaluates to NaN. Skip them: an absent class
        # contributes nothing to the entropy.
        if count == 0:
            continue
        probability = count / total_count
        entropy -= probability * np.log2(probability)

    return entropy

In [None]:
def subset_entropy(subset):
    """Entropy (in bits) of the ``target_column`` labels within ``subset``.

    Classes reported with a count of zero are left out of the sum.
    Returns ``0`` when no class has a non-zero count.
    """
    n_rows = len(subset)
    label_counts = subset[target_column].value_counts()

    # Filter before dividing so zero-count classes never reach log2.
    shares = [n / n_rows for n in label_counts.tolist() if n != 0]

    result = 0
    for share in shares:
        result -= share * np.log2(share)
    return result

In [None]:
def information_gain(data, split_feature):
    """Entropy reduction obtained by partitioning ``data`` on ``split_feature``.

    Computed as the entropy of ``data`` minus the size-weighted sum of the
    entropies of the partitions, one partition per distinct value of
    ``split_feature``.
    """
    baseline = calculate_entropy(data)
    n_rows = len(data)
    column = data[split_feature]

    remainder = 0
    for level in column.unique():
        branch = data[column == level]
        # Weight each partition's entropy by its share of the rows.
        remainder += (len(branch) / n_rows) * subset_entropy(branch)

    return baseline - remainder

In [None]:
def choose_best_feature(data):
    """Return the non-target column with the highest information gain.

    Ties go to the column that appears first in ``data.columns``. Returns
    ``None`` when no column's gain exceeds the initial threshold of -1
    (for example, when there are no feature columns).
    """
    winner, winner_gain = None, -1

    for name in data.columns:
        if name == target_column:
            continue
        score = information_gain(data, name)
        # Strict comparison keeps the earliest column on equal gains.
        if score > winner_gain:
            winner, winner_gain = name, score

    return winner

In [None]:
def build_decision_tree(data):
    """Recursively grow a decision tree from ``data``.

    Returns either a bare label (leaf) or a one-key dict of the form
    ``{feature: {feature_value: subtree, ...}}``.
    """
    labels = data[target_column]

    # Pure node: every remaining row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Only the label column is left: fall back to the first mode.
    if len(data.columns) == 1:
        return labels.mode()[0]

    split_on = choose_best_feature(data)
    branches = {}
    for level in data[split_on].unique():
        rows = data[data[split_on] == level]
        # The chosen feature is dropped so it cannot be selected again
        # further down this branch.
        branches[level] = build_decision_tree(rows.drop(columns=[split_on]))

    return {split_on: branches}


In [None]:
def predict(tree, instance):
    """Walk ``tree`` using the feature values in ``instance``.

    ``tree`` is either a leaf label or a one-key dict
    ``{feature: {value: subtree}}``. Returns the leaf label that is reached,
    or ``None`` when ``instance`` holds a value for which the current node
    has no branch.
    """
    node = tree
    while isinstance(node, dict):
        split_feature = next(iter(node))
        branches = node[split_feature]
        observed = instance[split_feature]
        if observed not in branches:
            return None
        node = branches[observed]
    return node


In [86]:
def evaluate_model(tree, test_data):
    """Return the fraction of rows in ``test_data`` that ``tree`` labels correctly.

    Parameters
    ----------
    tree : dict or label
        Decision tree in the nested-dict form accepted by ``predict``.
    test_data : pandas.DataFrame
        Rows to score. Must contain the ``target_column`` label column.

    Returns
    -------
    float
        Correct predictions divided by the number of rows. An empty
        ``test_data`` yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total = len(test_data)
    if total == 0:
        # Nothing to score, so avoid dividing by zero below.
        return 0.0

    correct_predictions = 0
    for _, row in test_data.iterrows():
        instance = row.to_dict()
        # Remove the label so only feature values are passed to predict.
        actual = instance.pop(target_column)
        # predict returns None for unseen feature values, which counts as a miss.
        if predict(tree, instance) == actual:
            correct_predictions += 1

    return correct_predictions / total

In [None]:
# Build the tree from the full training frame and print the nested dict.
decision_tree = build_decision_tree(train_dataset)
print("Decision Tree:", decision_tree)

Decision Tree: {'Outlook': {'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}, 'Overcast': 'Yes', 'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}}


In [88]:
# Spot-check the first hand-written test row: split off its label, then
# compare that label with the tree's prediction.
test_instance_row = test_dataset.iloc[0].to_dict()
# Use the shared label-column constant (as evaluate_model does) rather than
# repeating the literal column name.
actual_label = test_instance_row.pop(target_column)
prediction = predict(decision_tree, test_instance_row)
print("Test instance:", test_instance_row)
print("Actual label:", actual_label)
print("Predicted label:", prediction)


Test instance: {'Outlook': 'Overcast', 'Temperature': 'Mild', 'Humidity': 'Normal', 'Wind': 'Weak'}
Actual label: Yes
Predicted label: Yes


In [89]:
# Score the tree on every row of the hand-written test frame.
accuracy = evaluate_model(decision_tree, test_dataset)
print("Model accuracy on test dataset:", accuracy)

Model accuracy on test dataset: 1.0


In [41]:
# %pip installs into the environment of the running kernel. The version is
# pinned to the release recorded in this cell's output.
%pip install anytree==2.12.1

Collecting anytree
  Downloading anytree-2.12.1-py3-none-any.whl.metadata (8.1 kB)
Downloading anytree-2.12.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anytree
Successfully installed anytree-2.12.1


In [90]:
from anytree import Node, RenderTree
from anytree.exporter import DotExporter
import json

def build_tree(data, parent=None):
    """Mirror a nested-dict tree as anytree ``Node`` objects under ``parent``.

    Each dict key becomes a node whose children are built from the key's
    value. Any non-dict value becomes a single leaf node.
    """
    if not isinstance(data, dict):
        # Leaf: attach the bare value directly to the parent.
        Node(data, parent=parent)
        return

    for key, value in data.items():
        build_tree(value, Node(key, parent=parent))

# Attach the whole decision tree under a single labelled root node.
root = Node("Decision Tree")
build_tree(decision_tree, root)

# Each rendered row unpacks into (pre, fill, node). Only the prefix and the
# node's name are printed.
for pre, fill, node in RenderTree(root):
    print(f"{pre}{node.name}")

Decision Tree
└── Outlook
    ├── Sunny
    │   └── Humidity
    │       ├── High
    │       │   └── No
    │       └── Normal
    │           └── Yes
    ├── Overcast
    │   └── Yes
    └── Rain
        └── Wind
            ├── Weak
            │   └── Yes
            └── Strong
                └── No
