In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the Play Tennis dataset from a CSV file in the working directory.
data = pd.read_csv('PlayTennis.csv')
# Print the column / dtype / non-null summary, then display the first rows
# as the cell's output.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Outlook      14 non-null     object
 1   Temperature  14 non-null     object
 2   Humidity     14 non-null     object
 3   Wind         14 non-null     object
 4   Play Tennis  14 non-null     object
dtypes: object(5)
memory usage: 692.0+ bytes


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [3]:
# Hand-written category -> integer code table for the four feature columns.
# The codes agree with the encoded rows displayed after the LabelEncoder cell.
# NOTE(review): the name `map` shadows Python's built-in map() for every later
# cell, and nothing in the cells shown here reads this dict -- consider
# renaming or removing it once no external user of the name remains.
map = {
    # Outlook
    "Sunny": 2,
    "Overcast": 0,
    "Rain": 1,
    # Temperature
    "Hot": 1,
    "Mild": 2,
    "Cool": 0,
    # Humidity
    "High": 0,
    "Normal": 1,
    # Wind
    "Weak": 1,
    "Strong": 0,
}

In [4]:
# Keep the original string names so encoded values can be decoded later.
# feature_names: every column label except the last one.
feature_names = data.columns[:-1]
# target_names: the distinct values of the last column, sorted by np.unique.
target_names = np.unique(data.iloc[:, -1].values)
print(feature_names)
print(target_names)

Index(['Outlook', 'Temperature', 'Humidity', 'Wind'], dtype='object')
['No' 'Yes']


In [5]:
# Data preprocessing
# The data.info() summary printed after loading reports 14 non-null entries in
# each of the 5 columns, so for this file the call below drops nothing.
data = data.dropna()  # Remove any rows with missing values

In [6]:
# Converting categorical data to numerical using LabelEncoder.
# A separate encoder is fitted for every column and stored in `encoders`,
# keyed by column name, so any column's integer codes can be turned back into
# the original strings with encoders[column].inverse_transform(...).
# (Re-fitting one shared encoder would keep only the last column's mapping.)
encoders = {}
for column in data.columns:
    # Fresh encoder per column; after the loop `le` is the encoder of the
    # last column (the target), exactly as a shared encoder would have been.
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    encoders[column] = le
data.head()


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1


In [7]:
# Splitting the dataset into features and target:
# X holds every column except the last, y holds the last column.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [8]:
# Defining the Node class for the decision tree
class Node:
    """One node of the binary decision tree.

    Attributes:
        feature: column index tested at this node (None by default).
        threshold: value the feature is compared against (None by default).
        left: child node (None by default).
        right: child node (None by default).
        value: payload stored on the node (None by default).

    All five attributes are stored exactly as passed; no validation is done.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description.
        self.feature, self.threshold = feature, threshold
        # Child nodes.
        self.left, self.right = left, right
        # Stored value.
        self.value = value

def entropy(y):
    """Return the Shannon entropy (in bits) of the label sequence ``y``.

    Labels may be any values ``np.unique`` can sort (non-negative or negative
    integers, strings, ...), not only the non-negative integers that
    ``np.bincount`` accepts.

    Args:
        y: 1-D array-like of class labels.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the distinct labels; 0.0 for an
        empty input.
    """
    y = np.asarray(y)
    if y.size == 0:
        # No samples means no uncertainty; also avoids dividing by zero.
        return 0.0
    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 is always defined.
    _, counts = np.unique(y, return_counts=True)
    ps = counts / y.size
    return -np.sum(ps * np.log2(ps))

def information_gain(X, y, threshold):
    """Information gain of splitting ``y`` by comparing ``X`` with ``threshold``.

    Samples with ``X < threshold`` form the left group and samples with
    ``X >= threshold`` form the right group.

    Args:
        X: array of values for a single feature, aligned with ``y``.
        y: array of class labels.
        threshold: split point.

    Returns:
        Parent entropy minus the size-weighted entropies of the two groups,
        or 0 when either group is empty.
    """
    before_split = entropy(y)
    below = X < threshold
    at_or_above = X >= threshold
    y_left, y_right = y[below], y[at_or_above]
    n_left, n_right = len(y_left), len(y_right)
    # A split that leaves one side empty separates nothing.
    if n_left == 0 or n_right == 0:
        return 0
    total = len(y)
    weighted_left = entropy(y_left) * n_left / total
    weighted_right = entropy(y_right) * n_right / total
    return before_split - (weighted_left + weighted_right)

# Find the best feature and threshold for splitting the data
def find_best_split(X, y):
    """Search every (feature, threshold) pair for the largest information gain.

    Candidate thresholds for a feature are the distinct values found in that
    column of ``X``.

    Args:
        X: 2-D array, one row per sample and one column per feature.
        y: array of class labels aligned with the rows of ``X``.

    Returns:
        A ``(feature_index, threshold)`` tuple for the first pair with the
        strictly largest gain, or ``(None, None)`` when no pair has a gain
        greater than 0.
    """
    top_gain = 0
    chosen = (None, None)
    for col in range(X.shape[1]):
        column = X[:, col]
        for candidate in np.unique(column):
            score = information_gain(column, y, candidate)
            # Strict comparison: ties keep the earlier candidate.
            if score > top_gain:
                top_gain = score
                chosen = (col, candidate)
    return chosen

def build_tree(X, y, depth=0, max_depth=None):
    """Recursively build a binary decision tree.

    Args:
        X: 2-D array-like of features, one row per sample.
        y: 1-D array-like of class labels aligned with the rows of ``X``.
        depth: depth of the node being built (0 for the root).
        max_depth: when not None, nodes at this depth become leaves that
            predict the majority label.

    Returns:
        The root ``Node`` of the (sub)tree.

    Raises:
        ValueError: if ``y`` is empty; previously this surfaced as an
            ``IndexError`` from the majority-vote lookup.
    """
    # Accept plain lists as well as arrays; the boolean-mask indexing below
    # needs NumPy arrays.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(y) == 0:
        raise ValueError("build_tree requires at least one training example")

    # Pure node: every sample has the same label.
    if len(set(y)) == 1:
        return Node(value=y[0])

    def majority_leaf():
        # Leaf predicting the most frequent label among the current samples.
        return Node(value=Counter(y).most_common(1)[0][0])

    if max_depth is not None and depth >= max_depth:
        return majority_leaf()

    feature, threshold = find_best_split(X, y)
    # No split with positive gain exists.
    if feature is None or threshold is None:
        return majority_leaf()

    left_indices = X[:, feature] < threshold
    right_indices = X[:, feature] >= threshold
    left = build_tree(X[left_indices], y[left_indices], depth + 1, max_depth)
    right = build_tree(X[right_indices], y[right_indices], depth + 1, max_depth)
    return Node(feature=feature, threshold=threshold, left=left, right=right)


In [9]:
# Building the tree from the full X / y arrays. max_depth is left at its
# default (None), so build_tree's depth check never cuts the recursion short.
tree = build_tree(X, y)

In [10]:
import graphviz

def plot_tree(tree, feature_names, class_names):
    """Render a decision tree as a ``graphviz.Digraph``.

    Args:
        tree: root node; each node exposes ``value``, ``feature``,
            ``threshold``, ``left`` and ``right``.
        feature_names: sequence indexed by ``node.feature`` giving the label
            of an internal node.
        class_names: sequence indexed by ``node.value`` giving the label of a
            leaf.

    Returns:
        The populated ``graphviz.Digraph``. Graph node ids are ``str(id(node))``.
    """
    dot = graphviz.Digraph()

    def build_graph(node):
        # `dot` comes from the enclosing scope.
        node_id = str(id(node))
        if node.value is not None:
            # Leaf: label with the class name. str() lets non-string names
            # (e.g. integer class labels) be passed as labels.
            dot.node(node_id, str(class_names[node.value]))
            return
        dot.node(node_id, str(feature_names[node.feature]))
        # Emit both subtrees first, then the two edges from this node.
        build_graph(node.left)
        build_graph(node.right)
        dot.edge(node_id, str(id(node.left)), '< ' + str(node.threshold))
        dot.edge(node_id, str(id(node.right)), '>= ' + str(node.threshold))

    build_graph(tree)
    return dot

# Plotting the tree. feature_names / target_names are the original string
# labels captured before encoding, so the rendered nodes show readable names.
dot = plot_tree(tree, feature_names, target_names)
# Writes decision_tree.png; cleanup=True asks graphviz to remove the
# intermediate source file after rendering.
dot.render('decision_tree', format='png', cleanup=True)

'decision_tree.png'

In [11]:
def predict_example(x, tree):
    """Walk the tree for a single sample and return the value of the leaf reached.

    Args:
        x: indexable sample; ``x[node.feature]`` is compared with
            ``node.threshold`` at each internal node.
        tree: node to start from. A node whose ``value`` is not None is a leaf.

    Returns:
        The ``value`` stored on the leaf that the sample ends up in.
    """
    node = tree
    # Descend until a leaf: go left when the feature is below the threshold,
    # right otherwise.
    while node.value is None:
        node = node.left if x[node.feature] < node.threshold else node.right
    return node.value

# Example usage: classify one row of the encoded feature matrix.
sample_data = X[2]  # Assuming X is the feature data
prediction = predict_example(sample_data, tree)
# prediction is used as an index into target_names to print the original label.
print(f"The prediction for the sample data {sample_data} is: {target_names[prediction]}")

The prediction for the sample data [0 1 0 1] is: Yes


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Example usage to predict on the entire dataset
# NOTE(review): the tree was built from this same X / y, so every score below
# is a training-set score, not an estimate of performance on unseen data.
# train_test_split is imported in the first cell but no cell shown here uses
# it -- consider evaluating on a held-out split.
y_pred = [predict_example(x, tree) for x in X]

# Calculate evaluation metrics
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, average='binary')
recall = recall_score(y, y_pred, average='binary')
f1 = f1_score(y, y_pred, average='binary')
conf_matrix = confusion_matrix(y, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")


Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
Confusion Matrix:
[[5 0]
 [0 9]]
