In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import accuracy_score

In [4]:
# Load the PlayTennis dataset; the relative path is resolved against the
# kernel's working directory.
df=pd.read_csv("PlayTennis.csv")

In [5]:
# Preview the data: head(15) shows at most the first 15 rows.
df.head(15)

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [6]:
# Features are the first four columns (positions 0-3); the target is the
# last column. Selection is positional, so it depends on the CSV column order.
X=df.iloc[:,0:4]
y=df.iloc[:,-1]

In [7]:
# Display the feature frame.
X

Unnamed: 0,Outlook,Temperature,Humidity,Wind
0,Sunny,Hot,High,Weak
1,Sunny,Hot,High,Strong
2,Overcast,Hot,High,Weak
3,Rain,Mild,High,Weak
4,Rain,Cool,Normal,Weak
5,Rain,Cool,Normal,Strong
6,Overcast,Cool,Normal,Strong
7,Sunny,Mild,High,Weak
8,Sunny,Cool,Normal,Weak
9,Rain,Mild,Normal,Weak


In [8]:
# Display the target labels.
y

0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: PlayTennis, dtype: object

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle (and therefore every result derived from this split) reproducible
# across kernel restarts; without it each run trains and evaluates on
# different rows.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    ``y`` is any sized iterable of hashable labels. An empty input has no
    classes to sum over and yields 0.
    """
    # Tally how often each distinct label occurs.
    tally = {}
    for item in y:
        tally[item] = tally.get(item, 0) + 1

    n_labels = len(y)
    bits = 0

    # Accumulate -p * log2(p) over the observed classes.
    for occurrences in tally.values():
        share = occurrences / n_labels
        bits -= share * math.log2(share)

    return bits

In [11]:
# Entropy of the target labels over the full dataset (before any split).
ent=entropy(y)
print(ent)

0.9402859586706311


In [12]:
# def information_gain(X, y, feature_index, feature_values):
#     total_entropy = entropy(y)
#     #feature_values=set(row[feature_index] for row in X)
#     weighted_entropy = 0
#     total_samples = len(y)
    
#     for value in feature_values:
#         subset_indices = [i for i in range(total_samples) if X.iloc[i][feature_index] == value]
#         subset_labels = [y.iloc[i] for i in subset_indices]
#         subset_weight = len(subset_indices) / total_samples
#         weighted_entropy += subset_weight * entropy(subset_labels)

#     information_gain = total_entropy - weighted_entropy
#     return information_gain

In [13]:
def information_gain(y, feature):
    """Return the reduction in label entropy obtained by splitting on ``feature``.

    ``y`` and ``feature`` are expected to be aligned pandas Series: the code
    calls ``feature.unique()`` and indexes ``y`` with a boolean mask built
    from ``feature``. The result is ``entropy(y)`` minus the size-weighted
    entropy of the label subsets, one subset per distinct feature value.
    """
    n_total = len(y)
    conditional_entropy = 0

    for level in feature.unique():
        # Labels of the rows whose feature equals this level.
        branch_labels = y[feature == level]
        conditional_entropy += len(branch_labels) / n_total * entropy(branch_labels)

    return entropy(y) - conditional_entropy


In [14]:
# ig= calculate_information_gain(X_train, y_train, 1)
# print(ig)

In [15]:
# def find_best_split(X, y):
#     num_features = len(X.iloc[0])
#     best_feature_index = -1
#     best_information_gain = 0

#     for feature_index in range(num_features):
#         feature_values = set(row[feature_index] for row in X)
#         information_gain = calculate_information_gain(X, y, feature_index, feature_values)

#         if information_gain > best_information_gain:
#             best_information_gain = information_gain
#             best_feature_index = feature_index

#     return best_feature_index


In [16]:
# idx= find_best_split(X_train, y_train)
# print(idx)

In [17]:
# def split_data(X, y, feature_index, feature_values):
#     left_data, left_labels, right_data, right_labels = [], [], [], []
#     for i in range(len(X)):
#         if X.iloc[i][feature_index] in feature_values:
#             left_data.append(X.iloc[i])
#             left_labels.append(y.iloc[i])
#         else:
#             right_data.append(X.iloc[i])
#             right_labels.append(y.iloc[i])
#     return left_data, left_labels, right_data, right_labels

In [18]:
# def build_decision_tree(X, y):
#     if len(set(y)) == 1:
#         return y.iloc[0]

#     if len(X.values[0]) == 1:
#         majority_label = max(set(y), key=y.count)
#         return majority_label

#     best_feature_index = find_best_split(X, y)
#     best_feature_values = set(row[best_feature_index] for row in X)

#     left_data, left_labels, right_data, right_labels = split_data(X, y, best_feature_index, best_feature_values)

#     left_subtree = build_decision_tree(left_data, left_labels)
#     right_subtree = build_decision_tree(right_data, right_labels)

#     return {'feature': best_feature_index, 'left': left_subtree, 'right': right_subtree}


In [19]:
class Node:
    """One node of the decision tree; all fields are stored exactly as given.

    As used by ``build_decision_tree`` in this notebook, a leaf has
    ``feature=None`` and keeps its class label in ``value``, while an internal
    node keeps the split column in ``feature`` and a dict of child nodes
    (keyed by feature value) in ``value``.
    """

    def __init__(self, feature=None, value=None, entropy=None, information_gain=None, left=None, right=None):
        # What the node splits on / what it holds.
        self.feature, self.value = feature, value
        # Split diagnostics supplied by the caller.
        self.entropy, self.information_gain = entropy, information_gain
        # Optional child slots; kept verbatim.
        self.left, self.right = left, right

In [20]:
def build_decision_tree(X, y):
    """Recursively grow a multiway decision tree using information gain.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns. Each column is split on its distinct values, and the
        column used at a node is dropped from the data passed to its children.
    y : pandas.Series
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    Node
        A leaf (``feature is None``, ``value`` is the class label) or an
        internal node whose ``feature`` is the chosen column name, whose
        ``value`` is a dict mapping each observed feature value to a child
        ``Node``, and whose ``entropy`` / ``information_gain`` record the
        label entropy at the node and the gain of the chosen split.

    Raises
    ------
    IndexError
        If ``y`` is empty on the initial call (there is no label to return).
    """
    node_entropy = entropy(y)

    if node_entropy == 0:
        # All instances share one class: create a leaf node.
        return Node(value=y.iloc[0])

    if X.empty:
        # No feature columns left: create a leaf with the majority class.
        return Node(value=y.value_counts().idxmax())

    # Find the feature with the highest information gain. Starting below any
    # possible gain guarantees a feature is chosen even when every gain is 0.
    best_feature = None
    max_info_gain = -1

    for feature_name in X.columns:
        current_info_gain = information_gain(y, X[feature_name])
        if current_info_gain > max_info_gain:
            max_info_gain = current_info_gain
            best_feature = feature_name

    # Build one child subtree per distinct value of the chosen feature
    # (a multiway split, not a binary left/right one).
    children = {}
    for value in X[best_feature].unique():
        mask = X[best_feature] == value
        subset_y = y[mask]
        if subset_y.empty:
            # Missing values (NaN/None) show up in unique() but never compare
            # equal, so their partition is empty; recursing on it would fail
            # at y.iloc[0]. Skip it instead.
            continue
        subset_X = X[mask].drop(columns=[best_feature])
        children[value] = build_decision_tree(subset_X, subset_y)

    if not children:
        # Every value of the chosen feature was missing: nothing to split on,
        # so fall back to a majority-class leaf.
        return Node(value=y.value_counts().idxmax())

    return Node(feature=best_feature, entropy=node_entropy, information_gain=max_info_gain, value=children)

In [21]:
# Fit the tree on the training split only; the test rows stay unseen.
tree = build_decision_tree(X_train, y_train)

In [22]:
def _majority_leaf_label(node):
    """Return the most frequent class label among the leaves below ``node``.

    Each leaf counts as one vote. Ties go to the label reached first in a
    depth-first walk that follows children in insertion order. Returns None
    if the subtree contains no leaves.
    """
    from collections import Counter  # local import keeps this cell self-contained

    votes = Counter()
    pending = [node]
    while pending:
        current = pending.pop()
        if current.feature is None:
            votes[current.value] += 1
        else:
            # Reversed so the first child is popped (visited) first.
            pending.extend(reversed(list(current.value.values())))

    ranked = votes.most_common(1)
    return ranked[0][0] if ranked else None


def predict(node, instance):
    """Predict the class label of ``instance`` by walking the tree from ``node``.

    Parameters
    ----------
    node : Node
        Root of the (sub)tree. A node with ``feature is None`` is a leaf whose
        ``value`` is the label; otherwise ``value`` maps feature values to
        child nodes.
    instance : mapping or pandas.Series
        Supports ``instance[feature_name]`` for every feature used in the tree.

    Returns
    -------
    The label of the leaf that is reached. If the instance carries a feature
    value with no matching branch (a category not seen while building the
    tree), the majority label among the leaves below the current node is
    returned rather than the internal dict of child nodes.
    """
    if node.feature is None:
        return node.value

    value = instance[node.feature]
    if value in node.value:
        return predict(node.value[value], instance)

    # Unseen category: node.value is a dict of children here, not a label,
    # so returning it would hand a non-label object to the caller.
    return _majority_leaf_label(node)

In [23]:
# Predict one label per test row; iterrows() yields (index, row) pairs and
# only the row is needed.
Predictions = [predict(tree, instance) for _, instance in X_test.iterrows()]

In [24]:
# Store the score under its own name. Assigning it to `accuracy_score` would
# replace the imported sklearn function with a float, so running this cell a
# second time would fail with "object is not callable".
accuracy = accuracy_score(y_test, Predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [25]:
# from sklearn import tree
# tree.plot_tree(Predictions)

In [26]:
# import graphviz
# # DOT data
# dot_data = tree.export_graphviz(Predictions, out_file=None, 
#                                 feature_names=['Outlook','Temperature','Humidity','Wind'],  
#                                 class_names=['Yes','No'],
#                                 filled=True)

# # Draw graph
# graph = graphviz.Source(dot_data, format="png") 
# graph


In [27]:
# text_representation = tree.export_text(Predictions)
# print(text_representation)