In [59]:
import numpy as np
import pandas as pd

# Load the PlayTennis dataset into a DataFrame.
# NOTE(review): absolute path, presumably a mounted Google Drive folder in Colab —
# the notebook will only run where this exact location exists.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MachineLearning/Assignment7/PlayTennis.csv')
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [60]:
def entropy(S):
    """Return the Shannon entropy (in bits) of the labels in ``S``.

    Parameters
    ----------
    S : pandas.Series
        Observed class labels. Missing values are ignored.

    Returns
    -------
    float
        ``sum(-p * log2(p))`` over the relative frequency ``p`` of every
        distinct label that actually occurs; ``0`` when ``S`` holds no labels.
    """
    # normalize=True divides by the number of non-missing labels, so the
    # probabilities sum to 1 even when S contains NaN (dividing by len(S)
    # would silently under-weight every class in that case).
    probabilities = S.value_counts(normalize=True)
    # Categorical data can report unobserved categories with p == 0; skip
    # them because 0 * log2(0) evaluates to NaN instead of the limit value 0.
    return sum(-p * np.log2(p) for p in probabilities if p > 0)

In [61]:
def informationGain(S, F):
    """Return the information gain obtained by splitting ``S`` on ``F``.

    Parameters
    ----------
    S : pandas.Series
        Target labels.
    F : pandas.Series
        Feature values; used as a boolean mask on ``S`` (``S[F == value]``),
        so it must be index-aligned with ``S``.

    Returns
    -------
    float
        ``entropy(S)`` minus the weighted entropy of the partitions of ``S``
        induced by each distinct value of ``F``. Each partition is weighted
        by ``count / len(S)``.
    """
    total_entropy = entropy(S)
    # .items() yields (value, count) pairs directly, avoiding a second
    # label lookup into the counts for every feature value.
    remainder = sum(
        (count / len(S)) * entropy(S[F == value])
        for value, count in F.value_counts().items()
    )
    return total_entropy - remainder

In [62]:
def findBestAttribute(S, attributes, target='play'):
    """Pick the attribute with the highest information gain for ``target``.

    Prints the entropy of the target column and the gain of every candidate
    attribute as a side effect.

    Parameters
    ----------
    S : pandas.DataFrame
        Dataset containing the ``target`` column and every column named in
        ``attributes``.
    attributes : iterable of column labels
        Candidate columns to evaluate.
    target : column label, optional
        Column holding the class labels. Defaults to ``'play'``.

    Returns
    -------
    tuple
        ``(best_attr, max_gain)``. When ``attributes`` is empty (or no gain
        compares greater than -1) the result is ``(None, -1)``.
    """
    max_gain = -1
    best_attr = None
    total_entropy = entropy(S[target])
    print(f"Total entropy of the dataset for target {target}: {total_entropy:.4f}")
    for attr in attributes:
        gain = informationGain(S[target], S[attr])
        print(f"Information Gain of {attr}: {np.round(gain, 4)}")
        # Strict comparison: on a tie the attribute listed first wins.
        if gain > max_gain:
            max_gain = gain
            best_attr = attr
    return best_attr, max_gain

In [63]:
# Candidate feature columns; the remaining column 'play' is the target
# that findBestAttribute reads internally.
attributes = ['outlook', 'temp', 'humidity', 'windy']
# Evaluate every candidate on the full dataset (also prints each gain).
best_feature, max_gain= findBestAttribute(df, attributes)
print(f"Best attribute to split on is {best_feature} with information gain of {max_gain:.4f}")

Total entropy of the dataset for target play: 0.9403
Information Gain of outlook: 0.2467
Information Gain of temp: 0.0292
Information Gain of humidity: 0.1518
Information Gain of windy: 0.0481
Best attribute to split on is outlook with information gain of 0.2467


In [64]:
class DecisionTree:
    """Decision tree built by recursive information-gain splits on 'play'.

    After ``fit()`` the learned tree is stored in ``self.tree`` as nested
    dicts of the form ``{attribute: {value: subtree_or_label}}``; a leaf is
    the bare class label.
    """

    def __init__(self, data, attributes):
        """Store the training data and the candidate attributes.

        Parameters
        ----------
        data : pandas.DataFrame
            Training set; must contain a ``'play'`` column with the labels.
        attributes : list
            Column labels that may be used for splitting.
        """
        self.data = data
        self.attributes = attributes
        self.tree = {}  # replaced by the learned tree once fit() runs

    def build_tree(self, S, attributes):
        """Recursively build the (sub)tree for the rows in ``S``.

        Prints a trace of every split and leaf as a side effect.

        Parameters
        ----------
        S : pandas.DataFrame
            Rows reaching this node.
        attributes : list
            Attributes still available for splitting at this node.

        Returns
        -------
        dict or label
            A nested ``{attribute: {value: subtree}}`` dict, or a class
            label when the node is a leaf.
        """
        # Base cases
        if len(S['play'].unique()) == 1:  # Pure subset (all labels are the same)
            label = S['play'].iloc[0]
            # str() keeps the trace working for non-string labels (bool, int, ...).
            print(f"Reached leaf: {str(label).upper()} (Pure subset)")
            return label
        if len(attributes) == 0:  # No more attributes to split on
            label = S['play'].mode()[0]  # majority label
            print(f"Reached leaf: {str(label).upper()} (No more attributes)")
            return label
        # Find the best attribute to split on
        best_attr, max_gain = findBestAttribute(S, attributes)
        print(f"\nBest attribute to split on is '{best_attr}' with information gain of {max_gain:.4f}")
        # Compare against None explicitly: a falsy column label such as 0 or ''
        # is a legitimate attribute and must not be mistaken for "none found".
        if best_attr is None:
            return S['play'].mode()[0]

        # The attribute list for the children is the same for every branch,
        # so build it once instead of once per value.
        new_attributes = [attr for attr in attributes if attr != best_attr]

        # Create the subtree
        tree = {best_attr: {}}
        for value in S[best_attr].unique():
            subset = S[S[best_attr] == value]
            print(f"\nSplitting on '{best_attr}' = '{value}'")
            subtree = self.build_tree(subset, new_attributes)
            tree[best_attr][value] = subtree

        return tree

    def fit(self):
        """Build the tree from the stored data and keep it in ``self.tree``."""
        self.tree = self.build_tree(self.data, self.attributes)

In [65]:
# Train on the full dataset using the candidate attributes defined earlier.
decision_tree = DecisionTree(df, attributes)
# fit() prints a trace of every split and stores the result in decision_tree.tree.
decision_tree.fit()

Total entropy of the dataset for target play: 0.9403
Information Gain of outlook: 0.2467
Information Gain of temp: 0.0292
Information Gain of humidity: 0.1518
Information Gain of windy: 0.0481

Best attribute to split on is 'outlook' with information gain of 0.2467

Splitting on 'outlook' = 'sunny'
Total entropy of the dataset for target play: 0.9710
Information Gain of temp: 0.571
Information Gain of humidity: 0.971
Information Gain of windy: 0.02

Best attribute to split on is 'humidity' with information gain of 0.9710

Splitting on 'humidity' = 'high'
Reached leaf: NO (Pure subset)

Splitting on 'humidity' = 'normal'
Reached leaf: YES (Pure subset)

Splitting on 'outlook' = 'overcast'
Reached leaf: YES (Pure subset)

Splitting on 'outlook' = 'rainy'
Total entropy of the dataset for target play: 0.9710
Information Gain of temp: 0.02
Information Gain of humidity: 0.02
Information Gain of windy: 0.971

Best attribute to split on is 'windy' with information gain of 0.9710

Splitting on 

In [66]:
print("Decision Tree:")
# Bare last expression so the notebook displays the nested-dict tree.
decision_tree.tree

Decision Tree:


{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
  'overcast': 'yes',
  'rainy': {'windy': {False: 'yes', True: 'no'}}}}