In [21]:
import numpy as np
import pandas as pd
import pprint

raw_data = """outlook,temp,humidity,windy,play
sunny,hot,high,false,no
sunny,hot,high,true,no
overcast,hot,high,false,yes
rainy,mild,high,false,yes
rainy,cool,normal,false,yes
rainy,cool,normal,true,no
overcast,cool,normal,true,yes
sunny,mild,high,false,no
sunny,cool,normal,false,yes
rainy,mild,normal,false,yes
sunny,mild,normal,true,yes
overcast,mild,high,true,yes
overcast,hot,normal,false,yes
rainy,mild,high,true,no"""

# Persist the raw CSV text to disk so it can be loaded like any other dataset file
csv_path = "PlayTennis.csv"
with open(csv_path, "w") as csv_file:
    csv_file.write(raw_data)

# Load the PlayTennis dataset back into a DataFrame and display it
df = pd.read_csv(csv_path)
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [16]:
# Steps

# 1. Get the entropy of the entire dataset
# 2. Function that calculates the entropy of each attribute within the dataset
# 3. Function to calculate the information gain of any given attribute
# 4. Find the attribute with the largest information gain
# 5. Iterate until the desired decision tree is built

# Step 1
def get_df_ent(df):
  """Compute the Shannon entropy (in bits) of the values in `df`.

  Parameters
  ----------
  df : pandas Series (the notebook passes a single column such as df['play'])
      The values whose distribution is measured. Anything exposing
      `value_counts()` works.

  Returns
  -------
  float
      Entropy in bits: 0.0 for empty or single-valued input, 1.0 for two
      equally likely values.
  """
  # Count how often each distinct value occurs (value_counts skips missing values)
  counts = df.value_counts()

  # Categorical data can report unused categories with a count of 0; they
  # contribute nothing to the entropy and would break the logarithm below.
  counts = counts[counts > 0]

  # Normalise by the number of *counted* values so the probabilities always
  # sum to 1, even when the input contains NaNs that value_counts dropped.
  total = counts.sum()
  if total == 0:
    return 0.0
  probs = counts / total

  # All probabilities are strictly positive here, so no epsilon is required.
  # (Adding one made the entropy of a pure column come out slightly negative.)
  ent_val = np.sum(probs * np.log2(1.0 / probs))
  return float(ent_val)

# Entropy of the target column over the full dataset
target_col = df['play']
get_df_ent(target_col)

0.9402859586706304

In [17]:
# Step 2
def get_attr_ent(df, attr):
  """Weighted entropy of the 'play' column after splitting `df` on `attr`.

  For every distinct value of `attr`, the entropy of 'play' within the
  matching rows is weighted by the fraction of rows that carry that value;
  the weighted entropies are then summed.

  Parameters
  ----------
  df : pandas DataFrame containing `attr` and a 'play' column.
  attr : name of the column to split on.

  Returns
  -------
  The summed weighted entropy (0 when `attr` has no values at all).
  """
  n_rows = len(df)

  def weighted_entropy(val):
    # Rows in which the attribute takes this particular value
    part = df[df[attr] == val]
    part_entropy = get_df_ent(part['play'])
    # Scale by the share of the dataset that falls into this partition
    return (len(part) / n_rows) * part_entropy

  # Add up the contributions in order of first appearance of each value
  return sum(weighted_entropy(val) for val in df[attr].unique())

# Report the weighted entropy obtained by splitting on each non-target column
for col in df.columns:
  if col == 'play':
    continue
  print(f"'{col}':", get_attr_ent(df, col))

'outlook': 0.6935361388961914
'temp': 0.9110633930116756
'humidity': 0.7884504573082889
'windy': 0.892158928262361


In [18]:
# Step 3
def get_info_gain(df, attr):
  """Information gain obtained by splitting `df` on `attr`.

  Defined as the entropy of the 'play' column over all rows minus the
  weighted entropy of 'play' after partitioning the rows by `attr`.

  Parameters
  ----------
  df : pandas DataFrame containing `attr` and a 'play' column.
  attr : name of the candidate split column.

  Returns
  -------
  The difference between the two entropies.
  """
  # Entropy before the split, entropy remaining after it
  entropy_before = get_df_ent(df['play'])
  entropy_after = get_attr_ent(df, attr)
  return entropy_before - entropy_after

# Report the information gain of every non-target column
for col in df.columns:
  if col == 'play':
    continue
  print(f"'{col}':", get_info_gain(df, col))

'outlook': 0.246749819774439
'temp': 0.029222565658954758
'humidity': 0.15183550136234147
'windy': 0.04812703040826938


In [19]:
# Step 4
def get_highest_attr(df):
  """Return the name of the attribute column with the largest information gain.

  Every column except the target column 'play' is a candidate, regardless of
  where 'play' sits in the column order. When several attributes tie, the
  one that appears first in `df.columns` wins.

  Parameters
  ----------
  df : pandas DataFrame with a 'play' column and at least one other column.

  Returns
  -------
  The column name of the best attribute to split on.

  Raises
  ------
  ValueError
      If `df` has no columns other than 'play'.
  """
  # Select candidates by name rather than by position, so the target column
  # is never scored as an attribute and no real attribute is skipped.
  candidates = [col for col in df.columns if col != 'play']
  if not candidates:
    raise ValueError("no attribute columns left to split on")

  # Find the attribute with the highest information gain (best split)
  return max(candidates, key=lambda attr: get_info_gain(df, attr))

# Attribute that should become the root of the tree
best_attribute = get_highest_attr(df)
best_attribute

'outlook'

In [20]:
# Step 5
# Helper functions
def isPure(sub_df):
  """Return True when every row of `sub_df` has the same 'play' value."""
  distinct_classes = sub_df['play'].unique()
  return len(distinct_classes) == 1

def subset(df, node, val):
  """Return the rows of `df` whose `node` column equals `val`, re-indexed from 0."""
  matches = df[node] == val
  selected = df.loc[matches]
  return selected.reset_index(drop=True)

def get_mode(sub_df):
  """Return the most frequent 'play' value in `sub_df` (first mode on ties)."""
  modes = sub_df['play'].mode()
  return modes[0]

# Tree build
def build_tree(df, tree=None):
  """Recursively build a decision tree for the 'play' column as nested dicts.

  The result has the shape `{attribute: {value: subtree_or_class}}`: each
  level splits on the attribute chosen by `get_highest_attr`, and each branch
  is either a class label (leaf) or another dict of the same shape.

  Parameters
  ----------
  df : pandas DataFrame with a 'play' column and at least one attribute column.
  tree : optional dict to add the root split to; a new dict is created when
      omitted.

  Returns
  -------
  dict
      The (possibly newly created) tree dictionary.
  """
  # Initialise
  if tree is None:
    tree = {}

  # Find the best/highest attribute to split on
  best_attr = get_highest_attr(df)
  tree[best_attr] = {}

  # Iterate over each unique value of the best attribute
  for val in df[best_attr].unique():

    # The chosen attribute is constant inside this subset, so remove it: it can
    # never separate the rows further, and keeping it lets the recursion pick
    # it again forever when rows with identical attributes disagree on 'play'.
    sub_df = subset(df, best_attr, val).drop(columns=[best_attr])
    remaining_attrs = [col for col in sub_df.columns if col != 'play']

    if isPure(sub_df) or not remaining_attrs:
      # Leaf: a single class, or nothing left to split on -> majority class
      tree[best_attr][val] = get_mode(sub_df)
    else:
      # Still mixed and attributes remain: recurse on the reduced subset
      tree[best_attr][val] = build_tree(sub_df)

  return tree

# Build the decision tree for the whole dataset and pretty-print the nested dict
tree = build_tree(df)
printer = pprint.PrettyPrinter()
printer.pprint(tree)

{'outlook': {'overcast': 'yes',
             'rainy': {'windy': {False: 'yes', True: 'no'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
