In [1]:
import numpy as np
import pandas as pd

In [7]:
# Fifteen records, five per age band (Young, Middle, Old), stored column by
# column. Each list is laid out one age band per line, in that order.
data = {
    "AGE": ["Young"] * 5 + ["Middle"] * 5 + ["Old"] * 5,
    "JOB_STATUS": [
        False, False, True, True, False,
        False, False, True, False, False,
        False, False, True, True, False,
    ],
    "OWNS_HOUSE": [
        False, False, False, True, False,
        False, False, True, True, True,
        True, True, False, False, False,
    ],
    "CREDIT_RATING": [
        "Fair", "Good", "Good", "Fair", "Fair",
        "Fair", "Good", "Good", "Excellent", "Excellent",
        "Excellent", "Good", "Good", "Excellent", "Fair",
    ],
    # Yes/No label; used as the default target column by info_gain.
    "CLASS": [
        "No", "No", "Yes", "Yes", "No",
        "No", "No", "Yes", "Yes", "Yes",
        "Yes", "Yes", "Yes", "Yes", "No",
    ],
}
df = pd.DataFrame(data)
# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,AGE,JOB_STATUS,OWNS_HOUSE,CREDIT_RATING,CLASS
0,Young,False,False,Fair,No
1,Young,False,False,Good,No
2,Young,True,False,Good,Yes
3,Young,True,True,Fair,Yes
4,Young,False,False,Fair,No


In [8]:
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the values in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Class labels (anything ``np.unique`` accepts, e.g. a list or a
        pandas Series).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. A column with a single distinct value, or an empty
        column, yields ``0.0``.
    """
    _, counts = np.unique(target_col, return_counts=True)
    probs = counts / counts.sum()
    bits = -np.sum(probs * np.log2(probs))
    # Negating an all-zero sum yields -0.0, which would print as "-0.0000";
    # adding +0.0 turns negative zero into positive zero and leaves every
    # other value unchanged.
    return bits + 0.0

In [9]:
def info_gain(data, split_attribute, target_name="CLASS"):
    """Return the information gain of splitting ``data`` on ``split_attribute``.

    The gain is the entropy of the target column minus the size-weighted
    average entropy of the target within each partition produced by the
    distinct values of ``split_attribute``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the split column and the target column.
    split_attribute : label
        Column whose distinct values define the partitions.
    target_name : label, default "CLASS"
        Column holding the class labels.

    Returns
    -------
    float
        ``entropy(target) - sum(|partition| / |data| * entropy(partition))``.
    """
    total_entropy = entropy(data[target_name])
    n_rows = len(data)

    # One grouping pass instead of one boolean filter per distinct value.
    # dropna=False keeps rows with a missing split value as their own
    # partition; an equality filter never matches NaN, which would leave that
    # partition empty and count its rows as perfectly pure.
    partitions = data.groupby(split_attribute, dropna=False)[target_name]

    # Weighted entropy after the split
    weighted_entropy = np.sum([
        (len(labels) / n_rows) * entropy(labels)
        for _, labels in partitions
    ])

    # Information gain
    return total_entropy - weighted_entropy

In [10]:
# Score every candidate split column against the default target column.
attributes = ["AGE", "JOB_STATUS", "OWNS_HOUSE", "CREDIT_RATING"]
# Maps column name -> information gain, in the order listed above.
ig_values = dict((column, info_gain(df, column)) for column in attributes)

In [11]:
# The root is the attribute with the largest gain; on a tie the one that
# appears first in ig_values wins.
root_node = max(ig_values.items(), key=lambda pair: pair[1])[0]

In [12]:
# Print each attribute's gain to four decimal places, then the chosen root.
print("Information Gain for each attribute:")
for attr in ig_values:
    ig = ig_values[attr]
    print(f"{attr}: {ig:.4f}")

print(f"\nRoot node of the decision tree should be: {root_node}")

Information Gain for each attribute:
AGE: 0.0830
JOB_STATUS: 0.3237
OWNS_HOUSE: 0.4200
CREDIT_RATING: 0.3630

Root node of the decision tree should be: OWNS_HOUSE
