In [1]:
import pandas as pd #for manipulating the csv data
import numpy as np #for mathematical calculation

# Load the training table; later cells refer to it by the name `train_data_m`.
train_data_m = pd.read_excel("input.xlsx") # read the Excel workbook "input.xlsx" into a DataFrame
train_data_m.head() # preview the first rows of the dataset


Unnamed: 0,age,income,student,credit_rating,buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes


In [2]:
def calc_total_entropy(train_data, label, class_list):
    """Return the Shannon entropy (base 2) of the label column of ``train_data``.

    Parameters
    ----------
    train_data : DataFrame holding the rows to measure.
    label : name of the column that contains the class of each row.
    class_list : iterable of class values to account for.

    Returns
    -------
    The sum over ``class_list`` of ``-p * log2(p)``, where ``p`` is the share
    of rows whose label equals the class. Classes that do not occur in
    ``train_data`` contribute 0, and an empty ``train_data`` yields 0, which is
    the same convention ``calc_entropy`` uses.
    """
    total_row = train_data.shape[0] #the total size of the dataset
    total_entr = 0

    if total_row == 0: #no rows: avoid dividing by zero, entropy is taken as 0
        return total_entr

    for c in class_list: #for each class in the label
        total_class_count = train_data[train_data[label] == c].shape[0] #number of rows of the class
        if total_class_count == 0:
            #0 * log2(0) would evaluate to NaN; an absent class contributes nothing
            continue
        class_probability = total_class_count/total_row
        total_entr += - class_probability*np.log2(class_probability) #adding the class entropy to the total

    return total_entr

In [3]:
def calc_entropy(feature_value_data, label, class_list):
    """Entropy (base 2) of the label column within ``feature_value_data``.

    Each class in ``class_list`` contributes ``-p * log2(p)``, with ``p`` the
    fraction of rows carrying that class; classes with no rows contribute
    nothing. The result is 0 when no listed class occurs in the data.
    """
    n_rows = len(feature_value_data)
    result = 0

    for cls in class_list:
        hits = int((feature_value_data[label] == cls).sum()) #rows labelled with cls
        if hits == 0:
            continue #absent class: nothing to add
        share = hits/n_rows #probability of the class
        result += - share * np.log2(share)

    return result

In [4]:
def calc_info_gain(feature_name, train_data, label, class_list):
    """Information gain obtained by partitioning ``train_data`` on ``feature_name``.

    The gain is the entropy of the whole dataset (``calc_total_entropy``) minus
    the size-weighted sum of the entropies (``calc_entropy``) of the partitions
    formed by each distinct value of the feature.
    """
    n_rows = len(train_data)
    weighted_entropy = 0.0

    for value in train_data[feature_name].unique(): #distinct values, in order of appearance
        partition = train_data[train_data[feature_name] == value] #rows having this value
        weight = len(partition)/n_rows #share of the dataset in this partition
        weighted_entropy += weight * calc_entropy(partition, label, class_list)

    baseline = calc_total_entropy(train_data, label, class_list)
    return baseline - weighted_entropy

In [5]:
def find_most_informative_feature(train_data, label, class_list):
    """Return the name of the feature column with the highest information gain.

    Every column except ``label`` is scored with ``calc_info_gain`` and its
    score is printed. On ties the earliest column wins. ``None`` is returned
    when no column scores above the initial threshold of -1 (for example when
    there are no feature columns).
    """
    best_feature, best_gain = None, -1

    for candidate in train_data.columns.drop(label): #all columns but the label
        gain = calc_info_gain(candidate, train_data, label, class_list)
        print(f"Feature: {candidate}, Info Gain: {gain}")  # progress output, one line per feature
        if gain > best_gain: #strictly better only, so the first maximum is kept
            best_gain, best_feature = gain, candidate

    return best_feature

In [6]:
def generate_sub_tree(feature_name, train_data, label, class_list):
    """Build one tree node for ``feature_name`` and strip the rows it resolves.

    For every value of the feature, the branch is set to a class from
    ``class_list`` when all rows with that value carry that class, and those
    rows are removed from the returned dataset. Otherwise the branch is set to
    the placeholder "?" and its rows are kept for further expansion.

    Returns a tuple ``(node, remaining_data)``; the caller's DataFrame is not
    modified, only the local reference is rebound to filtered copies.
    """
    node = {}
    value_sizes = train_data[feature_name].value_counts(sort=False) #row count per feature value

    for value, size in value_sizes.items():
        rows_for_value = train_data[train_data[feature_name] == value]
        #classes that account for every row of this value
        matching = [cls for cls in class_list
                    if (rows_for_value[label] == cls).sum() == size]

        if not matching:
            node[value] = "?" #mixed classes: branch still has to be expanded
            continue

        node[value] = matching[-1] #pure branch becomes a leaf
        train_data = train_data[train_data[feature_name] != value] #resolved rows are dropped

    return node, train_data

In [7]:
def make_tree(root, prev_feature_value, train_data, label, class_list):
    """Recursively grow the decision tree stored in the nested dict ``root``.

    Parameters
    ----------
    root : dict that is updated in place with the new node.
    prev_feature_value : branch value of ``root`` under which the new node is
        attached, or ``None`` when the node is the top of the tree.
    train_data : DataFrame with the rows that reach this node.
    label : name of the class column.
    class_list : class values, passed on to the helper functions.

    Nothing is returned; the result is the mutation of ``root``. An empty
    ``train_data`` leaves ``root`` untouched.

    A selected feature that has a single value on every row cannot separate
    rows whose labels differ, and recursing on it would repeat the same call
    forever. Such a feature is dropped and the selection is retried. When no
    feature can be selected, the branch becomes a leaf holding the most
    frequent label (or ``root`` is left unchanged at the top of the tree).
    """
    if train_data.shape[0] == 0: #no rows left to split
        return

    labels_are_mixed = train_data[label].nunique() > 1
    while True:
        max_info_feature = find_most_informative_feature(train_data, label, class_list) #most informative feature
        if max_info_feature is None: #nothing left to split on: fall back to the majority class
            if prev_feature_value is not None:
                root[prev_feature_value] = train_data[label].mode()[0]
            return
        if not labels_are_mixed or train_data[max_info_feature].nunique(dropna=False) > 1:
            break #the feature really partitions the rows (or they are already pure)
        #constant feature on mixed labels: splitting would make no progress
        train_data = train_data.drop(columns=[max_info_feature])

    tree, train_data = generate_sub_tree(max_info_feature, train_data, label, class_list) #tree node and updated dataset

    if prev_feature_value is not None: #add to intermediate node of the tree
        root[prev_feature_value] = dict()
        root[prev_feature_value][max_info_feature] = tree
        next_root = root[prev_feature_value][max_info_feature]
    else: #add to root of the tree
        root[max_info_feature] = tree
        next_root = root[max_info_feature]

    for node, branch in list(next_root.items()): #snapshot, since recursion rewrites the branches
        if branch == "?": #if it is expandable
            feature_value_data = train_data[train_data[max_info_feature] == node] #using the updated dataset
            make_tree(next_root, node, feature_value_data, label, class_list) #recursive call with updated dataset

In [8]:
def id3(train_data_m, label):
    """Build a decision tree for ``train_data_m`` and return it as a nested dict.

    The tree is grown by ``make_tree`` on a copy of the input, using the
    distinct values of the ``label`` column as the class list.
    """
    working_copy = train_data_m.copy() #keep the caller's frame out of the recursion
    classes = working_copy[label].unique() #distinct classes of the label
    decision_tree = {}
    make_tree(decision_tree, None, working_copy, label, classes) #None marks the top of the tree
    return decision_tree

In [9]:
# Grow the ID3 tree with 'buys_computer' as the label column and print the nested dict.
tree = id3(train_data_m, 'buys_computer')
print(tree)

Feature: age, Info Gain: 0.24674981977443933
Feature: income, Info Gain: 0.02922256565895487
Feature: student, Info Gain: 0.15183550136234159
Feature: credit_rating, Info Gain: 0.04812703040826949
Feature: age, Info Gain: 0.0
Feature: income, Info Gain: 0.5709505944546686
Feature: student, Info Gain: 0.9709505944546686
Feature: credit_rating, Info Gain: 0.01997309402197489
Feature: age, Info Gain: 0.0
Feature: income, Info Gain: 0.01997309402197489
Feature: student, Info Gain: 0.01997309402197489
Feature: credit_rating, Info Gain: 0.9709505944546686
{'age': {'youth': {'student': {'no': 'no', 'yes': 'yes'}}, 'middle_aged': 'yes', 'senior': {'credit_rating': {'fair': 'yes', 'excellent': 'no'}}}}


In [10]:
# Section banner: the cells that follow build a tree using the Gini index.
print("CART- GINI INDEX")

CART- GINI INDEX


In [11]:
def calc_gini(data, label):
    """Gini impurity of the ``label`` column of ``data``.

    Starts from 1.0 and subtracts the squared share of each class that occurs
    in the column; a frame without rows therefore yields 1.0.
    """
    n_rows = len(data)  # denominator for the class shares
    impurity = 1.0

    for occurrences in data[label].value_counts():  # one count per distinct class
        impurity -= (occurrences / n_rows) ** 2

    return impurity


In [12]:
def calc_gini_split(data, feature_name, label):
    """Size-weighted Gini impurity of the partitions induced by ``feature_name``.

    The rows are grouped by each distinct value of the feature; every group's
    impurity (``calc_gini``) is weighted by the group's share of the rows and
    the weighted impurities are summed.
    """
    n_rows = len(data)
    column = data[feature_name]
    weighted_impurity = 0.0

    for level in column.unique():  # distinct values, in order of appearance
        part = data[column == level]  # rows taking this value
        weighted_impurity += (len(part) / n_rows) * calc_gini(part, label)

    return weighted_impurity


In [13]:
def find_best_split(data, label):
    """Return the feature column whose split has the lowest weighted Gini.

    Every column except ``label`` is scored with ``calc_gini_split`` and the
    score is printed. On ties the earliest column wins; ``None`` is returned
    when there is no feature column to score.
    """
    winner, lowest = None, float('inf')

    for candidate in data.columns.drop(label):  # all columns but the label
        score = calc_gini_split(data, candidate, label)
        print(f"Feature: {candidate}, Gini: {score}")  # progress output, one line per feature
        if score < lowest:  # strictly lower only, so the first minimum is kept
            winner, lowest = candidate, score

    return winner

In [14]:
def generate_tree(data, label, max_depth=None, depth=0):
    """Recursively build a Gini-based decision tree as a nested dict.

    Parameters
    ----------
    data : DataFrame with the rows that reach this node.
    label : name of the class column.
    max_depth : maximum depth of the tree, or ``None`` for no limit. ``0`` is
        a real limit: the call returns a leaf immediately.
    depth : depth of the current node; callers normally leave the default.

    Returns
    -------
    Either a leaf (the most frequent label of ``data``) or a dict of the form
    ``{feature: {value: subtree_or_leaf, ...}}``.

    A best feature that has a single value on every row cannot partition the
    rows, and recursing on it would repeat the same call forever. Such a
    feature is dropped and the search is retried. When no feature is left, the
    most frequent label is returned as a leaf.
    """
    # `is not None` (rather than truthiness) so that max_depth=0 is honoured
    depth_exhausted = max_depth is not None and depth >= max_depth
    if data[label].nunique() == 1 or depth_exhausted:
        return data[label].mode()[0]  # Return the majority class

    # Find the best feature to split on using Gini, skipping constant columns
    while True:
        best_feature = find_best_split(data, label)
        if best_feature is None:  # no feature left: settle for the majority class
            return data[label].mode()[0]
        if data.empty or data[best_feature].nunique(dropna=False) > 1:
            break
        data = data.drop(columns=[best_feature])  # constant column: no progress possible

    # Create a subtree for each value of the best feature
    tree = {best_feature: {}}

    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]  # subset where feature equals this value
        tree[best_feature][value] = generate_tree(subset, label, max_depth, depth + 1)

    return tree

In [15]:
# Example usage: build the Gini-based tree from train_data_m with 'buys_computer' as the label column
tree = generate_tree(train_data_m, 'buys_computer')
print(tree)


Feature: age, Gini: 0.34285714285714286
Feature: income, Gini: 0.44047619047619047
Feature: student, Gini: 0.3673469387755103
Feature: credit_rating, Gini: 0.42857142857142855
Feature: age, Gini: 0.48
Feature: income, Gini: 0.2
Feature: student, Gini: 0.0
Feature: credit_rating, Gini: 0.4666666666666667
Feature: age, Gini: 0.48
Feature: income, Gini: 0.4666666666666667
Feature: student, Gini: 0.4666666666666667
Feature: credit_rating, Gini: 0.0
{'age': {'youth': {'student': {'no': 'no', 'yes': 'yes'}}, 'middle_aged': 'yes', 'senior': {'credit_rating': {'fair': 'yes', 'excellent': 'no'}}}}
