# 20220905 | Decision Trees | Naïve Bayes

In [1]:
from math import prod

import numpy as np
import pandas as pd

import yaml

## The Dataset

In [2]:
def I(*args):
    """Return the entropy (in bits) of a distribution given as class counts.

    Each positional argument is the number of records in one class. The
    result is ``-sum(p_i * log2(p_i))`` with ``p_i = count_i / sum(counts)``.

    Classes with a zero count contribute nothing (the limit of
    ``p * log2(p)`` as ``p -> 0`` is 0), and an empty or all-zero
    distribution has entropy 0.
    """
    total = sum(args)
    if total == 0:
        # No records at all: avoid dividing by zero.
        return 0

    ans = 0
    for arg in args:
        if arg == 0:
            # np.log2(0) is -inf and -inf * 0 is nan; skip the empty class.
            continue
        p = arg / total
        ans -= np.log2(p) * p
    return ans

In [3]:
# Human-readable labels for the integer codes stored in the data frame.
age = dict(enumerate(("<=30", "31..40", ">40"), start=1))
income = dict(enumerate(("high", "medium", "low"), start=1))
credit_rating = dict(enumerate(("fair", "excellent"), start=1))

# The data set written one record per row, then transposed into one list per
# column. The last column (buys_computer) is the class label.
df = pd.DataFrame(
    dict(
        zip(
            ("age", "income", "student", "credit_rating", "buys_computer"),
            map(
                list,
                zip(
                    (1, 1, False, 1, False),
                    (1, 1, False, 2, False),
                    (2, 1, False, 1, True),
                    (3, 2, False, 1, True),
                    (3, 3, True, 1, True),
                    (3, 3, True, 2, False),
                    (2, 3, True, 2, True),
                    (1, 2, False, 1, False),
                    (1, 3, True, 1, True),
                    (3, 2, True, 1, True),
                    (1, 2, True, 2, True),
                    (2, 2, False, 2, True),
                    (2, 1, True, 1, True),
                    (3, 2, False, 2, False),
                ),
            ),
        )
    )
)
df

Unnamed: 0,age,income,student,credit_rating,buys_computer
0,1,1,False,1,False
1,1,1,False,2,False
2,2,1,False,1,True
3,3,2,False,1,True
4,3,3,True,1,True
5,3,3,True,2,False
6,2,3,True,2,True
7,1,2,False,1,False
8,1,3,True,1,True
9,3,2,True,1,True


## Decision Tree
### Some Functions Needed

In [4]:
def get_val_counts(df):
    """Count how often each value occurs in every column of *df*.

    The last column is treated as the class label; every other column is an
    attribute.

    Returns a tuple ``(x_vals, y_vals)`` where ``x_vals`` maps each attribute
    column name to a ``{value: count}`` dict and ``y_vals`` is the
    ``{value: count}`` dict of the label column.
    """
    attribute_frame = df.iloc[:, :-1]
    label_column = df.iloc[:, -1]

    x_vals = {}
    for attr in attribute_frame:  # iterating a frame yields its column names
        x_vals[attr] = df[attr].value_counts().to_dict()

    y_vals = label_column.value_counts().to_dict()
    return x_vals, y_vals

In [5]:
def print_yaml(dict):
    """Print *dict* to standard output as block-style YAML."""
    rendered = yaml.dump(dict, default_flow_style=False)
    print(rendered)

### Formulas

$entropy = info(D) = -\sum_{i} p_i \log_2(p_i)$

$information = info_A(D) = \sum_{j=1}^{v}(\frac{|D_j|}{|D|}\cdot I(D_j))$

$information\ gained = gain(A) = info(D) - info_A(D)$

### Implementation

In [6]:
# finding the gain for the given df
def gains(df):
    """Compute the information gain of every attribute column of *df*.

    The last column of *df* is the class label. For each attribute the
    expected information is the weighted sum, over the attribute's values,
    of the label entropy among the records having that value; the gain is
    the label entropy of the whole frame minus that expected information.

    Returns a dict mapping attribute column name to its gain.
    """
    x_vals, y_vals = get_val_counts(df)
    n_records = df.iloc[:, -1].count()
    entropy = I(*y_vals.values())  # label entropy of the whole node

    x_gain = {}
    for attribute, value_counts in x_vals.items():
        expected_info = 0  # info_A(D), accumulated one attribute value at a time
        for value, n_with_value in value_counts.items():
            # Label distribution among the records where attribute == value.
            subset_labels = df.loc[df[attribute] == value].iloc[:, -1]
            class_counts = subset_labels.value_counts().to_list()
            expected_info += (n_with_value / n_records) * I(*class_counts)
        x_gain[attribute] = entropy - expected_info
    return x_gain

In [7]:
def make_decision_tree(df):
    """Recursively build a decision tree for *df* by maximum information gain.

    The last column of *df* is the class label; all other columns are
    attributes.

    Returns a nested dict. A leaf is ``{'class': label}``. An inner node is
    ``{attribute: [{value: subtree}, ...]}`` with one single-key dict per
    value the attribute takes in this node.

    Leaves are produced when:
    * the node holds no records (``{'class': None}``),
    * only one class label is left, or
    * no attribute takes more than one value in the node, so the records
      cannot be separated any further; the most frequent label is used
      (ties go to the label listed first by ``value_counts``).
    """
    # getting all the values of x and y for the given node
    x_vals, y_vals = get_val_counts(df)

    # no records at all: there is nothing to predict
    if not y_vals:
        return {'class': None}

    # if there is only one class label left
    if len(y_vals) == 1:
        return {'class': next(iter(y_vals))}

    gain = gains(df)  # get the gain for each attribute

    # Splitting on an attribute with a single value would recurse on the very
    # same records forever, so only attributes that still take at least two
    # values are eligible. Every branch then holds strictly fewer records.
    candidates = [x for x in gain if len(x_vals[x]) > 1]
    if not candidates:
        # mixed labels but indistinguishable records: majority-class leaf
        return {'class': max(y_vals, key=y_vals.get)}

    x_max = max(candidates, key=gain.get)  # eligible attribute with the max gain
    x_branches = [
        {val: make_decision_tree(df.loc[df[x_max] == val])}
        for val in x_vals[x_max]
    ]
    return {x_max: x_branches}

# Build the tree once, then show it both as YAML and as the raw nested dict.
tree = make_decision_tree(df)
print_yaml(tree)
print(tree)

age:
- 1:
    student:
    - false:
        class: false
    - true:
        class: true
- 3:
    credit_rating:
    - 1:
        class: true
    - 2:
        class: false
- 2:
    class: true

{'age': [{1: {'student': [{False: {'class': False}}, {True: {'class': True}}]}}, {3: {'credit_rating': [{1: {'class': True}}, {2: {'class': False}}]}}, {2: {'class': True}}]}
