# 1. Decision Tree for Classification

## 1.1 Simple Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
salary = {
    "Age": [23, 25, 27, 29, 29],
    "Likes English": [0, 1, 1, 0, 0],
    "Likes AI": [0, 1, 0, 1, 0],
    "Raise Salary": [0, 0, 1, 1, 0]
}
salary = pd.DataFrame(salary)
salary[["Likes English", "Likes AI", "Raise Salary"]] = salary[["Likes English", "Likes AI", "Raise Salary"]].astype("category")
salary

Unnamed: 0,Age,Likes English,Likes AI,Raise Salary
0,23,0,0,0
1,25,1,1,0
2,27,1,0,1
3,29,0,1,1
4,29,0,0,0


In [3]:
def find_gini(x):
    return 1 - np.sum(x**2)

def find_entropy(x):
    return -1 * (np.sum(np.multiply(x[x>0], np.log2(x[x>0]))))

def find_total(df, func_type):
    x = df.columns[:-1]
    y = df.columns[-1]
    D_count = []
    D_cols = []
    for column in x:
        x_count = []
        if df[column].dtype.name == "category":
            x_count = np.array(df.groupby([column, y], observed = False)[y].count().tolist())
            D_count.append(x_count)
            D_cols.append(column)
        else:
            conti_feature = np.array(df[column])
            conti_conditions = [np.mean([conti_feature[i], conti_feature[i+1]]) for i in range(len(conti_feature)-1)]
            for v in conti_conditions:
                left_lst = df[df[column] <= v].groupby(y, observed = False)[y].count().tolist()
                right_lst = df[df[column] > v].groupby(y, observed = False)[y].count().tolist()
                x_count = np.array(left_lst + right_lst)
                D_count.append(x_count)
                D_cols.append(str(v))

    result = {}
    for c, v in zip(D_cols, D_count):
        left_prob = v[:2]
        right_prob = v[2:]
        if np.sum(v[:2]) != 0:
            left_prob = v[:2]/np.sum(v[:2])
        if np.sum(v[2:]) != 0:
            right_prob = v[2:]/np.sum(v[2:])

        if func_type == "gini":
            Di = [find_gini(left_prob), find_gini(right_prob)]
        elif func_type == "entropy":
            Di = [find_entropy(left_prob), find_entropy(right_prob)]

        D = (np.sum(v[:2])/np.sum(v))*Di[0] + (np.sum(v[2:])/np.sum(v))*Di[1]

        if func_type == "gini":
            result[c] = D = round(D, 2)
        elif func_type == "entropy":
            result[c] = D = round(1 - D, 2)

    return result

In [8]:
print("Gini:", find_total(salary, "gini"), "\n")
print("Infomation gain:", find_total(salary, "entropy"))

Gini: {'24.0': 0.4, '26.0': 0.27, '28.0': 0.47, '29.0': 0.48, 'Likes English': 0.47, 'Likes AI': 0.47} 

Infomation gain: {'24.0': 0.2, '26.0': 0.45, '28.0': 0.05, '29.0': 0.03, 'Likes English': 0.05, 'Likes AI': 0.05}


## 1.2 Iris data

In [13]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load the diabetes dataset
iris_X, iris_y = datasets.load_iris(return_X_y = True)

# Split train:test = 8:2
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size = 0.2, random_state = 42)

# Define model
dt_classifier = DecisionTreeClassifier()

# Train
dt_classifier.fit(X_train, y_train)

# Preidct and evaluate
y_pred = dt_classifier.predict(X_test)
accuracy_score(y_test, y_pred )

1.0

# 2. Decision Tree for Regression

## 2.1 Simple Data

In [None]:
import pandas as pd
import numpy as np

In [5]:
salary_reg = {
    "Age": [23, 25, 27, 29, 29],
    "Likes English": [0, 1, 1, 0, 0],
    "Likes AI": [0, 1, 0, 1, 0],
    "Salary": [200, 400, 300, 500, 400]
}
salary_reg = pd.DataFrame(salary_reg)
salary_reg[["Likes English", "Likes AI"]] = salary_reg[["Likes English", "Likes AI"]].astype("category")
salary_reg

Unnamed: 0,Age,Likes English,Likes AI,Salary
0,23,0,0,200
1,25,1,1,400
2,27,1,0,300
3,29,0,1,500
4,29,0,0,400


In [6]:
def find_sum_squared(x):
    result = 0
    if len(x) > 0:
        result = (np.sum((x - np.mean(x))**2))/len(x)
    return result

def find_total_reg(df):
    x = df.columns[:-1]
    y = df.columns[-1]
    D_count = []
    D_cols = []
    for column in x:
        x_count = 0
        if df[column].dtype.name == "category":
            cate_conditions = salary_reg[column].unique()
            for v in cate_conditions:
                x_count += find_sum_squared(df[df[column] == v][y].tolist())
                x_count = round(x_count, 2)
            D_count.append(x_count)
            D_cols.append(column)
        else:
            conti_feature = np.array(df[column])
            conti_conditions = [np.mean([conti_feature[i], conti_feature[i+1]]) for i in range(len(conti_feature)-1)]
            for v in conti_conditions:
                x_count = find_sum_squared(df[df[column] <= v][y].tolist()) + find_sum_squared(df[df[column] > v][y].tolist())
                x_count = round(x_count, 2)
                D_count.append(x_count)
                D_cols.append(str(v))

    result = {}
    for c, v in zip(D_cols, D_count):
        result[c] = v

    return result

In [9]:
print("SSE:", find_total_reg(salary_reg))

SSE: {'24.0': 5000.0, '26.0': 16666.67, '28.0': 9166.67, '29.0': 10400.0, 'Likes English': 18055.56, 'Likes AI': 9166.67}


## 2.2 CPU Machine data

In [14]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Load dataset
machine_cpu = fetch_openml(name = "machine_cpu")
machine_data = machine_cpu.data
machine_labels = machine_cpu.target

# Split train:test = 8:2
X_train, X_test, y_train, y_test = train_test_split(machine_data, machine_labels, test_size = 0.2, random_state = 42)

# Define model
tree_reg = DecisionTreeRegressor()

# Train
tree_reg.fit(X_train, y_train)

# Preidct and evaluate
y_pred = tree_reg.predict(X_test)
mean_squared_error(y_test, y_pred)

  warn(
  warn(


9045.008597883598