# Part 1

In [1]:
class Node:
    """A single node of a binary classification tree.

    Internal nodes describe a split through ``feature`` and ``threshold`` and
    point at their two subtrees; leaf nodes only carry ``value``.  Every
    attribute defaults to ``None``.
    """

    def __init__(self, feature=None, threshold=None, value=None, left=None, right=None):
        # Split definition: index of the feature and the cut-off compared against it.
        self.feature, self.threshold = feature, threshold
        # Class label stored on a leaf node.
        self.value = value
        # Child nodes.
        self.left, self.right = left, right

In [2]:
import random
from collections import Counter
from statistics import mode
import numpy as np

def pure(y):
    """Return True when all labels in ``y`` belong to one and the same class.

    An empty ``y`` contains no class at all and therefore yields False.
    """
    distinct_classes = set(y)
    return len(distinct_classes) == 1
    
def gini(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

    For an empty ``y`` there are no class counts, so the sum is empty and the
    result is 1.
    """
    total = len(y)
    return 1 - sum((count / total) ** 2 for count in Counter(y).values())


def split(x, y, minleaf, nfeat):
    """Find the best Gini split of ``(x, y)`` among ``nfeat`` random features.

    Args:
        x: Sequence of feature rows; ``x[i][j]`` is feature ``j`` of row ``i``.
        y: Class labels, one per row of ``x``.
        minleaf: Minimum number of rows each child must receive. A split
            always needs at least one row on both sides, even for values
            below 1.
        nfeat: Number of feature indices drawn (without replacement) as split
            candidates. ``random.sample`` raises ``ValueError`` when it
            exceeds the number of features.

    Returns:
        A tuple ``(feature, threshold, left_x, right_x, left_y, right_y)``.
        Rows whose feature value is ``<=`` the chosen candidate value go left,
        the others go right; ``threshold`` is the midpoint between that
        candidate and the smallest value on the right. When no admissible
        split exists the result is ``(None, None, [], [], [], [])``.
    """
    # Only the sampled features are candidates. Sorting them resolves ties
    # towards the lowest feature index, so the result is deterministic when
    # nfeat equals the number of features.
    features = sorted(random.sample(range(len(x[0])), nfeat))
    n_rows = len(y)
    # A "split" with an empty side is no split at all (and has no threshold).
    min_child = max(minleaf, 1)

    best_gini = 1
    best_feature = None
    best_threshold = None
    best_left_indices = []
    best_right_indices = []

    for feature_index in features:
        values = [row[feature_index] for row in x]
        # dict.fromkeys drops duplicate candidates but keeps first-occurrence
        # order; a repeated value yields the same partition and can never beat
        # the strict "<" comparison below, so skipping it changes nothing.
        for candidate in dict.fromkeys(values):
            left_indices = []
            right_indices = []
            for i, value in enumerate(values):
                (left_indices if value <= candidate else right_indices).append(i)

            if len(left_indices) < min_child or len(right_indices) < min_child:
                continue  # violates the minleaf constraint

            left_gini = gini([y[i] for i in left_indices])
            right_gini = gini([y[i] for i in right_indices])
            # Impurity of both children, weighted by their share of the rows.
            weighted_gini = (
                (len(left_indices) / n_rows) * left_gini
                + (len(right_indices) / n_rows) * right_gini
            )

            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_feature = feature_index
                # Midpoint between the candidate and the next larger value.
                best_threshold = (candidate + min(values[i] for i in right_indices)) / 2
                best_left_indices = left_indices
                best_right_indices = right_indices

    # Translate the winning index lists back into rows and labels.
    best_left_x = [x[i] for i in best_left_indices]
    best_right_x = [x[i] for i in best_right_indices]
    best_left_y = [y[i] for i in best_left_indices]
    best_right_y = [y[i] for i in best_right_indices]

    return best_feature, best_threshold, best_left_x, best_right_x, best_left_y, best_right_y


def tree_grow(x, y, nmin, minleaf, nfeat):
    """Recursively grow a classification tree on rows ``x`` with labels ``y``.

    Args:
        x: Sequence of feature rows.
        y: Class labels, one per row of ``x``.
        nmin: A node that contains fewer cases than ``nmin`` becomes a leaf.
        minleaf: A split that creates a node with fewer than ``minleaf``
            observations is not acceptable; if no split meets this
            constraint the node becomes a leaf.
        nfeat: Number of features drawn at random for every split, from which
            the best (Gini) split is selected.

    Returns:
        The root ``Node`` of the grown tree. Leaves store the majority class
        of their cases in ``value``; internal nodes store ``feature``,
        ``threshold`` and both children.
    """
    # Stop early: the node is pure or too small to be split any further.
    if pure(y) or len(y) < nmin:
        return Node(value=mode(y))

    feature, threshold, leftx, rightx, lefty, righty = split(x, y, minleaf, nfeat)
    if feature is None:  # no admissible split, so the node becomes a leaf
        return Node(value=mode(y))

    # Keyword arguments are evaluated in order: left subtree first, then right.
    return Node(
        feature=feature,
        threshold=threshold,
        left=tree_grow(leftx, lefty, nmin, minleaf, nfeat),
        right=tree_grow(rightx, righty, nmin, minleaf, nfeat),
    )

        
def tree_pred(x, tr):
    """Predict a class label for every row of ``x`` with the tree ``tr``.

    Each row starts at the root ``tr`` and moves to the left child when its
    value for the node's ``feature`` is ``<=`` the node's ``threshold``,
    otherwise to the right child, until it reaches a node whose ``value`` is
    set.  Returns the list of those values, in row order.
    """
    def descend(row):
        node = tr
        while node.value is None:
            goes_left = row[node.feature] <= node.threshold
            node = node.left if goes_left else node.right
        return node.value

    return [descend(row) for row in x]


def tree_grow_b(x, y, nmin, minleaf, nfeat, m):
    """Grow ``m`` trees, each on its own bootstrap sample of ``(x, y)``.

    Every sample draws ``len(x)`` row indices with replacement; ``nmin``,
    ``minleaf`` and ``nfeat`` are forwarded unchanged to ``tree_grow``.
    Returns the list of grown trees.
    """
    def grow_on_resample():
        n_rows = len(x)
        picks = np.random.choice(n_rows, n_rows, replace=True)
        sample_x = [x[k] for k in picks]
        sample_y = [y[k] for k in picks]
        return tree_grow(sample_x, sample_y, nmin, minleaf, nfeat)

    return [grow_on_resample() for _ in range(m)]


def tree_pred_b(trees, x):
    """Predict the rows of ``x`` by majority vote over all ``trees``.

    Each tree predicts every row via ``tree_pred``; the returned list holds,
    per row, the most common label among the trees' predictions.
    """
    # One prediction list per tree.
    per_tree = [tree_pred(x, tree) for tree in trees]
    n_rows = len(per_tree[0])

    # Collect the votes of all trees for each row and keep the most frequent one.
    return [mode([votes[row] for votes in per_tree]) for row in range(n_rows)]



In [3]:
import pandas as pd

# Fit a bagged ensemble on the credit data and predict the training rows.
file_path = 'credit.txt'
df = pd.read_csv(file_path, sep=',')

# Column 'class' is the label; every other column is a feature.
x = df.drop(columns=['class']).to_numpy().tolist()
y = df['class'].to_numpy().tolist()

tree = tree_grow_b(x, y, nmin=2, minleaf=1, nfeat=5, m=4)  # list of 4 trees
print(tree_pred_b(tree, x))










[0, 0, 0, 0, 0, 1, 1, 1, 1, 0]


In [4]:
import pandas as pd

# Grow one tree on the credit data and inspect its top levels.
file_path = 'credit.txt'
df = pd.read_csv(file_path, sep=',')
x = df.drop(columns=['class']).to_numpy().tolist()
y = df['class'].to_numpy().tolist()

tree = tree_grow(x, y, nmin=2, minleaf=1, nfeat=5)

print('tree prediction on x:')
print('prediction', tree_pred(x, tree))
print('actual y', y)

print('\n')  # two empty lines

# Root node: only its split is shown.
print('----- node 1 ------')
print('feature', tree.feature)
print('threshold', tree.threshold)

# Remaining nodes: heading plus the attribute path from the root to the node.
sections = [
    ('----- node 1 - > 2 left -----', ('left',)),
    ('----- node 1 -> 2 right -----', ('right',)),
    ('----- node 2 -> 3 left -----', ('left', 'left')),
    ('----- node 2 -> 3 right -----', ('left', 'right')),
]
for heading, path in sections:
    print(heading)
    node = tree
    for step in path:
        node = getattr(node, step)
    print('feature', node.feature)
    print('threshold', node.threshold)
    print('value', node.value)




tree prediction on x:
prediction [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
actual y [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]


----- node 1 ------
feature 3
threshold 36.0
----- node 1 - > 2 left -----
feature 0
threshold 37.0
value None
----- node 1 -> 2 right -----
feature None
threshold None
value 1
----- node 2 -> 3 left -----
feature None
threshold None
value 0
----- node 2 -> 3 right -----
feature 1
threshold 0.5
value None


## Note: the tree is not quite the same as in the slides; there the first split is also on feature 3, but at 36

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix

# Pima data without a header row: the last column is the class label,
# all preceding columns are features.
df = pd.read_csv('pima.txt', header=None)
x = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

tree = tree_grow(x, y, nmin=20, minleaf=5, nfeat=8)

# Compare the tree's predictions on the training rows with the true labels.
cm = confusion_matrix(y, tree_pred(x, tree))

# Show the result
print("Confusion Matrix:")
print(cm)

# Part 2

In [15]:
# Training set: Eclipse 2.0 data (semicolon-separated)
train_data = pd.read_csv('eclipse-metrics-packages-2.0.csv', sep=';')

# Test set: Eclipse 3.0 data (semicolon-separated)
test_data = pd.read_csv('eclipse-metrics-packages-3.0.csv', sep=';')

In [None]:
# Names of the 41 predictor columns: avg/max/sum of 13 metrics, plus NOCU and pre
pred_variables = [
    'FOUT_avg', 'FOUT_max', 'FOUT_sum',
    'MLOC_avg', 'MLOC_max', 'MLOC_sum',
    'NBD_avg', 'NBD_max', 'NBD_sum',
    'PAR_avg', 'PAR_max', 'PAR_sum',
    'VG_avg', 'VG_max', 'VG_sum',
    'NOF_avg', 'NOF_max', 'NOF_sum',
    'NOM_avg', 'NOM_max', 'NOM_sum',
    'NSF_avg', 'NSF_max', 'NSF_sum',
    'NSM_avg', 'NSM_max', 'NSM_sum',
    'ACD_avg', 'ACD_max', 'ACD_sum',
    'NOI_avg', 'NOI_max', 'NOI_sum',
    'NOT_avg', 'NOT_max', 'NOT_sum',
    'TLOC_avg', 'TLOC_max', 'TLOC_sum',
    'NOCU', 'pre',
]

# Training set: predictor columns, and a binary label that is 1 whenever 'post' is non-zero
x_train = train_data[pred_variables]
y_train = [int(post != 0) for post in train_data['post']]

# Test set: same predictors and the same binary label
x_test = test_data[pred_variables]
y_test = [int(post != 0) for post in test_data['post']]

In [51]:
# Show the training predictors: one row per data instance (377 in total),
# one column for each of the 41 features
print(x_train)

     FOUT_avg  FOUT_max  FOUT_sum   MLOC_avg  MLOC_max  MLOC_sum   NBD_avg  \
0    5.980769      29.0     311.0   9.230769      55.0     480.0  1.826923   
1    4.000000      22.0     168.0   6.666667      32.0     280.0  1.357143   
2    4.321267      33.0     955.0   7.027149      83.0    1553.0  1.452489   
3    3.752941      80.0     319.0   6.517647     118.0     554.0  1.564706   
4    6.552632      63.0     996.0  10.736842      75.0    1632.0  2.052632   
..        ...       ...       ...        ...       ...       ...       ...   
372  1.750000      11.0      28.0   3.437500      16.0      55.0  1.250000   
373  2.764706      19.0      47.0   5.705882      29.0      97.0  1.352941   
374  2.000000      10.0      32.0   3.500000      16.0      56.0  1.375000   
375  6.700000      61.0     201.0  12.200000      92.0     366.0  1.733333   
376  3.759259      16.0     203.0   8.333333      30.0     450.0  1.888889   

     NBD_max  NBD_sum   PAR_avg  ...  NOI_max  NOI_sum   NOT_av

In [52]:
# Show the binary training labels, one per data instance (377 in total)
print(y_train)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 

In [None]:
# tree_grow/split index rows positionally (x[0], row[j]). On a DataFrame
# x_train[0] is a column lookup and raises KeyError, so hand the predictors
# over as a plain list of rows instead.
eclipse_tr = tree_grow(x_train.values.tolist(), y_train, nmin=15, minleaf=5, nfeat=41)