In [3]:
# def entropy
# def variance
# def info_gain
# def categorical_split
# def max_info (choose one of the unique value in one feature to have the highest info_gain => then choose the highest ig among the features)

In [4]:
import numpy as np
import pandas as pd 

colname_d3 = ['x1', 'x2', 'y']
df_d3 = pd.read_csv('D3leaves.txt', names= colname_d3, sep = ' ')

In [5]:
def entropy(y):
    prob = y.value_counts()/y.shape[0]
    entropy = np.sum(-prob* np.log2(prob))
    return entropy

In [6]:
def variance(y):
    if len(y) == 1:
        return 0
    else : 
        return y.var()

In [7]:
def info_gain(y, mask, func = entropy):
    a = sum(mask)
    b = mask.shape[0] - a 
    if a == 0 or b == 0:
         return 0
    return func(y) - (func(y[mask])*a/(a+b) + func(y[-mask])*b/(a+b))

In [8]:
def max_info_gain(x, y, func = entropy):
    ig = 0
    split_value = 0
    num = 0

    is_numeric = True if x.dtypes != 'O' else False

    #creating options according to variable types
    options = x.sort_values().unique()[1:] #not starting by the first element since there's nothing to split before the first element
    
    #calculating information gain for different value
    for value in options:
        mask = x >= value
        value_ig = info_gain(y, mask, func)

        if value_ig > ig:
            ig = value_ig
            split_value = value
            num = num + 1
    
    if num == 0:
        return (None, None, False)

    else: 
        return (ig, split_value, True)


In [None]:
#highest Information Gain is Weight
#include the different hyperparameters that a decision tree generally offers, thus avoiding overfitting:
    #max_depth: maximum depth of the tree. If we set it to None, the tree will 
              # grow until all the leaves are pure or the hyperparameter min_samples_split has been reached.
    #min_samples_split: indicates the minimum number of observations a sheet must have to continue creating new nodes.
    #min_information_gain: the minimum amount the Information Gain must increase for the tree to continue growing.

In [10]:
#y: name of the target features
def get_best_split(y, data):
    masks = data.drop(y, axis = 1).apply(max_info_gain, y = data[y])
    if sum(masks.loc[3,:]) == 0:
        return (None, None, None, False)

    else:
        #choose only masks that can be splitted
        masks = masks.loc[:, masks.loc[3,:]] # choose all columns that has the 3th index True box
        split_vari = max(masks) #choose split feature
        split_ig = masks[split_vari][0]
        split_val = masks[split_vari][1]
    return (split_vari, split_val, split_ig)
        

In [None]:
'''
  Given a data and a split conditions, do the split.
  variable: variable with which make the split.
  value: value of the variable to make the split.
  data: data to be splitted.
  is_numeric: boolean considering if the variable to be splitted is numeric or not.
  '''
def make_split(variable, value, data, is_numeric):
    if is_numeric:
      data1 = data[data[variable] < value]
      data2 = data[(data[variable] < value) == False]
    else:
      data1 = data[data[variable].isin(value)]
      data2 = data[data[variable].isin(value) == False]
    return data1, data2


In [None]:
#Example/ Demonstration of the previous code
value = 'Weight'
print(type(value))
print(data[value] < 103)
data1 = data[data['Weight'] < 103] 
print(data1)

<class 'str'>
0       True
1       True
2      False
3      False
4       True
       ...  
495    False
496    False
497    False
498     True
499    False
Name: Weight, Length: 500, dtype: bool
     Gender  Height  Weight  Obese
0      Male     174      96      1
1      Male     189      87      0
4      Male     149      61      0
6      Male     147      92      1
8      Male     174      90      0
..      ...     ...     ...    ...
490  Female     164      59      0
492  Female     198      50      0
493  Female     170      53      0
494    Male     152      98      1
498    Male     150      95      1

[229 rows x 4 columns]


In [None]:
def make_pred(data, target_factor):
    '''
  Given the target variable, make a prediction.
  data: pandas series for target variable
  target_factor: boolean considering if the variable is a factor or not
  '''
    if target_factor:
        pred = data.value_counts().idxmax() #return which value has more occurence, in this case, there an "1" or "0"
    else:
        pred = data.mean()
    return pred


In [None]:
def train_tree(data, y, target_factor, max_depth = None, min_samples_split = None, min_information_gain = 1e-20, max_categories = 20, counter = 0):
    '''
  Trains a Decission Tree
  data: Data to be used to train the Decission Tree
  y: target variable column name
  target_factor: boolean to consider if target variable is factor or numeric.
  max_depth: maximum depth to stop splitting.
  min_samples_split: minimum number of observations to make a split.
  min_information_gain: minimum ig gain to consider a split to be valid.
  max_categories: maximum number of different values accepted for categorical values. High number of values will slow down learning process. R
'''
    # Check that max_categories is fulfilled
    if counter ==0:
        types = data.dtypes
        check_columns = types[types == "object"].index #deo hieu
        for columns in check_columns:
            var_length = len(data[columns].value_counts())
            if var_length > max_categories:
                raise ValueError('The variable ' + columns + ' has '+ str(var_length) + ' unique values, which is more than the accepted ones: ' +  str(max_categories))
    #check for depth condition
    if max_depth == None:
        dep_cond = True
    else:
        if counter < max_depth:
            dep_cond = True
        else: 
            dep_cond = False
    #check for sample condition
    if min_samples_split == None:
        sample_cond = True
    else:
        if data.shape[0] > min_samples_split: #number of rows
            sample_cond = True
        else: 
            sample_cond = False
    # Check for ig condition
    if dep_cond & sample_cond:
        var,val, ig, var_type = get_best_split(y, data)
        print(val)
        # If ig condition is fulfilled, make split
        if ig is not None and ig >= min_information_gain:
            counter = counter + 1
            left, right = make_split(var, val, data, var_type) #2 sub datasets

            #instantiate sub-tree (deo hieu)
            split_type = "<=" if var_type else "in"
            question = "{} {}  {}".format(var, split_type, val)
            subtree = {question: []} #[] represents values of 2 branches, {} represent one such branch 
    
    # Find answers (recursion)
            yes_ans = train_tree(left, y, target_factor, max_depth, min_samples_split, min_information_gain, max_categories, counter)
            no_ans = train_tree(right, y, target_factor, max_depth, min_samples_split, min_information_gain, max_categories, counter)
            print(yes_ans, no_ans)
            if yes_ans == no_ans: #neu cung class het thi ko chia ra
                subtree = yes_ans
            else:
                subtree[question].append(yes_ans)
                subtree[question].append(no_ans)
        #If it doesn't match IG condition, make prediction
        else:
            pred = make_pred(data[y], target_factor)
            return pred
    else:
        pred = make_pred(data[y], target_factor)
        return pred

    return subtree


In [None]:
max_depth = 5
min_samples_split = 20
min_information_gain = 1e-5

decisiones = train_tree(data, 'Obese', True, max_depth, min_samples_split, min_information_gain)

decisiones
#                W <= 103
#                 /   \
#           W<=66      1
#          /     \
#         0     W <=84
#                /    \
#            W<=74     W <=98
#           /  \         /  \
#        0   W <=75     1    0
#             /  \
#            1    0

Check1
Check2
Check3
103
Check1
Check2
Check3
66
Check1
Check2
Check3
None
Check1
Check2
Check3
84
Check1
Check2
Check3
74
Check1
Check2
Check3
72
Check1
Check2
Check3
Check1
Check2
Check3
0 0
Check1
Check2
Check3
75
Check1
Check2
Check3
Check1
Check2
Check3
1 0
0 {'Weight <=  75': [1, 0]}
Check1
Check2
Check3
98
Check1
Check2
Check3
97
Check1
Check2
Check3
Check1
Check2
Check3
1 1
Check1
Check2
Check3
1 0
{'Weight <=  74': [0, {'Weight <=  75': [1, 0]}]} {'Weight <=  98': [1, 0]}
0 {'Weight <=  84': [{'Weight <=  74': [0, {'Weight <=  75': [1, 0]}]}, {'Weight <=  98': [1, 0]}]}
Check1
Check2
Check3
116
Check1
Check2
Check3
104
Check1
Check2
Check3
Check1
Check2
Check3
107
Check1
Check2
Check3
Check1
Check2
Check3
108
Check1
Check2
Check3
Check1
Check2
Check3
1 1
1 1
1 1
Check1
Check2
Check3
None
1 1
{'Weight <=  66': [0, {'Weight <=  84': [{'Weight <=  74': [0, {'Weight <=  75': [1, 0]}]}, {'Weight <=  98': [1, 0]}]}]} 1


{'Weight <=  103': [{'Weight <=  66': [0,
    {'Weight <=  84': [{'Weight <=  74': [0, {'Weight <=  75': [1, 0]}]},
      {'Weight <=  98': [1, 0]}]}]},
  1]}

In [None]:
colname_d3 = ['x1', 'x2', 'y']
df_d1 = pd.read_csv('D1.txt', names = colname_d3, sep = " ")
train_tree(df_d1,'y', True, max_depth, min_samples_split, min_information_gain)

Check1
Check2
Check3
0.201829
Check1
Check2
Check3
None
Check1
Check2
Check3
None
0 1


{'x2 <=  0.201829': [0, 1]}

In [None]:
df_d2 = pd.read_csv('D2.txt', names = colname_d3, sep = " ")
train_tree(df_d2,'y', True, max_depth, min_samples_split, min_information_gain)

Check1
Check2
Check3
0.506048
Check1
Check2
Check3
0.224236
Check1
Check2
Check3
0.053292
Check1
Check2
Check3
0.037708
Check1
Check2
Check3
None
Check1
Check2
Check3
0 0
Check1
Check2
Check3
0.053702
Check1
Check2
Check3
Check1
Check2
Check3
0.061886
Check1
Check2
Check3
Check1
Check2
Check3
0 0
1 0
0 {'x2 <=  0.053702': [1, 0]}
Check1
Check2
Check3
0.383738
Check1
Check2
Check3
0.379696
Check1
Check2
Check3
0.235379
Check1
Check2
Check3
Check1
Check2
Check3
1 0
Check1
Check2
Check3
{'x2 <=  0.235379': [1, 0]} 0
Check1
Check2
Check3
0.388699
Check1
Check2
Check3
Check1
Check2
Check3
0.390262
Check1
Check2
Check3
Check1
Check2
Check3
0 0
1 0
{'x2 <=  0.379696': [{'x2 <=  0.235379': [1, 0]}, 0]} {'x2 <=  0.388699': [1, 0]}
{'x2 <=  0.053292': [0, {'x2 <=  0.053702': [1, 0]}]} {'x2 <=  0.383738': [{'x2 <=  0.379696': [{'x2 <=  0.235379': [1, 0]}, 0]}, {'x2 <=  0.388699': [1, 0]}]}
Check1
Check2
Check3
0.88635
Check1
Check2
Check3
0.641964
Check1
Check2
Check3
0.516515
Check1
Check2
Check

{'x2 <=  0.506048': [{'x2 <=  0.224236': [{'x2 <=  0.053292': [0,
      {'x2 <=  0.053702': [1, 0]}]},
    {'x2 <=  0.383738': [{'x2 <=  0.379696': [{'x2 <=  0.235379': [1, 0]}, 0]},
      {'x2 <=  0.388699': [1, 0]}]}]},
  {'x2 <=  0.88635': [{'x2 <=  0.641964': [{'x2 <=  0.516515': [1,
        {'x2 <=  0.635533': [1, 0]}]},
      {'x2 <=  0.654331': [1, {'x2 <=  0.658212': [0, 1]}]}]},
    {'x2 <=  0.919236': [{'x2 <=  0.917219': [1, 0]}, 1]}]}]}