In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
import csv
from operator import le, eq
from more_itertools import numeric_range

In [3]:
from sklearn.model_selection import train_test_split
import functools
import time

In [4]:
os.chdir(r'C:\Users\saura\OneDrive\Documents\IIT Hyderabad\Assignments\ML\Assignment 1')
os.listdir()

['Assignment 1 Answer.docx',
 'assignment_1.pdf',
 'Decision Tree results.xlsx',
 'KNN.pdf',
 'links.txt',
 'Sauradeep-Debnath-CV_result.txt',
 'Sauradeep-Debnath-result.txt',
 'skeleton DT.pdf',
 'wine-dataset.csv',
 '~$Decision Tree results.xlsx',
 '~$signment 1 Answer.docx',
 '~WRL2643.tmp',
 '~WRL2709.tmp']

In [5]:
myname = 'Sauradeep-Debnath-'

# LOAD datasets

In [6]:
wine_df = pd.read_csv('wine-dataset.csv')
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0


In [7]:
wine_df.quality.value_counts()

0    3838
1    1060
Name: quality, dtype: int64

In [8]:
wine_df_X = wine_df.iloc[:,:-1]
wine_df_X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


# Decision Tree Helper functions 

In [9]:
def get_impurity(labels, criteria = 'entropy'):
    '''
        Gets the impurity of a given node
        Input Parameters
        ----------
        labels : list
            holds the list of class labels
        criteria : string
            possible values : 'entropy', 'gini'
            default value : 'entropy'
        Returns
        --------------
        impurity : float
            impurity of a given node either as per entropy or gini impurity criteria
        '''
    _,count_labels = np.unique(labels, return_counts=True) # get the counts of elements belonging to each class
    prob_i = count_labels / len(labels) # probability of i-th class 
    if criteria == 'entropy':
        return -np.sum(prob_i*np.log2(prob_i)) # Entropy
    elif criteria == 'gini':
        return 1-np.sum(prob_i*prob_i) # Gini impurity
counts = get_impurity([2,2,4,4,4])
print(counts)

0.9709505944546686


In [10]:
counts = get_impurity([2,2,4,4,4],'gini')
print(counts)

0.48


In [11]:
counts = get_impurity([4,4,4,4,4],'gini')
print(counts)

0.0


In [12]:
counts = get_impurity([1,2],'gini')
print(counts)

0.5


In [13]:
def information_gain(child_nodes, parent_node, criteria = 'entropy'):
        '''
        Gets the impurity reduction when a parent node is divided into child nodes
        Input Parameters
        ----------
        child_nodes : list
            holds child node element as lists inside another list
        parent_node : list
            holds the parent elements as a list
        criteria : string
            possible values : 'entropy', 'gini'
            default value : 'entropy'
        Returns
        --------------
        info_gain : float
            information gain ( reduction in impurity) resulting from the split 
        '''
        parent_impurity = get_impurity(parent_node, criteria)# get entropy/gini impurity , based on parameter "criteria"
        child_impurity_list = [get_impurity(child,criteria) for child in child_nodes]
        child_counts = [len(child) for child in child_nodes] # get the number of elments in each child node
        # get a weighted average of child impurities, weighted by child node sizes
        weighted_avg_child_impurity = np.sum([child_impurity_list[i]*child_counts[i] for i in range(len(child_nodes))])/len(parent_node)
        # get the difference with parent node impurity
        info_gain = parent_impurity - weighted_avg_child_impurity ## get the impurity difference
        return info_gain
    
information_gain([[1,1,1],[2,1,2,2]],[1,1,1,2,1,2,2],'gini')#example of values    

0.27551020408163274

In [14]:
#training_set.iloc[:,-1].value_counts().to_dict()

In [15]:
information_gain([np.array([1,1,1]),np.array([2,1,2,2])],np.array([1,1,1,2,1,2,2]))

0.5216406363433185

In [16]:
def split_tree(dataset, feature, threshold):
    '''
    Splits the 'dataset' dataframe into two parts based on the chosen threshold & returns them
    
    '''
    condition_check_success_records = dataset[dataset[feature].astype(float)>=threshold]## satisfies the condition
    condition_check_failure_records = dataset[dataset[feature].astype(float)<threshold]## fails the condition
    return condition_check_success_records, condition_check_failure_records

In [17]:

def get_best_threshold(dataset, feature,criteria ,threshold):
    '''for a given feature , find the  best threshold in a Node
    Parameters
    -------------
    dataset - dataframe
        The set of records at a given node we need to split further
    feature - str
        the feature / column/attribute we are checking currently
    threshold -
        the current threshold. We iterate through all the values in of a feature in the function check_all_features() and
        pass them one by one to this function
    Returns 
    -------------
    info_gain : float
        the info gain resulting from the current splitting condition ( threshold, feature)
    threshold : threshold 
        corresponding [info_gain, threshold] are passed a list to check_all_features()
    
    '''
    left_tree, right_tree = split_tree(dataset, feature, threshold)# success , failure branches
    if left_tree.shape[1] == 0 or right_tree.shape[1] == 0:# If Row number of any branch=0, skip
                return 0, none
    info_gain = information_gain([left_tree.iloc[ :, -1].values, right_tree.iloc[ :, -1].values], dataset.iloc[ :, -1].values, criteria)
    return [info_gain, threshold]
    
                
def check_all_features(dataset,criteria, feature):
    '''for a given node, check all features to find best splitting condition
    Returns
    ------------
    returns [highest_info_gain, feature, best_threshold] where best_threshold is the best threshold to split a given node
    for a given features
    '''
    all_values = list( set(dataset[feature].astype(float)))
    highest_info_gain, optimal_feature, best_threshold =0 ,None, None
    info_gain_list = list(map(functools.partial(get_best_threshold, dataset, feature, criteria),all_values))
    #info_gain_list = list(map(functools.partial(get_best_threshold, dataset, feature, criteria),eq_100_parts))
    highest_info_gain, best_threshold = max(info_gain_list, key=lambda element: element[0])
    return [highest_info_gain, feature, best_threshold]
                
def best_split_conditions(dataset, criteria):
    '''for a given node , find the best splitting condition ( i.e. which feature to split, at what threshold & 
    what is the resulting information gain)'''
    highest_info_gain = 0  
    num_features = dataset.shape[1] # number of attributes
    optimal_feature, best_threshold =None, None
    col_list  = dataset.columns[:len(dataset.columns)-1].tolist()
    #info_gain, optimal_feature, best_threshold = map(check_all_features, col_list, dataset)
    info_gain_list = list(map(functools.partial(check_all_features, dataset, criteria), col_list))
    #best_index = np.argmax(info_gain_list[0])
    highest_info_gain, optimal_feature, best_threshold =  max(info_gain_list, key=lambda element: element[0])
    #print(highest_info_gain, optimal_feature, best_threshold)
    return highest_info_gain, optimal_feature, best_threshold

## Tree Node Classes 

In [18]:
class Non_Terminal_DT_Node:
    """
    A class that represents a Non Terminal Node (Decision Node) in a Decision Tree i.e. a node that is subsequently split into other nodes
    ...
    Attributes
    ----------
    true_child_tree : object of the class DecisionTree
        holds the the sub tree / child tree / branch which has elements that satisfy the splitting condition
    false_child_tree : object of the class DecisionTree
        holds the sub tree / child tree / branch which has elements that fails the splitting condition
    optimal_feature : str, best_threshold: float
        together these two attributes give us the best splitting condition at any node
    """

    def __init__(self, true_child_tree, false_child_tree, optimal_feature, best_threshold):
       
        self.true_child_tree = true_child_tree
        self.false_child_tree = false_child_tree
        self.optimal_feature = optimal_feature
        self.best_threshold = best_threshold

In [19]:
class Terminal_Node:
    """
    A class that represents a Terminal Node (Leaf Node) in a Decision Tree i.e. a node that is NOT subsequently split into other nodes
    ...
    Attributes
    ----------
    predicted_classlabel : str
    The Majority class label in a given Leaf Node
    """
    def __init__(self, dataset):
        self.predicted_classlabel = dataset.iloc[:,-1].value_counts().idxmax()#to_dict()

# FUNCTIONS used for  BUILDING the TREE 

In [20]:
 def build_decision_tree(training_set, ig_threshold, criteria, minimum_leaf =5):#
        '''
        builds the decision tree by learning the optimum splitting condition
        Input Parameters
        ----------
        training_set : dataframe
            Holds the records in the current node.
        ig_threshold : float
            If info gain is less than this threshold, stop splitting & ignore the current split
        criteria : string
            possible values : 'entropy', 'gini'
            default value : 'entropy'
        Returns
        --------------
        Non_Terminal_DT_Node : object of class Non_Terminal_DT_Node
            holds the Decision Nodes i.e. the nodes that are split further
        Terminal_Node : object of class Terminal_Node
            holds the terminal /Leaf nodes
        
        '''
        #minimum_leaf = 5 #
        if len(training_set.iloc[:,-1].unique().tolist()) ==1: # IF a node has just one class, declare it as Terminal /Leaf
            return Terminal_Node(training_set)
        elif len(training_set.iloc[:,-1].unique().tolist()) > 1:
            if training_set.shape[0]<=minimum_leaf: 
                return Terminal_Node(training_set)
            info_gain, optimal_feature, best_threshold = best_split_conditions(training_set, criteria)
            if info_gain <= ig_threshold :      #or training_set.shape[0]<=minimum_leaf: 
                return Terminal_Node(training_set)
            true_child_set, false_child_set = split_tree(training_set, optimal_feature, best_threshold)
            true_child_tree = build_decision_tree(true_child_set, ig_threshold, criteria)
            false_child_tree = build_decision_tree(false_child_set, ig_threshold, criteria)
            #print(' optimal_feature : '+str(optimal_feature)+' best_threshold : '+str(best_threshold))
            return Non_Terminal_DT_Node(true_child_tree, false_child_tree, optimal_feature, best_threshold)

In [21]:
def classify_test_instance(test_instance, tree_node):
        '''
        classifies a new test instance based on a previously learnt/built decision tree
        Input Parameters
        ----------
        test_instance : nd-array
            Holds the current test instance
        tree_node : object of class DecisionTree
        Returns
        --------------
        predicted_classlabel : str
            the predicted label for the test instance
            '''
        
        result = 0 #
        
        if isinstance(tree_node, Terminal_Node):
            #print('node.label_prediction_dict :'+str(node.label_prediction_dict))
            return tree_node.predicted_classlabel
        
        test_instance = pd.Series(test_instance)
        if float(test_instance[tree_node.optimal_feature])>=tree_node.best_threshold:
            return classify_test_instance(test_instance, tree_node.true_child_tree )
        else:
            return classify_test_instance(test_instance, tree_node.false_child_tree)

In [22]:
class DecisionTree():
    """
    A class that represents a Decision Tree
    ...
    Attributes
    ----------
    ig_threshold : float
        a information gain threshold that tells when to stop splitting the nodes
    criteria : str
        the criteria to measure impurity. Two options entropy(default) and gini impurity

    Methods
    -------
    learn (training_set)
        Learns / builds the decision tree as per the criteria & information gain minimum threshold
    classify (training_set)
        Given a test data, predicts the label based on the existing tree
    """
    def __init__(self,ig_threshold, criteria, minimum_leaf =5):
        self.ig_threshold = ig_threshold
        self.criteria = criteria
        self.minimum_leaf =minimum_leaf 
        tree = {}
    
    def learn(self, training_set):
        # implement this function
        self.tree = build_decision_tree(training_set,self.ig_threshold ,criteria, minimum_leaf)
        
    # implement this function
    def classify(self, test_instance):
        return classify_test_instance(test_instance, self.tree)

### THE BELOW FUNCTION IS FROM THE SKELETON CODE . THIS IS MOSTLY KEPT UNCHANGED

In [23]:
def run_decision_tree(ig_threshold=0.005, criteria='gini',  minimum_leaf =5):
    # Load data set
    with open("wine-dataset.csv") as f:
        next(f, None)
        data = [tuple(line) for line in csv.reader(f, delimiter=",")]
    print("Number of records: %d" % len(data))
    # Split training/test sets
    # You need to modify the following code for cross validation.
    K = 10
    training_set = [x for i, x in enumerate(data) if i % K != 9]
    test_set = [x for i, x in enumerate(data) if i % K == 9]
    training_set = pd.DataFrame(training_set)
    test_set = pd.DataFrame(test_set)
    training_set.apply(pd.to_numeric, errors='ignore')
    
    tree = DecisionTree(ig_threshold, criteria,  minimum_leaf)
    # Construct a tree using training set
    tree.learn( training_set )
    # Classify the test set using the tree we just constructed
    results = []
    for instance in test_set.astype(float).values:
        result = tree.classify( instance[:-1] )
        
        results.append( float(result) == float(instance[-1]))# check if y_true == y_pred
    # Accuracy
    accuracy = float(results.count(True))/float(len(results))
    print("accuracy: %.4f" % accuracy)
    # Writing results to a file (DO NOT CHANGE)
    f = open(myname+"result.txt", "w")
    f.write("accuracy: %.4f" % accuracy)
    f.close()
    return test_set

In [24]:
def run_decision_tree_cross_val(ig_threshold=0.005, criteria='gini', minimum_leaf =5):
    # Load data set
    with open("wine-dataset.csv") as f:
        next(f, None)
        data = [tuple(line) for line in csv.reader(f, delimiter=",")]
    print("Number of records: %d" % len(data))
    # Split training/test sets
    # You need to modify the following code for cross validation.
    K = 10
    cross_val_accuracy_list =[]
    for kfold in range(K):
        training_set = [x for i, x in enumerate(data) if i%K !=kfold]
        test_set = [x for i, x in enumerate(data) if i % K == kfold]
        training_set = pd.DataFrame(training_set)
        test_set = pd.DataFrame(test_set)
        training_set.apply(pd.to_numeric, errors='ignore')

        tree = DecisionTree(ig_threshold, criteria, minimum_leaf)
        # Construct a tree using training set
        tree.learn( training_set )
        # Classify the test set using the tree we just constructed
        results = []
        for instance in test_set.astype(float).values:
            result = tree.classify( instance[:-1] )

            results.append( float(result) == float(instance[-1]))# check if y_true == y_pred
        # Accuracy
        accuracy = float(results.count(True))/float(len(results))
        
        cross_val_accuracy_list.append(accuracy)
    print(cross_val_accuracy_list)
    cross_val_accuracy = sum(cross_val_accuracy_list)/len(cross_val_accuracy_list)
    print("cross_val_accuracy: %.4f " % cross_val_accuracy)
    # Writing results to a file (DO NOT CHANGE)
    f = open(myname+"CV_result.txt", "w")
    f.write("accuracy: %.4f" % cross_val_accuracy)
    f.close()
    return test_set

# We start checking the accuracy for differet hyperparameters next. 

# NOTE - We only NEED to RUN ONE of the below blocks 

In [25]:
start = time.time()
if __name__ == "__main__":
    ig_threshold =0 #0.01 ALL VALUES IN EACH FEATURE
    criteria ='gini'# 'entropy' 'gini'
    minimum_leaf =5
    test_set = run_decision_tree(ig_threshold, criteria,minimum_leaf)
end = time.time()
print(end - start)# EXECUTION TIME in SECONDs

Number of records: 4898
accuracy: 0.8139
202.66212797164917


In [26]:
if __name__ == "__main__":
    ig_threshold =0 #0.01
    criteria ='entropy'# 'entropy' 'gini'
    minimum_leaf =5
    test_set = run_decision_tree(ig_threshold, criteria, minimum_leaf)

Number of records: 4898
accuracy: 0.8180


# CROSS VALIDATION 

In [27]:
start = time.time()
if __name__ == "__main__":
    ig_threshold =0
    criteria ='entropy'# 'entropy' 'gini'
    minimum_leaf =5
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf)
start = time.time()# EXECUTION TIME in SECONDs

Number of records: 4898
[0.8510204081632653, 0.8612244897959184, 0.8163265306122449, 0.8346938775510204, 0.8408163265306122, 0.8020408163265306, 0.8326530612244898, 0.8346938775510204, 0.8302658486707567, 0.8179959100204499]
cross_val_accuracy: 0.8322 


In [210]:
if __name__ == "__main__":
    ig_threshold =0.01
    criteria ='entropy'# 'entropy' 'gini'
    minimum_leaf =5
    test_set = run_decision_tree(ig_threshold, criteria, minimum_leaf)

Number of records: 4898
accuracy: 0.8160


In [211]:
start = time.time()
if __name__ == "__main__":
    ig_threshold =0#
    criteria ='gini'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)
end = time.time()
print(end-start)# EXECUTION TIME in SECONDs

Number of records: 4898
[0.8367346938775511, 0.8428571428571429, 0.7979591836734694, 0.8285714285714286, 0.8510204081632653, 0.7959183673469388, 0.8285714285714286, 0.8428571428571429, 0.8404907975460123, 0.8139059304703476]
cross_val_accuracy: 0.8279 
1150.323976278305


In [33]:
start = time.time()
if __name__ == "__main__":
    ig_threshold =0.03
    criteria ='entropy'# 'entropy' 'gini'
    minimum_leaf =5
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf =5) #0.8040 
    
end = time.time()
print('time taken to execute : '+ str(end-start))## EXECUTION TIME in SECONDs

Number of records: 4898
[0.8040816326530612, 0.8244897959183674, 0.7918367346938775, 0.8020408163265306, 0.8040816326530612, 0.8122448979591836, 0.810204081632653, 0.8061224489795918, 0.7770961145194274, 0.8077709611451943]
cross_val_accuracy: 0.8040 
time taken to execute : 340.13444328308105


In [213]:
if __name__ == "__main__":
    ig_threshold =0.001
    criteria ='entropy'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)

Number of records: 4898
[0.8510204081632653, 0.8612244897959184, 0.8163265306122449, 0.8346938775510204, 0.8408163265306122, 0.8020408163265306, 0.8326530612244898, 0.8346938775510204, 0.8302658486707567, 0.8179959100204499]
cross_val_accuracy: 0.8322 


In [None]:
if __name__ == "__main__":
    ig_threshold =0.0005
    criteria ='entropy'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)

In [None]:
# ENTROPY INCREASED for IG =0.01

## Entropy + Pruning -- Improved Accuracy

In [34]:
start = time.time()
if __name__ == "__main__":
    ig_threshold =0.01 # better than entropy 0 
    criteria ='entropy'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)
    
end = time.time()
print(end - start)# EXECUTION TIME in SECONDs

Number of records: 4898
[0.8510204081632653, 0.8693877551020408, 0.8183673469387756, 0.8367346938775511, 0.8428571428571429, 0.8, 0.8326530612244898, 0.8346938775510204, 0.8343558282208589, 0.8159509202453987]
cross_val_accuracy: 0.8336 
1075.1261570453644


In [None]:
start = time.time()
if __name__ == "__main__":
    ig_threshold =0.01 # better than entropy 0 
    criteria ='entropy'# 'entropy' 'gini'
    minimum_leaf = 15
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf)
    
end = time.time()
print(end - start)# EXECUTION TIME in SECONDs

In [41]:

if __name__ == "__main__":
    ig_threshold =0.02 #  
    criteria ='entropy'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)

Number of records: 4898
[0.8061224489795918, 0.826530612244898, 0.8040816326530612, 0.8244897959183674, 0.8408163265306122, 0.8061224489795918, 0.8163265306122449, 0.8244897959183674, 0.8077709611451943, 0.7975460122699386]
cross_val_accuracy: 0.8154 


In [42]:
start = time.time()
if __name__ == "__main__":
    ig_threshold =0.015 # 
    criteria ='entropy'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)
end = time.time()
print(end- start)# EXECUTION TIME in SECONDs

Number of records: 4898
[0.8448979591836735, 0.8326530612244898, 0.8183673469387756, 0.8428571428571429, 0.8551020408163266, 0.8142857142857143, 0.8326530612244898, 0.8306122448979592, 0.8343558282208589, 0.7975460122699386]
cross_val_accuracy: 0.8303 
1289.4171986579895


In [39]:
if __name__ == "__main__":
    ig_threshold =0.008 # better than entropy 0 
    criteria ='entropy'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)

Number of records: 4898
[0.8510204081632653, 0.8612244897959184, 0.8183673469387756, 0.8367346938775511, 0.8428571428571429, 0.8020408163265306, 0.8326530612244898, 0.8346938775510204, 0.8302658486707567, 0.8179959100204499]
cross_val_accuracy: 0.8328 


In [40]:
if __name__ == "__main__":
    ig_threshold =0.012 
    criteria ='entropy'# 'entropy' 'gini'
    test_set = run_decision_tree_cross_val(ig_threshold, criteria)

Number of records: 4898
[0.8448979591836735, 0.8693877551020408, 0.8183673469387756, 0.8367346938775511, 0.8428571428571429, 0.8, 0.8408163265306122, 0.8346938775510204, 0.8343558282208589, 0.7995910020449898]
cross_val_accuracy: 0.8322 


In [36]:
if __name__ == "__main__":
    ig_threshold =0.007 
    criteria ='gini'# 'entropy' 'gini'
    minimum_leaf = 10
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf)

Number of records: 4898
[0.8346938775510204, 0.8571428571428571, 0.8183673469387756, 0.8306122448979592, 0.8551020408163266, 0.8142857142857143, 0.8387755102040816, 0.8346938775510204, 0.8282208588957055, 0.8241308793456033]
cross_val_accuracy: 0.8336 


In [35]:
#minimum_leaf
if __name__ == "__main__":
    ig_threshold =0.005 # better than gini 0 
    criteria ='gini'# 'entropy' 'gini'
    minimum_leaf = 10
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf)

Number of records: 4898
[0.8346938775510204, 0.8551020408163266, 0.8183673469387756, 0.8306122448979592, 0.8489795918367347, 0.8163265306122449, 0.8387755102040816, 0.8408163265306122, 0.8425357873210634, 0.8241308793456033]
cross_val_accuracy: 0.8350 


## MAXIMUM ACCURACY from Gini Impurity + Pruning

In [57]:
#minimum_leaf
start = time.time()
if __name__ == "__main__":
    ig_threshold =0.005 # better than gini 0 
    criteria ='gini'# 'entropy' 'gini'
    minimum_leaf = 5
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf) #.8350
end = time.time()
print(end-start)

Number of records: 4898
[0.8346938775510204, 0.8551020408163266, 0.8183673469387756, 0.8306122448979592, 0.8489795918367347, 0.8163265306122449, 0.8387755102040816, 0.8408163265306122, 0.8425357873210634, 0.8241308793456033]
cross_val_accuracy: 0.8350 
1345.705882549286


In [45]:
#minimum_leaf
if __name__ == "__main__":
    ig_threshold =0.006 # better than gini 0 
    criteria ='gini'# 'entropy' 'gini'
    minimum_leaf = 5
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf) #.8350

Number of records: 4898
[0.8346938775510204, 0.8591836734693877, 0.8183673469387756, 0.8306122448979592, 0.8489795918367347, 0.8244897959183674, 0.8387755102040816, 0.8346938775510204, 0.83640081799591, 0.8241308793456033]
cross_val_accuracy: 0.8350 


In [75]:
#minimum_leaf
start = time.time()
if __name__ == "__main__":
    ig_threshold =0.006 # better than gini 0 
    criteria ='entropy'# 'entropy' 'gini'
    minimum_leaf = 5
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf) #.8350
end = time.time()
print(end - start)

Number of records: 4898
[0.8510204081632653, 0.8612244897959184, 0.8183673469387756, 0.8367346938775511, 0.8408163265306122, 0.8020408163265306, 0.8326530612244898, 0.8346938775510204, 0.8302658486707567, 0.8179959100204499]
cross_val_accuracy: 0.8326 
1104.9212188720703


In [38]:
#minimum_leaf
if __name__ == "__main__":
    ig_threshold =0.004 # better than gini 0 
    criteria ='gini'# 'entropy' 'gini'
    minimum_leaf = 5
    test_set = run_decision_tree_cross_val(ig_threshold, criteria, minimum_leaf)

Number of records: 4898
[0.8367346938775511, 0.8551020408163266, 0.8183673469387756, 0.8306122448979592, 0.8469387755102041, 0.8040816326530612, 0.8408163265306122, 0.8408163265306122, 0.8425357873210634, 0.8241308793456033]
cross_val_accuracy: 0.8340 


In [58]:
if __name__ == "__main__":
    run_decision_tree_cross_val()

Number of records: 4898
[0.8346938775510204, 0.8551020408163266, 0.8183673469387756, 0.8306122448979592, 0.8489795918367347, 0.8163265306122449, 0.8387755102040816, 0.8408163265306122, 0.8425357873210634, 0.8241308793456033]
cross_val_accuracy: 0.8350 
