In [1]:
import csv
import os
import numpy
from random import randrange
import math
from collections import Counter
import pandas as pd

In [2]:
myname = "Tuhin-Dutta_"

In [6]:
def run_decision_tree():
    # Load data set
    file_name = os.path.join(os.getcwd(), "wine-dataset.csv")
    df = pd.read_csv(file_name)
    # remove the quality column
    trainSet = df[df.columns[:-1]]
    # normalize all data columns
    trainSet = (trainSet - trainSet.min())/(trainSet.max() - trainSet.min())
    # get the quality column labels
    trainLabel = df.iloc[:,-1:]
    # concat trainSet with trainLabel to create final DataFrame df
    df = pd.concat([trainSet, trainLabel], axis=1)
    #declare K for k fold
    K = 9 #folds
    # calculate fold size
    fold_size = int(trainSet.shape[0]/K)
    # divide train set into K folds each of size fold_size
    folds = [df.iloc[i:i+fold_size] for i in range(0,len(df)-fold_size+1, fold_size)] 
    # Get all attributes names
    attributes = list(df.columns)
    # Remove the class attribute
    attributes.remove('quality')
    tot_acc = 0
    for k in range(K):
        test = folds[k] # validation set
        training = [rows for j,rows in enumerate(folds) if k != j] # k-fold training set
        train_df = pd.DataFrame()
        # append all the k-folds into one data frame
        for i in range(int(8)):
            train_df = pd.concat([train_df, training[i]])
        # call learn method of tree
        train_tree = learn(train_df, 'quality', attributes)
        # predict
        test['predicted'] = test.apply(classify,axis=1,args=(train_tree,1))
        # evaluate accuracy
        acc = sum(test['quality'] == test['predicted'] ) / (1.0*len(test.index))
        # total accuracy over k-folds
        tot_acc += acc
    
    # Writing results to a file (DO NOT CHANGE)
    f = open(myname+"result.txt", "a")
    #f.write("Using Entropy, Accuracy: %.4f\n" % float(tot_acc/(1.0*K)))
    f.write("Using Gini, Accuracy: %.4f\n" % float(tot_acc/(1.0*K)))
    f.close()


In [7]:
def learn(df,target,attributes,default_class = 0):
    # count distinct target values
    count = Counter(x for x in df[target])
    if len(count) == 1:
        return next(iter(count))
    elif df.empty or (not attributes):
        return default_class
    else:
        default_class = max(count.keys())
        # calculate information gain using Entropy
        """Comment/Uncomment out the below gain if using entropy"""
        #gains = [info_gain_entropy(df,attr,target) for attr in attributes]
        
        """Comment/Uncomment out the below gain if using gini"""
        # calculate information gain using Gini Impurity
        gains = [info_gain_gini(df,attr,target) for attr in attributes]
        
        # arg max of gains
        index_max = gains.index(max(gains))
        # get best attribute at the current node
        best_attr = attributes[index_max]
        # create new tree
        tree = { best_attr:{ } }
        # get remaining attributes
        remaining_attr = [x for x in attributes if x != best_attr]
        
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = learn(data_subset,target,remaining_attr,default_class) # recursive call to build tree
            tree[best_attr][attr_val] = subtree
        return tree
    
def classify(instance,tree,default = 0):
    attribute = next(iter(tree))
    if instance[attribute] in tree[attribute].keys():
        result = tree[attribute][instance[attribute]]
        if isinstance(result,dict):
            return classify(instance,result) # recursive call to classify instance 
        else:
            return result
    else:
        return default

def calcEntropy(prob):
    return sum([-p*math.log(p,2) for p in prob])

def info_gain_entropy(df,split,target,trace=0):
    df_split = df.groupby(split)
    n = len(df.index)*1.0
    df_agg_ent = df_split.agg({ target:[getEntropyList, lambda x: len(x)/n] })
    df_agg_ent.columns = ['E','PO']
    new_E = sum( df_agg_ent['E'] * df_agg_ent["PO"])
    old_E = getEntropyList(df[target])
    return old_E - new_E


def getEntropyList(l):
    count = Counter(x for x in l)
    n = len(l)*1.0
    p = [x/n for x in count.values()]
    return calcEntropy(p)

def calcGini(rows):
    counts = Counter(x for x in rows)
    im = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        im -= prob_of_lbl**2
    return im

def info_gain_gini(l, r, curr):
    p = float(len(l)) / (len(l) + len(r))
    return calcGini(curr) - p * calcGini(l) - (1 - p) * calcGini(r)

In [8]:
run_decision_tree()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicted'] = test.apply(classify,axis=1,args=(train_tree,1))


We saw our accuracy improved from 0.6863 using Entropy to 0.7433 using Gini.
The Gini Index and the Entropy have two main differences:
1. Gini Index has values inside the interval [0, 0.5] (Gini’s maximum impurity is 0.5 and maximum purity is 0) whereas the interval of the Entropy is [0, 1] (Entropy’s maximum impurity is 1 and maximum purity is 0). 
2. Computationally, entropy is more complex since it makes use of logarithms and consequently, the calculation of the Gini Index will be faster. Entropy is more computationally heavy due to the log in the equation.