In [1]:
# Author: Simon Swenson
# Collaborated with: Subin Sapkota

In [2]:
import pandas as pd
import numpy as np
from sklearn import model_selection as ms
from sklearn import preprocessing as pp
from sklearn import tree as tr

data = pd.read_csv("iris1.csv").values

X = data[:, :4]
y = data[:, 4]

accuracy = {'entropy': 0.0, 'gini': 0.0}

In [3]:
num_folds = 5

# Just doing a shuffled k-fold might be sufficient, but that's random. Stratification ensures
# an equal distribution of the labels.
k_folds = ms.StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 0)
# kFolds = ms.KFold(n_splits = 5, shuffle = True)
for train_indeces_i, test_indeces_i in k_folds.split(X, y):
    # Get the current k-fold data points
    X_train_i = X[train_indeces_i]
    X_test_i = X[test_indeces_i]
    y_train_i = y[train_indeces_i]
    y_test_i = y[test_indeces_i]
    
    accuracy = {'entropy': 0.0, 'gini': 0.0}
    
    # Split the test data into inner folds. Same num splits.
    # Already shuffled, so don't bother again.
    k_folds = ms.StratifiedKFold(n_splits = num_folds)
    for train_indeces_j, test_indeces_j in k_folds.split(X_train_i, y_train_i):
        X_train_j = X_train_i[train_indeces_j]
        X_test_j = X_train_i[test_indeces_j]
        y_train_j = y_train_i[train_indeces_j]
        y_test_j = y_train_i[test_indeces_j]
        
        # Begin classifier pipeline
    
        # Scale and norm the input data
        standard_scalar = pp.StandardScaler()
        normalizer = pp.Normalizer(norm = 'l1')
        X_train_j = normalizer.fit_transform(standard_scalar.fit_transform(X_train_j))
        X_test_j = normalizer.transform(standard_scalar.transform(X_test_j))
    
        # Encode the labels
        le = pp.LabelEncoder()
        y_train_j = le.fit_transform(y_train_j)
        y_test_j = le.transform(y_test_j)
        
        # Train/Test model with different hyperparameters. Made this extensible with a
        # loop for some reason. Didn't want to have to basically write out the same code twice.
        for key in accuracy:
            tree = tr.DecisionTreeClassifier(criterion = key)
            tree.fit(X_train_j, y_train_j)
            X_test_j_results = tree.predict(X_test_j)
            # Compute the accuracy by summing all instances where the classifier's output
            # matches the actual class, then dividing by the number of rows.
            accuracy[key] += float(np.sum([
                1 if xcr_cur == yte_cur else 0 \
                for xcr_cur, yte_cur \
                in zip(X_test_j_results, y_test_j)])) \
                / len(y_test_j)
                
        # End classifier pipeline
        
    # Normalize the accuracy, since we had 5 folds [0, 5] -> [0, 1]
    max_accuracy = 0.0
    max_accuracy_criterion = ''
    for key in accuracy:
        accuracy[key] /= num_folds
        print('Accuracy of ' + key + ": " + str(accuracy[key]))
        if accuracy[key] > max_accuracy:
            max_accuracy = accuracy[key]
            max_accuracy_criterion = key
            
    print('Maximum accuracy hyperparameter was ' + max_accuracy_criterion + ': ' + str(max_accuracy))

    # Now that we know which has the highest accuracy, let's use that for the outer
    # train/test data.
    
    # Begin classifier pipeline
    
    # Scale and norm the input data
    standard_scalar = pp.StandardScaler()
    normalizer = pp.Normalizer(norm = 'l1')
    X_train_i = normalizer.fit_transform(standard_scalar.fit_transform(X_train_i))
    X_test_i = normalizer.transform(standard_scalar.transform(X_test_i))
    
    # Encode the labels
    le = pp.LabelEncoder()
    y_train_i = le.fit_transform(y_train_i)
    y_test_i = le.transform(y_test_i)  
    
    # Train/Test the tree
    tree = tr.DecisionTreeClassifier(criterion = max_accuracy_criterion)
    tree.fit(X_train_i, y_train_i)
    X_test_i_results = tree.predict(X_test_i)
    
    # Compute the accuracy by summing all instances where the classifier's output
    # matches the actual class, then dividing by the number of rows.
    accuracy = float(np.sum([
        1 if xcr_cur == yte_cur else 0 \
        for xcr_cur, yte_cur \
        in zip(X_test_j_results, y_test_j)])) \
        / len(y_test_j)
        
    print('Outer accuracy of ' + max_accuracy_criterion + ' was ' + str(accuracy))
    
    # End classifier pipeline

Accuracy of entropy: 0.7916666666666667
Accuracy of gini: 0.825
Maximum accuracy hyperparameter was gini: 0.825
Outer accuracy of gini was 0.875
Accuracy of entropy: 0.825
Accuracy of gini: 0.85
Maximum accuracy hyperparameter was gini: 0.85
Outer accuracy of gini was 0.9583333333333334
Accuracy of entropy: 0.7583333333333333
Accuracy of gini: 0.825
Maximum accuracy hyperparameter was gini: 0.825
Outer accuracy of gini was 0.875
Accuracy of entropy: 0.7916666666666666
Accuracy of gini: 0.7666666666666666
Maximum accuracy hyperparameter was entropy: 0.7916666666666666
Outer accuracy of entropy was 0.7916666666666666
Accuracy of entropy: 0.8
Accuracy of gini: 0.8166666666666667
Maximum accuracy hyperparameter was gini: 0.8166666666666667
Outer accuracy of gini was 0.9166666666666666


