In [34]:
import numpy as np
from DecisionTree import DecisionTree
from random_forest import RandomForest
from logistic_regression import gradient_descent,sigmoid

In [35]:
def accuracy_score(Y_true, Y_predict):
    '''
    Fraction of positions at which the prediction equals the true label.

    Arguments:
      Y_true:    indexable sequence of true labels; its length sets how many
                 positions are compared
      Y_predict: indexable sequence of predicted labels, compared element by
                 element against Y_true

    Return:
      number of matching positions divided by len(Y_true)
    '''
    total = len(Y_true)
    hits = 0
    for position in range(total):
        if Y_true[position] == Y_predict[position]:
            hits += 1
    return hits / total


def evaluate_performance(filename='SPECTF.dat', n_trials=10, n_train=200):
    '''
    Evaluate the performance of a decision tree, a random forest and
    logistic regression, averaged over repeated shuffled hold-out splits.

    In every trial the instances are shuffled (the global NumPy RNG is
    seeded with the trial index), the first n_train instances are used for
    training and all remaining instances for testing. The accuracy of each
    model on the held-out instances is recorded.

    Arguments:
      filename: comma-delimited data file; column 0 holds the label and the
                remaining columns hold the features
      n_trials: number of shuffled hold-out trials to average over
      n_train:  number of instances used for training in each trial

    Return:
      a 3x2 matrix giving the performance, with the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
      stats[2,0] = mean accuracy of logistic regression
      stats[2,1] = std deviation of logistic regression accuracy

    Raises:
      ValueError: if n_trials is less than 1, or if n_train does not leave
                  at least one training and one test instance
    '''

    # Load Data
    data = np.loadtxt(filename, delimiter=',')
    print(data[:5, :])
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T  # labels as a column vector
    n = X.shape[0]

    if n_trials < 1:
        raise ValueError("n_trials must be at least 1, got %r" % (n_trials,))
    if not 0 < n_train < n:
        raise ValueError(
            "n_train must be between 1 and %d (number of instances - 1), got %r"
            % (n - 1, n_train)
        )

    all_accuracies = []
    all_accuracies_RF = []
    all_accuracies_log = []
    for trial in range(n_trials):
        # Seeding the global RNG makes each trial reproducible.
        np.random.seed(trial)
        idx = np.arange(n)
        np.random.shuffle(idx)
        # X and y are rebound to the shuffled copies, so each trial permutes
        # the previous trial's ordering rather than the ordering in the file.
        X = X[idx]
        y = y[idx]
        Xtrain = X[:n_train, :]  # train on the first n_train instances
        ytrain = y[:n_train, :]
        Xtest = X[n_train:, :]   # test on the remaining instances
        ytest = y[n_train:, :]

        # train the decision tree
        tree = DecisionTree(100)
        tree.fit(Xtrain, ytrain)
        # train the random forest
        forest = RandomForest(5, 100)
        forest.fit(Xtrain, ytrain)
        # run logistic regression on the features plus a leading column of ones
        Xtrain_bias = np.column_stack((np.ones(Xtrain.shape[0]), Xtrain))
        beta, feature_means = gradient_descent(Xtrain_bias, ytrain, step_size=1e-2, max_steps=100)
        # NOTE(review): presumably this undoes a per-feature scaling applied
        # inside gradient_descent -- confirm against logistic_regression.py.
        for i in range(len(beta)):
            beta[i] /= feature_means[i]

        # output predictions on the remaining data
        y_pred = tree.predict(Xtest)
        y_pred_RF = forest.predict(Xtest)

        # logistic regression predicts class 1 when the linear score is positive
        Xtest_bias = np.column_stack((np.ones(Xtest.shape[0]), Xtest))
        y_pred_log = [1 if row.dot(beta) > 0 else 0 for row in Xtest_bias]

        # compute the held-out (test) accuracy of each model
        all_accuracies.append(accuracy_score(ytest, y_pred))
        all_accuracies_RF.append(accuracy_score(ytest, y_pred_RF))
        all_accuracies_log.append(accuracy_score(ytest, y_pred_log))

    # make certain that the return value matches the API specification:
    # one row per model, columns are (mean, std deviation)
    per_model_accuracies = (all_accuracies, all_accuracies_RF, all_accuracies_log)
    stats = np.zeros((len(per_model_accuracies), 2))
    for row, accuracies in enumerate(per_model_accuracies):
        stats[row, 0] = np.mean(accuracies)
        stats[row, 1] = np.std(accuracies)

    return stats



In [36]:
# Run the evaluation once and report mean accuracy (std deviation) per model.
stats = evaluate_performance()
report_rows = (
    ("Decision Tree Accuracy = ", 0),
    ("Random Forest Accuracy = ", 1),
    ("Logistic Reg. Accuracy = ", 2),
)
for label, row in report_rows:
    print(label, stats[row, 0], " (", stats[row, 1], ")")

[[  1.  59.  52.  70.  67.  73.  66.  72.  61.  58.  52.  72.  71.  70.
   77.  66.  65.  67.  55.  61.  57.  68.  66.  72.  74.  63.  64.  56.
   54.  67.  54.  76.  74.  65.  67.  66.  56.  62.  56.  72.  62.  74.
   74.  64.  67.]
 [  1.  72.  62.  69.  67.  78.  82.  74.  65.  69.  63.  70.  70.  72.
   74.  70.  71.  72.  75.  66.  65.  73.  78.  74.  79.  74.  69.  69.
   70.  71.  69.  72.  70.  62.  65.  65.  71.  63.  60.  69.  73.  67.
   71.  56.  58.]
 [  1.  71.  62.  70.  64.  67.  64.  79.  65.  70.  69.  72.  71.  68.
   65.  61.  61.  73.  71.  75.  74.  80.  74.  54.  47.  53.  37.  77.
   68.  72.  59.  72.  68.  60.  60.  73.  70.  66.  65.  64.  55.  61.
   41.  51.  46.]
 [  1.  69.  71.  70.  78.  61.  63.  67.  65.  59.  59.  66.  69.  71.
   75.  65.  58.  60.  55.  62.  59.  67.  66.  74.  74.  64.  60.  57.
   54.  70.  73.  69.  76.  62.  64.  61.  61.  66.  65.  72.  73.  68.
   68.  59.  63.]
 [  1.  70.  66.  61.  66.  61.  58.  69.  69.  72.  68.  62.  7