In [4]:
import numpy as np
from decision_tree import DecisionTree
from random_forest import RandomForest
from logistic_regression import gradient_descent


def accuracy_score(Y_true, Y_predict):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before they are compared, so a column of
    labels with shape (n, 1) can be scored against a flat sequence of n
    predictions. This avoids relying on the conversion of one-element
    arrays to Python scalars, which NumPy has deprecated.

    Args:
        Y_true: Sequence or array of ground-truth labels.
        Y_predict: Sequence or array of predicted labels holding the same
            number of elements as ``Y_true``.

    Returns:
        The number of matching positions divided by the number of labels,
        as a Python float.

    Raises:
        ValueError: If the inputs hold different numbers of labels.
        ZeroDivisionError: If ``Y_true`` is empty.
    """
    true_labels = np.asarray(Y_true).ravel()
    predicted_labels = np.asarray(Y_predict).ravel()

    # A silent truncation (what zip() would do) yields a misleading score,
    # so a size mismatch is reported instead.
    if true_labels.size != predicted_labels.size:
        raise ValueError(
            "Y_true and Y_predict must hold the same number of labels "
            "(got %d and %d)" % (true_labels.size, predicted_labels.size)
        )

    matches = int(np.count_nonzero(true_labels == predicted_labels))

    # Plain int / int: gives a float and raises ZeroDivisionError when empty.
    return matches / true_labels.size


def evaluate_performance(filename='SPECTF.dat', num_trials=1, train_size=100):
    """Compare decision tree, random forest and logistic regression accuracy.

    Loads a comma-delimited numeric file whose first column is the label and
    whose remaining columns are the features. For every trial the rows are
    shuffled, the three models are trained on the first ``train_size`` rows
    and each is scored on the remaining rows with ``accuracy_score``.

    Args:
        filename: Path of the comma-delimited data file.
        num_trials: Number of shuffled train/test splits to evaluate.
        train_size: Number of shuffled rows used for training; the rest of
            the rows form the test set.

    Returns:
        A (3, 2) float array. Rows are decision tree, random forest and
        logistic regression (in that order); column 0 is the mean accuracy
        over the trials and column 1 its standard deviation.

    Raises:
        ValueError: If ``num_trials`` is smaller than 1, or ``train_size``
            does not leave at least one row for both training and testing.
    """
    data = np.loadtxt(filename, delimiter=',')
    # Echo the first ten parsed rows as a quick sanity check of the input.
    print(data[:10, :])

    X = data[:, 1:]
    y = data[:, [0]]  # labels kept as an (n, 1) column
    n = X.shape[0]

    if num_trials < 1:
        raise ValueError("num_trials must be at least 1")
    if not 0 < train_size < n:
        raise ValueError(
            "train_size must be between 1 and %d (got %d)" % (n - 1, train_size)
        )

    tree_accuracies = []
    forest_accuracies = []
    log_accuracies = []

    def logistic_prediction(features, b):
        """Return 0/1 predictions: 1 where the linear score is positive."""
        features = np.asarray(features)
        # Prepend the intercept column so b[0] acts as the bias term.
        design = np.column_stack((np.ones(len(features)), features))
        # One matrix product for all rows; ravel() copes with b being either
        # a flat vector or a column (assumed shapes - gradient_descent is
        # defined outside this file).
        scores = np.ravel(design.dot(b))
        return (scores > 0).astype(int).tolist()

    # Seed once so the whole run is reproducible while every trial still
    # draws its own permutation.
    np.random.seed(13)

    for _ in range(num_trials):
        idx = np.arange(n)
        np.random.shuffle(idx)
        X_shuffled = X[idx]
        y_shuffled = y[idx]

        Xtrain = X_shuffled[:train_size, :]
        Xtest = X_shuffled[train_size:, :]
        ytrain = y_shuffled[:train_size, :]
        ytest = y_shuffled[train_size:, :]

        # Decision tree
        tree_classifier = DecisionTree(100)
        tree_classifier.fit(Xtrain, ytrain)
        y_pred = tree_classifier.predict(Xtest)
        tree_accuracies.append(accuracy_score(ytest, y_pred))

        # Random forest
        forest_classifier = RandomForest(
            num_trees=10, max_tree_depth=100, ratio_per_tree=0.7
        )
        forest_classifier.fit(Xtrain, ytrain)
        y_pred_forest = forest_classifier.predict(Xtest)
        forest_accuracies.append(accuracy_score(ytest, y_pred_forest))

        # Logistic regression
        beta = gradient_descent(Xtrain, ytrain)
        y_logistic_pred = logistic_prediction(Xtest, beta)
        log_accuracies.append(accuracy_score(ytest, y_logistic_pred))

    # One row per model: [mean accuracy, standard deviation].
    return np.array([
        [np.mean(tree_accuracies), np.std(tree_accuracies)],
        [np.mean(forest_accuracies), np.std(forest_accuracies)],
        [np.mean(log_accuracies), np.std(log_accuracies)],
    ])


if __name__ == "__main__":
    # Run the evaluation and report "mean ( stddev )" for each model, in the
    # row order returned by evaluate_performance().
    stats = evaluate_performance()
    report_labels = (
        "Decision Tree Accuracy = ",
        "Random Forest Tree Accuracy = ",
        "Logistic Reg. Accuracy = ",
    )
    for row, label in enumerate(report_labels):
        mean_accuracy = stats[row, 0]
        stddev_accuracy = stats[row, 1]
        print(label, mean_accuracy, " (", stddev_accuracy, ")")


[[  1.  59.  52.  70.  67.  73.  66.  72.  61.  58.  52.  72.  71.  70.
   77.  66.  65.  67.  55.  61.  57.  68.  66.  72.  74.  63.  64.  56.
   54.  67.  54.  76.  74.  65.  67.  66.  56.  62.  56.  72.  62.  74.
   74.  64.  67.]
 [  1.  72.  62.  69.  67.  78.  82.  74.  65.  69.  63.  70.  70.  72.
   74.  70.  71.  72.  75.  66.  65.  73.  78.  74.  79.  74.  69.  69.
   70.  71.  69.  72.  70.  62.  65.  65.  71.  63.  60.  69.  73.  67.
   71.  56.  58.]
 [  1.  71.  62.  70.  64.  67.  64.  79.  65.  70.  69.  72.  71.  68.
   65.  61.  61.  73.  71.  75.  74.  80.  74.  54.  47.  53.  37.  77.
   68.  72.  59.  72.  68.  60.  60.  73.  70.  66.  65.  64.  55.  61.
   41.  51.  46.]
 [  1.  69.  71.  70.  78.  61.  63.  67.  65.  59.  59.  66.  69.  71.
   75.  65.  58.  60.  55.  62.  59.  67.  66.  74.  74.  64.  60.  57.
   54.  70.  73.  69.  76.  62.  64.  61.  61.  66.  65.  72.  73.  68.
   68.  59.  63.]
 [  1.  70.  66.  61.  66.  61.  58.  69.  69.  72.  68.  62.  7