In [3]:
import numpy as np
from decision_tree import DecisionTree
from random_forest import RandomForest
from logistic_regression import gradient_descent,sigmoid

ImportError: No module named 'build'

In [4]:
def accuracy_score(Y_true, Y_predict):
    """Return the fraction of predicted labels that equal the true labels.

    Both inputs are flattened before they are compared, so a column vector
    of shape (n, 1) and a flat sequence of length n are matched label by
    label.

    Args:
        Y_true: array-like of ground-truth labels.
        Y_predict: array-like of predicted labels holding the same number
            of labels as ``Y_true``.

    Returns:
        float: number of matching labels divided by the number of labels.

    Raises:
        ValueError: if the two inputs hold a different number of labels.
        ZeroDivisionError: if the inputs are empty.
    """
    true_labels = np.asarray(Y_true).ravel()
    predicted_labels = np.asarray(Y_predict).ravel()

    # A pairwise zip() would silently drop the unmatched tail and still
    # divide by the full length, so reject mismatched inputs explicitly.
    if true_labels.size != predicted_labels.size:
        raise ValueError(
            "Y_true and Y_predict must hold the same number of labels, got "
            "{} and {}".format(true_labels.size, predicted_labels.size)
        )

    matches = int(np.count_nonzero(true_labels == predicted_labels))
    # int / int keeps the plain Python float result (and the
    # ZeroDivisionError on empty input).
    return matches / true_labels.size


In [5]:

def evaluate_performance(num_trials=10, train_size=200, filename='SPECTF.dat', verbose=False):
    '''
    Compare a decision tree, a random forest and logistic regression on a
    labelled data file using repeated random hold-out splits.

    Every trial seeds NumPy's global random generator with the trial index,
    shuffles the rows, trains the three models on the first ``train_size``
    rows and scores them on the remaining rows.

    Args:
      num_trials: number of shuffled train/test splits to evaluate.
      train_size: number of rows used for training in each trial; the rest
        of the rows form the test set.
      filename: comma-separated numeric file; column 0 is used as the label
        and the remaining columns as features.
      verbose: when True, print the first five rows of the loaded data.

    Return:
      a 3x2 matrix giving the performance with the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
      stats[2,0] = mean accuracy of logistic regression
      stats[2,1] = std deviation of logistic regression accuracy

    Raises:
      ValueError: if ``num_trials`` is smaller than 1, or if ``train_size``
        does not leave at least one training row and one test row.
    '''
    if num_trials < 1:
        raise ValueError("num_trials must be at least 1, got {}".format(num_trials))

    # Load Data
    data = np.loadtxt(filename, delimiter=',')
    if verbose:
        print(data[:5, :])
    X = data[:, 1:]
    y = data[:, [0]]  # labels kept as an (n, 1) column vector
    n = X.shape[0]
    if not 0 < train_size < n:
        raise ValueError(
            "train_size must be between 1 and {} (number of rows - 1), got {}".format(n - 1, train_size)
        )

    tree_accuracies = []
    forest_accuracies = []
    logistic_accuracies = []
    for trial in range(num_trials):
        # Seed the *global* generator (not a local one): the shuffle below
        # uses it, and the models may draw from it too -- presumably the
        # random forest does; confirm against random_forest.
        idx = np.arange(n)
        np.random.seed(trial)
        np.random.shuffle(idx)
        # The permutation is applied to the rows as left by the previous
        # trial, so the shuffles compound from one trial to the next.
        X = X[idx]
        y = y[idx]
        Xtrain = X[:train_size, :]  # train on the first train_size instances
        Xtest = X[train_size:, :]   # test on the remaining instances
        ytrain = y[:train_size, :]
        ytest = y[train_size:, :]

        # train the decision tree
        tree = DecisionTree(100)
        tree.fit(Xtrain, ytrain)
        # train the random forest
        forest = RandomForest(num_trees=10, max_tree_depth=100, ratio_per_tree=0.7)
        forest.fit(Xtrain, ytrain)
        # run logistic regression on the features plus a leading bias column
        Xtrain_bias = np.column_stack((np.ones(Xtrain.shape[0]), Xtrain))
        beta, feature_means = gradient_descent(Xtrain_bias, ytrain, step_size=1e-2, max_steps=100)
        # NOTE(review): dividing each weight by the matching entry of
        # feature_means presumably undoes a feature scaling performed inside
        # gradient_descent -- confirm against logistic_regression.
        for i in range(len(beta)):
            beta[i] /= feature_means[i]

        # output predictions on the held-out data
        y_pred_tree = tree.predict(Xtest)
        y_pred_forest = forest.predict(Xtest)
        Xtest_bias = np.column_stack((np.ones(Xtest.shape[0]), Xtest))
        # a positive linear score is classified as 1, anything else as 0
        y_pred_log = [1 if row.dot(beta) > 0 else 0 for row in Xtest_bias]

        # compute the test accuracy of each model
        tree_accuracies.append(accuracy_score(ytest, y_pred_tree))
        forest_accuracies.append(accuracy_score(ytest, y_pred_forest))
        logistic_accuracies.append(accuracy_score(ytest, y_pred_log))

    # one row per model: column 0 is the mean, column 1 the std deviation
    stats = np.zeros((3, 2))
    per_model = (tree_accuracies, forest_accuracies, logistic_accuracies)
    for row, accuracies in enumerate(per_model):
        stats[row, 0] = np.mean(accuracies)
        stats[row, 1] = np.std(accuracies)

    return stats



In [None]:
# Run the evaluation and report one line per model.
# Each row of `stats` is read as (mean accuracy, std deviation), in the
# same order as the labels below.
stats = evaluate_performance()
model_labels = (
    "Decision Tree Accuracy = ",
    "Random Forest Accuracy = ",
    "Logistic Reg. Accuracy = ",
)
for model_label, (mean_accuracy, std_accuracy) in zip(model_labels, stats):
    print(model_label, mean_accuracy, " (", std_accuracy, ")")