In [11]:
from csv import reader
from collections import defaultdict 

import sys
import os
import time

# Make the sibling `utils` and `models` directories of the parent folder importable,
# so the local modules imported below (vectorizer, decision_tree, evaluation) resolve.
module_path = os.path.abspath(os.path.join('..'))
for _subdir in ("utils", "models"):
    # os.path.join picks the right separator on every OS (a literal "\\" only works on Windows).
    _import_dir = os.path.join(module_path, _subdir)
    # Check the directory that is actually appended, so re-running this cell
    # does not keep adding duplicate entries to sys.path.
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

import vectorizer
from decision_tree import BinaryDecisionTree
import evaluation

In [12]:
def features1(rows, word_to_index):
    """Build two labelled instances per row from the concatenated text fields.

    For each row, fields 1 and 2 are joined (space separated) with one
    candidate hypothesis - field 3 for the first pass, field 4 for the second -
    and the resulting string is vectorised with
    ``vectorizer.sparse_incidence_vector``. An instance is labelled 1 when
    field 5 equals the hypothesis number (``'1'`` or ``'2'``) and 0 otherwise.

    Args:
        rows: re-iterable collection of records whose positions 1-5 hold strings.
        word_to_index: token index forwarded unchanged to the vectorizer.

    Returns:
        ``(x, y)``: ``x`` is the list of vectors and ``y`` the list of 0/1
        labels. All first-hypothesis instances come before all
        second-hypothesis instances. Both lists are empty when ``rows`` is
        empty (the previous zip-based unpacking raised ValueError there).
    """
    instances = []
    # (column holding the hypothesis text, value of field 5 that marks it as correct)
    for hypothesis_column, correct_answer in ((3, '1'), (4, '2')):
        instances += [
            (row[1] + " " + row[2] + " " + row[hypothesis_column],
             1 if row[5] == correct_answer else 0)
            for row in rows
        ]

    # Fill the two output lists directly instead of unzipping a list of pairs:
    # unzipping an empty list yields nothing to unpack.
    x, y = [], []
    for text, label in instances:
        x.append(vectorizer.sparse_incidence_vector(text, word_to_index))
        y.append(label)

    return x, y

def features2(rows, word_to_index):
    """Build two labelled instances per row from observation/hypothesis overlap.

    For each row, fields 1 and 2 are joined (space separated) into the
    observation text, and field 3 (first pass) or field 4 (second pass) is the
    hypothesis text. Both texts are vectorised separately with
    ``vectorizer.sparse_incidence_vector`` and the feature vector is
    ``observation_vector.intersection(hypothesis_vector)``, so the vectorizer
    result must provide ``intersection`` (e.g. a set - confirm against the
    vectorizer module). An instance is labelled 1 when field 5 equals the
    hypothesis number (``'1'`` or ``'2'``) and 0 otherwise.

    Args:
        rows: re-iterable collection of records whose positions 1-5 hold strings.
        word_to_index: token index forwarded unchanged to the vectorizer.

    Returns:
        ``(x, y)``: ``x`` is the list of intersected vectors and ``y`` the list
        of 0/1 labels. All first-hypothesis instances come before all
        second-hypothesis instances. Both lists are empty when ``rows`` is
        empty (the previous zip-based unpacking raised ValueError there).
    """
    instances = []
    # (column holding the hypothesis text, value of field 5 that marks it as correct)
    for hypothesis_column, correct_answer in ((3, '1'), (4, '2')):
        instances += [
            (row[1] + " " + row[2], row[hypothesis_column],
             1 if row[5] == correct_answer else 0)
            for row in rows
        ]

    # Fill the two output lists directly instead of unzipping a list of pairs:
    # unzipping an empty list yields nothing to unpack.
    x, y = [], []
    for observations, hypothesis, label in instances:
        observation_vector = vectorizer.sparse_incidence_vector(observations, word_to_index)
        hypothesis_vector = vectorizer.sparse_incidence_vector(hypothesis, word_to_index)
        x.append(observation_vector.intersection(hypothesis_vector))
        y.append(label)

    return x, y

In [13]:
# Load the training rows, then derive the vocabulary and the token index from them.
rows = vectorizer.parse_and_return_rows('../utils/data/processed_data/train.csv')
# NOTE(review): the helper's name suggests a (length, vocabulary) return order, but the
# result is unpacked as (vocabulary, length) - confirm against the vectorizer module.
vocabulary, vocabulary_length = vectorizer.return_len_and_vocabulary(rows)
word_to_index = vectorizer.create_token_index(vocabulary)
# Training features/labels use features2 (observation/hypothesis vector intersection).
# `x` and `y` are read as notebook-level globals by train_tree and do_experiment.
x, y = features2(rows, word_to_index)

In [14]:
# Evaluation data: despite the `test_` names, these rows are read from dev.csv.
test_rows = vectorizer.parse_and_return_rows('../utils/data/processed_data/dev.csv')

# Vectorise with the token index built from the training rows, using the same
# feature function as the training set. `x_test` / `y_test` are read as
# notebook-level globals by train_tree and do_experiment.
x_test, y_test = features2(test_rows, word_to_index)

In [21]:
def train_tree(max_depth, subset_size, num_threads=1, print_logs=True, decision_tree=None, logs=None, accuracy_frequency=10):
    """Grow a decision tree layer by layer, recording timing for every layer.

    Training data comes from the notebook-level ``x`` / ``y``; the periodic
    accuracy report additionally reads ``x_test`` / ``y_test``.

    Args:
        max_depth: stop as soon as ``decision_tree.current_depth`` reaches this
            value; ``None`` means no depth limit.
        subset_size: forwarded to ``decision_tree.expand_tree``.
        num_threads: forwarded to ``decision_tree.expand_tree``.
        print_logs: when true, print one timing line per expanded layer.
        decision_tree: tree to train; a new ``BinaryDecisionTree`` is created
            when ``None``. ``initialize_training(x, y)`` is called on it in
            either case.
        logs: list that receives one
            ``(depth, total_nodes, layer_time, total_time)`` tuple per layer.
            A fresh list is created per call when ``None`` (a ``[]`` default
            would be shared by every call and accumulate entries across runs).
        accuracy_frequency: print dev and train accuracy whenever the current
            depth is a multiple of this value; ``0`` or ``None`` disables the
            report. The report is printed regardless of ``print_logs``.

    Returns:
        ``(logs, decision_tree)``.
    """
    # Started before initialisation so "total time" includes the setup cost.
    start = time.time()

    if logs is None:
        logs = []
    if decision_tree is None:
        decision_tree = BinaryDecisionTree()
    decision_tree.initialize_training(x, y)

    can_keep_expanding = True

    while can_keep_expanding:
        if max_depth is not None and decision_tree.current_depth >= max_depth:
            break

        one_layer_start = time.time()
        # expand_tree's return value drives the loop: a falsy result ends training.
        can_keep_expanding = decision_tree.expand_tree(subset_size, num_threads)
        one_layer_end = time.time()

        layer_time = one_layer_end - one_layer_start
        total_time = one_layer_end - start
        if print_logs:
            print("Depth: ", decision_tree.current_depth, "Total nodes: ", decision_tree.total_nodes, "Time taken on layer: ", layer_time, "Total time taken: ", total_time)
        logs.append((decision_tree.current_depth, decision_tree.total_nodes, layer_time, total_time))

        # Guard against a zero/None frequency, which would otherwise raise on the modulo.
        if accuracy_frequency and decision_tree.current_depth % accuracy_frequency == 0:
            print("dev: ", calculate_accuracy(decision_tree, x_test, y_test))
            print("train: ", calculate_accuracy(decision_tree, x, y))

    return logs, decision_tree

def calculate_accuracy(decision_tree, x, y):
    """Score the tree's predictions for the instances ``x`` against the labels ``y``.

    Each instance is classified with ``decision_tree.predict_class``; the list
    of predictions and ``y`` are then passed to
    ``evaluation.calculate_accuracy``, whose result is returned unchanged.
    """
    predicted_labels = [decision_tree.predict_class(sample) for sample in x]
    return evaluation.calculate_accuracy(predicted_labels, y)

def calculate_accuracy_at_all_depths(decision_tree, x, y):
    """Return a list with one accuracy value per group of transposed predictions.

    ``decision_tree.predict_classes_at_all_depths(x)`` is transposed, and every
    resulting list is scored against ``y`` with
    ``evaluation.calculate_accuracy``.
    """
    per_instance_predictions = decision_tree.predict_classes_at_all_depths(x)

    # Transpose so that each entry groups the predictions sharing one position
    # (presumably one tree depth - confirm against the tree implementation).
    per_depth_predictions = [list(column) for column in zip(*per_instance_predictions)]

    return [
        evaluation.calculate_accuracy(depth_predictions, y)
        for depth_predictions in per_depth_predictions
    ]

def save_results(logs, train_accuracies, dev_accuracies, file_name):
    """Write per-layer training logs and accuracies to ``file_name`` as CSV.

    The file is opened in ``"w"`` mode (existing content is replaced) and gets a
    header line followed by one line per entry of ``logs``.

    Args:
        logs: sequence of records whose first four items are
            depth, total nodes, time for the layer and total time.
        train_accuracies: training accuracy for each entry of ``logs``,
            matched by position.
        dev_accuracies: dev accuracy for each entry of ``logs``, matched by
            position.
        file_name: path of the CSV file to write.

    Raises:
        IndexError: if either accuracy list has fewer entries than ``logs``.
    """
    # The context manager closes the file even when a row fails to format or an
    # accuracy list is too short; a bare open()/close() pair leaked the handle then.
    with open(file_name, "w") as file:
        file.write("acc_train,acc_dev,depth,nodes,time_for_layer,total_time\n")
        for i, log in enumerate(logs):
            fields = (train_accuracies[i], dev_accuracies[i], log[0], log[1], log[2], log[3])
            file.write("%s,%s,%s,%s,%s,%s\n" % fields)

def do_experiment(max_depth, subset_size, result_file_name, num_threads=1, print_logs=True, accuracy_frequency=10, tree=None, logs=None):
    """Train a tree, evaluate it at every depth and save the results as CSV.

    Training is delegated to ``train_tree``; accuracies are computed with
    ``calculate_accuracy_at_all_depths`` on the notebook-level ``x`` / ``y``
    (train) and ``x_test`` / ``y_test`` (dev); the combined table is written
    with ``save_results``.

    Args:
        max_depth: depth limit forwarded to ``train_tree``.
        subset_size: forwarded to ``train_tree``.
        result_file_name: path of the CSV file passed to ``save_results``.
        num_threads: forwarded to ``train_tree``.
        print_logs: forwarded to ``train_tree``.
        accuracy_frequency: forwarded to ``train_tree``.
        tree: tree to train, forwarded to ``train_tree`` (``None`` lets
            ``train_tree`` create one).
        logs: list that ``train_tree`` appends its per-layer records to. A fresh
            list is created per call when ``None`` (a ``[]`` default would be
            shared by every call, so later experiments would inherit and
            re-save the rows of earlier ones).

    Returns:
        ``(logs, tree)`` as returned by ``train_tree``.
    """
    if logs is None:
        logs = []

    logs, tree = train_tree(max_depth, subset_size, num_threads, print_logs, tree, logs, accuracy_frequency)

    print("Calculating accuracy at all depths...")
    start = time.time()
    train_accuracies = calculate_accuracy_at_all_depths(tree, x, y)
    dev_accuracies = calculate_accuracy_at_all_depths(tree, x_test, y_test)
    end = time.time()
    print("Time taken: ", end - start)
    print("Final train accuracy: ", train_accuracies[-1])
    print("Final dev accuracy: ", dev_accuracies[-1])

    save_results(logs, train_accuracies, dev_accuracies, result_file_name)
    return logs, tree

In [22]:
# Create the tree and the log list here and pass them in explicitly, so the notebook
# keeps references to both; the result-saving cell further down reads `tree` and `logs`.
tree = BinaryDecisionTree()
logs = []
# Destination CSV for this experiment's per-depth results.
file_name = "e15_subset5000.csv"
# max_depth=10000 presumably acts as "no practical limit" - training then ends when
# expand_tree reports that the tree cannot grow further (see train_tree).
logs, tree = do_experiment(max_depth=10000, subset_size=5000, num_threads=5, result_file_name=file_name, accuracy_frequency=10, tree=tree, logs=logs)

Depth:  1 Total nodes:  1 Time taken on layer:  3.4729671478271484 Total time taken:  3.507969856262207
Depth:  2 Total nodes:  3 Time taken on layer:  4.172904968261719 Total time taken:  7.6818718910217285
Depth:  3 Total nodes:  7 Time taken on layer:  4.1800689697265625 Total time taken:  11.861940860748291
Depth:  4 Total nodes:  13 Time taken on layer:  4.4804301261901855 Total time taken:  16.343372344970703
Depth:  5 Total nodes:  21 Time taken on layer:  5.76399827003479 Total time taken:  22.107370615005493
Depth:  6 Total nodes:  31 Time taken on layer:  6.451127767562866 Total time taken:  28.559501886367798
Depth:  7 Total nodes:  43 Time taken on layer:  5.135952949523926 Total time taken:  33.695454835891724
Depth:  8 Total nodes:  61 Time taken on layer:  5.071433067321777 Total time taken:  38.767887115478516
Depth:  9 Total nodes:  81 Time taken on layer:  7.344165086746216 Total time taken:  46.11305284500122
Depth:  10 Total nodes:  105 Time taken on layer:  7.79045

In [13]:
# Use this to save the results of a run that was cancelled early.
# It repeats the evaluation-and-save tail of do_experiment on the notebook-level
# `tree` and `logs` that were passed into the interrupted call.
print("Calculating accuracy at all depths...")
start = time.time()
train_accuracies = calculate_accuracy_at_all_depths(tree, x, y)
dev_accuracies = calculate_accuracy_at_all_depths(tree, x_test, y_test)
end = time.time()
print("Time taken: ", end - start)
print("Final train accuracy: ", train_accuracies[-1])
print("Final dev accuracy: ", dev_accuracies[-1])
# `result_file_name` exists only as a parameter inside do_experiment; at notebook level
# the output path is stored in `file_name`, so using the parameter name here raised
# NameError after all the accuracy work had already been done.
save_results(logs, train_accuracies, dev_accuracies, file_name)


Calculating accuracy at all depths...
Time taken:  0.8800668716430664
Final accuracy:  49.90208877284596
