In [1]:
from csv import reader
from collections import defaultdict 

import sys
import os
import time

# Make the project's helper packages importable from this notebook.
# The notebook's parent directory is resolved to an absolute path, and its
# "utils" and "models" sub-directories are added to sys.path.
module_path = os.path.abspath(os.path.join('..'))
for package_dir in ("utils", "models"):
    # os.path.join picks the right separator for the host OS; a hard-coded
    # backslash only yields a valid path on Windows.
    package_path = os.path.join(module_path, package_dir)
    # Check the exact path being appended so re-running the cell does not
    # keep growing sys.path with duplicates.
    if package_path not in sys.path:
        sys.path.append(package_path)

import vectorizer
from decision_tree import BinaryDecisionTree
import evaluation

In [2]:
# Load the parsed CSV rows and derive the vocabulary plus a token -> index map.
rows = vectorizer.parse_and_return_rows('../utils/data/processed_data/dev.csv')
vocabulary, vocabulary_length = vectorizer.return_len_and_vocabulary(rows)
word_to_index = vectorizer.create_token_index(vocabulary)

# Each row yields two labelled texts: columns 1 and 2 joined with one of the two
# hypothesis columns (3, then 4). The label is 1 when column 5 names that
# hypothesis ('1' or '2' respectively) and 0 otherwise. All first-hypothesis
# instances come first, followed by all second-hypothesis instances.
training_instances = [
    (" ".join((row[1], row[2], row[hypothesis_column])), int(row[5] == answer_tag))
    for hypothesis_column, answer_tag in ((3, '1'), (4, '2'))
    for row in rows
]

# Replace every text with its sparse incidence vector, keeping the label attached.
vectorized_training_instances = [
    (vectorizer.sparse_incidence_vector(instance_text, word_to_index), instance_label)
    for instance_text, instance_label in training_instances
]

# Transpose the (vector, label) pairs into parallel lists: x = vectors, y = labels.
x, y = (list(column) for column in zip(*vectorized_training_instances))

In [3]:
# Load the rows used for evaluation.
# NOTE(review): this is the same dev.csv path the training cell reads, so the
# reported accuracy is measured on the training data — confirm this is intended.
test_rows = vectorizer.parse_and_return_rows('../utils/data/processed_data/dev.csv')

# Each row yields two labelled test texts: columns 1 and 2 joined with one of the
# two hypothesis columns (3, then 4). The label is 1 when column 5 names that
# hypothesis ('1' or '2' respectively) and 0 otherwise. All first-hypothesis
# instances come first, followed by all second-hypothesis instances.
test_instances = [
    (" ".join((row[1], row[2], row[hypothesis_column])), int(row[5] == answer_tag))
    for hypothesis_column, answer_tag in ((3, '1'), (4, '2'))
    for row in test_rows
]

# Vectorize with the token index built in the training cell (word_to_index).
vectorized_test_instances = [
    (vectorizer.sparse_incidence_vector(instance_text, word_to_index), instance_label)
    for instance_text, instance_label in test_instances
]

# Transpose the (vector, label) pairs into parallel lists: x_test = vectors, y_test = labels.
x_test, y_test = (list(column) for column in zip(*vectorized_test_instances))

In [4]:
def do_experiment(max_depth, subset_size):
    """Grow a BinaryDecisionTree step by step, reporting accuracy after each step.

    The tree is initialised on the notebook-level training data (``x``, ``y``).
    After every call to ``expand_tree`` the tree is evaluated on the
    notebook-level test data (``x_test``, ``y_test``) and one progress line is
    printed with the current depth, node count, accuracy and the time spent in
    that expansion step.

    Args:
        max_depth: Stop once ``decision_tree.current_depth`` reaches this value.
            ``None`` disables the depth cap.
        subset_size: Forwarded unchanged to ``decision_tree.expand_tree``
            (its meaning is defined by BinaryDecisionTree, not by this function).

    Returns:
        The BinaryDecisionTree, whether growth stopped because ``expand_tree``
        returned a falsy value or because ``max_depth`` was reached.
    """
    decision_tree = BinaryDecisionTree()
    decision_tree.initialize_training(x, y)

    can_keep_expanding = True
    while can_keep_expanding:
        # Leave the loop (rather than returning None) when the depth cap is hit,
        # so the caller still receives the tree built so far.
        if max_depth is not None and decision_tree.current_depth >= max_depth:
            break

        # Time only the expansion step; perf_counter is monotonic, so the
        # difference cannot go negative if the system clock is adjusted.
        start = time.perf_counter()
        can_keep_expanding = decision_tree.expand_tree(subset_size)
        end = time.perf_counter()

        predictions = [decision_tree.predict_class(instance) for instance in x_test]

        accuracy = evaluation.calculate_accuracy(predictions, y_test)
        print("Depth: ", decision_tree.current_depth, "Total nodes: ", decision_tree.total_nodes, "Current accuracy: ", accuracy, "Time taken: ", end - start)

    return decision_tree

In [None]:
# Experiment: depth cap of 100 with subset_size=None.
# subset_size is passed straight through to BinaryDecisionTree.expand_tree;
# presumably None means "no subsetting" — TODO confirm against the model code.
tree = do_experiment(max_depth=100, subset_size=None)

In [None]:
# Experiment: depth cap of 100 with subset_size=100
# (subset_size is passed straight through to BinaryDecisionTree.expand_tree).
tree = do_experiment(max_depth=100, subset_size=100)

In [5]:
# Experiment: depth cap of 100 with subset_size=2
# (subset_size is passed straight through to BinaryDecisionTree.expand_tree).
# NOTE(review): the recorded output of this cell is
# "TypeError: __init__() takes 2 positional arguments but 3 were given";
# the failing constructor call is not visible in this notebook — presumably it
# is inside the decision_tree module. Verify before relying on this run.
tree = do_experiment(max_depth=100, subset_size=2)

TypeError: __init__() takes 2 positional arguments but 3 were given