In [7]:
from id3 import *
from imdbDataSet import *
import numpy as np
import random
import sys


# Hyperparameter definitions
# Raise the interpreter recursion limit to 3000 -- presumably needed because the
# ID3 tree is built recursively (TODO confirm against the id3 module).
sys.setrecursionlimit(3000)

train_n = 5000  # number of training examples kept (slice applied to the raw training data)
test_n = 10000  # number of test examples kept (slice applied to the raw test data)
tree_number = 10  # number of ID3 trees in the forest
fv_skip_top = 5000  # forwarded as skip_top to the IMDB helper calls
fv_length = 500  # forwarded as num_words to the IMDB helper calls
tree_fv_length = 300  # number of features randomly sampled for each tree

# Load the IMDB data through the project helper, forwarding the feature-vector
# hyperparameters as skip_top / num_words.
imdb = IMDB()
(x_train_raw, y_train), (x_test_raw, y_test) = imdb.getTrainingData(
    skip_top=fv_skip_top,
    num_words=fv_length,
)

# Keep only the first train_n training examples together with their labels.
x_train, y_train = x_train_raw[:train_n], y_train[:train_n]
print("Training Examples Number: ", str(len(x_train)))

# Keep only the first test_n test examples together with their labels.
x_test, y_test = x_test_raw[:test_n], y_test[:test_n]
print("Testing Examples Number: ", str(len(x_test)))


# Encoded feature vector (word indexes) shared by the whole ensemble.
encoded_feature_vector = imdb.getFeatureVector(
    num_words=fv_length,
    skip_top=fv_skip_top,
)
print("Encoded Feature Vector: ", encoded_feature_vector)

# Every tree works on its own, smaller feature vector: tree_fv_length attributes
# drawn at random (without replacement) from the full feature vector.
encoded_tree_feature_vectors = [
    random.sample(encoded_feature_vector, tree_fv_length)
    for _ in range(tree_number)
]


def _encode_examples(examples, feature_vector):
    """Return a 2D 0-1 matrix marking which feature words occur in each example.

    Args:
        examples: sequence of examples, each an iterable of encoded words.
        feature_vector: sequence of encoded words; column j of the result
            corresponds to feature_vector[j].

    Returns:
        Float array of shape (len(examples), len(feature_vector)) whose entry
        [i, j] is 1 when feature_vector[j] appears in examples[i], else 0.
    """
    # Build the word -> column mapping once so each word costs one O(1) lookup
    # instead of two linear scans of the feature list. setdefault keeps the
    # first position of a repeated word, matching list.index().
    column_of = {}
    for column, word in enumerate(feature_vector):
        column_of.setdefault(word, column)

    # Allocate the final 2D matrix directly; no per-example arrays to stack.
    encoded = np.zeros((len(examples), len(feature_vector)))
    for row, example in enumerate(examples):
        for word in example:
            column = column_of.get(word)
            if column is not None:
                encoded[row, column] = 1
    return encoded


# 0-1 feature matrix for the training examples (one row per example)
train_examples = _encode_examples(x_train, encoded_feature_vector)

# 0-1 feature matrix for the test examples (one row per example)
test_examples = _encode_examples(x_test, encoded_feature_vector)


# Build the ID3 forest for the ensemble: one tree per sampled feature vector,
# each fitted on the full set of encoded training examples.
forest = []
for tree_idx in range(tree_number):
    tree_features = encoded_tree_feature_vectors[tree_idx]
    id3_tree = ID3(features=tree_features)
    id3_tree.fit(np.array(train_examples), np.array(y_train))
    forest.append(id3_tree)


# Ask every tree for its predictions on the test set; row t of the matrix
# stores the predictions of tree t, one column per test example.
all_tree_predictions = np.zeros((tree_number, len(y_test)))
for tree_idx in range(tree_number):
    all_tree_predictions[tree_idx] = forest[tree_idx].predict(test_examples)

# Total the predictions per test example by adding one tree's row at a time.
tree_votes = np.zeros(len(y_test))
for tree_idx in range(len(forest)):
    tree_votes += all_tree_predictions[tree_idx]



# Majority vote for each test example.
# tree_votes[i] is the sum of the trees' predictions for example i, so with
# 0/1 predictions it is the number of trees that predicted the positive class.
majority_outcome = np.zeros(len(test_examples))
n_trees = len(forest)

# Visit every test example (one entry of tree_votes each). Looping over the
# rows of all_tree_predictions would only cover one index per tree and leave
# every other example at its default of 0.
for i, positive_votes in enumerate(tree_votes):
    # Compare doubled votes with the tree count: same test as share > 0.5,
    # without a float division.
    if 2 * positive_votes > n_trees:
        # majority predicted positive outcome (1)
        majority_outcome[i] = 1
    elif 2 * positive_votes < n_trees:
        # majority predicted negative outcome (0)
        majority_outcome[i] = 0
    else:
        # no majority exists, both outcomes equally predicted -> pick 0 or 1 at random
        majority_outcome[i] = random.randint(0, 1)

# Build the errors vector: one entry per test example, 0 when the ensemble's
# prediction matches the label and 1 when it does not.
n_test_examples = len(x_test)
errors = np.zeros(n_test_examples)
for idx in range(n_test_examples):
    expected = y_test[idx]
    predicted = majority_outcome[idx]
    # The absolute difference is the distance between label and prediction;
    # a non-zero distance means the prediction is wrong.
    errors[idx] = abs(expected - predicted)

# Print expected vs. predicted labels for the first n test examples
# (n is set to the full test-set size, so every example is shown).
n = len(x_test)
expected_shown = y_test[:n]
predicted_shown = majority_outcome[:n]
print("Showing the first ", n, " expected answers:")
print(expected_shown)
print("Showing the first ", n, " predicted answers:")
print(predicted_shown)

# Error rate: misclassified examples divided by the total (a fraction, not x100).
error_rate = sum(errors) / float(len(errors))
print("Percentage of error is: ", error_rate)


Training Examples Number:  5000
Testing Examples Number:  10000
Encoded Feature Vector:  [5001, 5002, 5003, 5004, 5005, 5006, 5007, 5008, 5009, 5010, 5011, 5012, 5013, 5014, 5015, 5016, 5017, 5018, 5019, 5020, 5021, 5022, 5023, 5024, 5025, 5026, 5027, 5028, 5029, 5030, 5031, 5032, 5033, 5034, 5035, 5036, 5037, 5038, 5039, 5040, 5041, 5042, 5043, 5044, 5045, 5046, 5047, 5048, 5049, 5050, 5051, 5052, 5053, 5054, 5055, 5056, 5057, 5058, 5059, 5060, 5061, 5062, 5063, 5064, 5065, 5066, 5067, 5068, 5069, 5070, 5071, 5072, 5073, 5074, 5075, 5076, 5077, 5078, 5079, 5080, 5081, 5082, 5083, 5084, 5085, 5086, 5087, 5088, 5089, 5090, 5091, 5092, 5093, 5094, 5095, 5096, 5097, 5098, 5099, 5100, 5101, 5102, 5103, 5104, 5105, 5106, 5107, 5108, 5109, 5110, 5111, 5112, 5113, 5114, 5115, 5116, 5117, 5118, 5119, 5120, 5121, 5122, 5123, 5124, 5125, 5126, 5127, 5128, 5129, 5130, 5131, 5132, 5133, 5134, 5135, 5136, 5137, 5138, 5139, 5140, 5141, 5142, 5143, 5144, 5145, 5146, 5147, 5148, 5149, 5150, 5151, 5152