In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from datetime import datetime

Using TensorFlow backend.


In [2]:
# print the timestamp and a given message
def log(txt):
    print(datetime.utcnow(), txt)

In [3]:
# find unigram counts of all words in positive and negative reviews
# splits the given data into n sections
def split_n(training_data, n):
    sections = [[] * n]
    i = 0
    for sentence in training_data:
        sections[i].append(sentence)
        i = (i + 1) % n
    return sections

# builds the vocabulary based off given training data
def build_vocab(training_data):
    # unigram_dict = {'<UNK>' : 0}
    vocabulary = []
    for sentence in training_data:
        for word in sentence:
            if word not in vocabulary:
                vocabulary.append(word)
    # vocabulary = list(set([word for word in sentence for sentence in training_data]))
    vocabulary.sort()
    return vocabulary

In [4]:
# read the positive and negative reviews into one-hot encodings
pos_file = open('train_positive_reviews.txt', "rt")
neg_file = open('train_negative_reviews.txt', "rt")
log("opened files")
pos_data = [sentence.lower().split() for sentence in pos_file.readlines()]
neg_data = [sentence.lower().split() for sentence in neg_file.readlines()]
log("read pos neg data")
full_data = pos_data + neg_data
Y = np.array([1] * len(pos_data) + [0] * len(neg_data))
log("created full data and Y")

full_vocab = build_vocab(full_data)
log("built full vocab")

log('building one-hot array')
hot_ones = np.array([np.array([1 if word in sentence else 0 for word in full_vocab]) for sentence in full_data])
log("one-hot done")

2020-02-28 23:20:11.655496 opened files
2020-02-28 23:20:11.684419 read pos neg data
2020-02-28 23:20:11.685416 created full data and Y
2020-02-28 23:20:18.740995 built full vocab
2020-02-28 23:20:18.740995 building one-hot array
2020-02-28 23:21:29.702297 one-hot done


In [5]:
# create the a model with the given number of weights
def make_model(nodes, input_len):
    return Sequential([
        Dense(nodes, input_dim=input_len, activation='relu'),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid')])
nodes = [100, 200, 300]

In [6]:
# Q3.1 do the 10-fold cross validation and store the results for each model
# and store the results in 'q3_1_results'
q3_1_results = []
for model_num in range(len(nodes)):
    q3_1_results.append([])
    for i in range(10):
        training_x = []; training_y = []
        test_x = []; test_y = []
        log("Splitting test-training data (sentence #{} to test set)".format(i+1))
        for j in range(len(hot_ones)):
            if i == (j % 10):
                # add when the counter values coincide, add the hot-ones for this sentence to the test set
                test_x.append(np.array(hot_ones[j])); test_y.append(Y[j])
            else:
                # add the hot-ones for this sentence to the training set
                training_x.append(np.array(hot_ones[j])); training_y.append(Y[j])

        log("Creating model"); q3_1_model = make_model(nodes[model_num], len(full_vocab))

        log("Compiling model"); q3_1_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        log("Fitting model"); q3_1_model.fit(np.array(training_x), np.array(training_y), epochs=5, batch_size=128)

        log("Evaluating model"); _, accuracy = q3_1_model.evaluate(np.array(test_x), np.array(test_y))
        q3_1_results[model_num].append(accuracy)

        log("Done evaluating, accuracy={}".format(q3_1_results[model_num][i]))

 0.9602
Epoch 4/5
Epoch 5/5
2020-02-28 23:24:33.216871 Evaluating model
2020-02-28 23:24:33.772386 Done evaluating, accuracy=0.7378752827644348
2020-02-28 23:24:33.931959 Splitting test-training data (sentence #9 to test set)
2020-02-28 23:24:34.549306 Creating model
2020-02-28 23:24:34.596213 Compiling model
2020-02-28 23:24:34.648077 Fitting model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
2020-02-28 23:24:58.703254 Evaluating model
2020-02-28 23:24:59.282755 Done evaluating, accuracy=0.7632794380187988
2020-02-28 23:24:59.442324 Splitting test-training data (sentence #10 to test set)
2020-02-28 23:24:59.910431 Creating model
2020-02-28 23:24:59.962934 Compiling model
2020-02-28 23:25:00.023771 Fitting model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
2020-02-28 23:25:22.637194 Evaluating model
2020-02-28 23:25:23.237588 Done evaluating, accuracy=0.7621247172355652
2020-02-28 23:25:23.389218 Splitting test-training data (sentence #1 to test set)
2020-02-28 23:25:23.84200

In [7]:
# write the results to the answer file
q3_1_results_file = open("Q3_1_Results.txt", "w")
q3_1_results_file.write("1.\tParameter chosen for optimizations was the number of nodes in hidden layer 1\n")
q3_1_results_file.write("2.\tnode counts: {}\n".format(nodes))
q3_1_results_avg = [np.average(q3_1_results[i]) for i in range(len(q3_1_results))]
max_node_count = nodes[list(q3_1_results_avg).index(np.max(q3_1_results_avg))]
for i in range(3):
    print(str(q3_1_results[i]))
    print("average for {} nodes in 1st hidden layer: {}".format(nodes[i], q3_1_results_avg[i]))
q3_1_results_file.write("3.\tAccuracy for Node Count:\n" +\
                        "\t\t{} nodes | avg. accuracy={}\n".format(nodes[0], q3_1_results_avg[0]) +\
                        "\t\t{} nodes | avg. accuracy={}\n".format(nodes[1], q3_1_results_avg[1]) +\
                        "\t\t{} nodes | avg. accuracy={}\n".format(nodes[2], q3_1_results_avg[2]))
q3_1_results_file.write("4.\tChosen node count is {} because it had the highest average accuracy in the 10-fold cross validation.\n".format(max_node_count))
q3_1_results_file.close()

[0.7739331126213074, 0.7301037907600403, 0.7647058963775635, 0.7474048733711243, 0.7667436599731445, 0.7609699964523315, 0.7528868317604065, 0.7378752827644348, 0.7632794380187988, 0.7621247172355652]
average for 100 nodes in 1st hidden layer: 0.7560027599334717
[0.7670127153396606, 0.7370242476463318, 0.7635524868965149, 0.7404844164848328, 0.7598152160644531, 0.7644341588020325, 0.7528868317604065, 0.7459584474563599, 0.7575057744979858, 0.7575057744979858]
average for 200 nodes in 1st hidden layer: 0.7546180069446564
[0.769319474697113, 0.7370242476463318, 0.7670127153396606, 0.7381775975227356, 0.7575057744979858, 0.7702078819274902, 0.7621247172355652, 0.7551963329315186, 0.7563510537147522, 0.7621247172355652]
average for 300 nodes in 1st hidden layer: 0.7575044512748719


In [8]:
# Q3.2 retrain the model on the entire training set and report the accuracy on the training set as an upper-bound for performance on test data

log("Creating model"); q3_2_model = make_model(max_node_count, len(full_vocab))

log("Compiling model"); q3_2_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

log("Fitting model"); q3_2_model.fit(np.array(hot_ones), np.array(Y), epochs=5, batch_size=128)

log("Evaluating model"); q3_2_loss, q3_2_accuracy = q3_2_model.evaluate(np.array(hot_ones), np.array(Y))

log("Done evaluating, accuracy={}".format(q3_2_accuracy))

q3_2_results_file = open("Q3_2_Results.txt", "w")
q3_2_results_file.write("Trainied model on full data with {} nodes\n".format(max_node_count))
q3_2_results_file.write("\taccuracy = {} ; loss = {}\n".format(q3_2_accuracy, q3_2_loss))
q3_2_results_file.close()

2020-02-28 23:39:08.737921 Creating model
2020-02-28 23:39:08.817706 Compiling model
2020-02-28 23:39:08.899490 Fitting model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
2020-02-28 23:39:58.416498 Evaluating model
2020-02-28 23:40:03.229133 Done evaluating, accuracy=0.9998846054077148


In [9]:
import gensim.models as gm
log("generating vord2vec model")
google_kv = gm.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
log("generated")

2020-02-28 23:40:03.878348 generating vord2vec model
2020-02-28 23:42:42.987589 generated


In [10]:
# a function that finds the average word vector for a sentence
def get_sentence_vector(sentence):
    vectors = []
    for word in sentence:
        try:
            vectors.append(google_kv.get_vector(word))
        except KeyError:
            vectors.append([0] * 300)
    vectors_ave = np.mean(vectors, axis=0)
    return vectors_ave

In [11]:
# read the positive and negative reviews into wprd2vec encodings
log('building word2vec array')
word2vec = []
i = 0
for sentence in full_data:
    sentence_vec = get_sentence_vector(sentence)
    if np.isnan(sentence_vec).any():
        sentence_vec = np.zeros(300)
    word2vec.append(sentence_vec)
    i += 1
log("word2vec done")

2020-02-28 23:42:43.014527 building word2vec array
2020-02-28 23:42:45.319394 word2vec done


In [12]:
# Q3.3 do the 10-fold cross validation and store the results for each model
# and store the results in 'q3_3_results'
q3_3_results = []
for model_num in range(len(nodes)):
    q3_3_results.append([])
    for i in range(10):
        training_x = []; training_y = []
        test_x = []; test_y = []
        log("Splitting test-training data (sentence #{} to test set)".format(i+1))
        for j in range(len(word2vec)):
            if i == (j % 10):
                # add when the counter values coincide, add the hot-ones for this sentence to the test set
                test_x.append(np.array(word2vec[j])); test_y.append(Y[j])
            else:
                # add the hot-ones for this sentence to the training set
                training_x.append(np.array(word2vec[j])); training_y.append(Y[j])

        log("Creating model"); q3_3_model = make_model(nodes[model_num], 300)

        log("Compiling model"); q3_3_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        log("Fitting model"); q3_3_model.fit(np.array(training_x), np.array(training_y), epochs=30, batch_size=128)

        log("Evaluating model"); _, accuracy = q3_3_model.evaluate(np.array(test_x), np.array(test_y))
        q3_3_results[model_num].append(accuracy)

        log("Done evaluating, accuracy={}".format(q3_3_results[model_num][i]))

Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2020-02-28 23:49:43.736072 Evaluating model
2020-02-28 23:49:45.218277 Done evaluating, accuracy=0.7655889391899109
2020-02-28 23:49:45.221269 Splitting test-training data (sentence #6 to test set)
2020-02-28 23:49:45.238223 Creating model
2020-02-28 23:49:45.289088 Compiling model
2020-02-28 23:49:45.348927 Fitting model
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2020-02-28 23:50:05.764416 Evaluating model
2020-02-28 23:50:0

In [13]:
# write the results to the answer file
q3_3_results_file = open("Q3_3_Results.txt", "w")
q3_3_results_file.write("1.\tParameter chosen for optimizations was the number of nodes in hidden layer 1\n")
q3_3_results_file.write("2.\tnode counts: {}\n".format(nodes))
q3_3_results_avg = [np.average(q3_3_results[i]) for i in range(len(q3_3_results))]
max_node_count = nodes[list(q3_3_results_avg).index(np.max(q3_3_results_avg))]
for i in range(3):
    print(str(q3_3_results[i]))
    print("average for {} nodes in 1st hidden layer: {}".format(nodes[i], q3_3_results_avg[i]))
q3_3_results_file.write("3.\tAccuracy for Node Count:\n" +\
                        "\t\t{} nodes | avg. accuracy={}\n".format(nodes[0], q3_3_results_avg[0]) +\
                        "\t\t{} nodes | avg. accuracy={}\n".format(nodes[1], q3_3_results_avg[1]) +\
                        "\t\t{} nodes | avg. accuracy={}\n".format(nodes[2], q3_3_results_avg[2]))
q3_3_results_file.write("4.\tChosen node count is {} because it had the highest average accuracy in the 10-fold cross validation.\n".format(max_node_count))

# retrain the model on the entire training set and report the accuracy on the training set as an upper-bound for performance on test data

log("Creating model"); q3_3_model = make_model(max_node_count, 300)

log("Compiling model"); q3_3_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

log("Fitting model"); q3_3_model.fit(np.array(word2vec), np.array(Y), epochs=30, batch_size=128)

log("Evaluating model"); q3_3_loss, q3_3_accuracy = q3_3_model.evaluate(np.array(word2vec), np.array(Y))

log("Done evaluating, accuracy={}".format(q3_3_accuracy))

q3_3_results_file.write("5.\tTrainied model on full data with {} nodes\n".format(max_node_count))
q3_3_results_file.write("\t\taccuracy = {} ; loss = {}\n".format(q3_3_accuracy, q3_3_loss))
q3_3_results_file.close()

[0.7623990774154663, 0.7589388489723206, 0.7658593058586121, 0.7681660652160645, 0.7621247172355652, 0.7909930944442749, 0.7436489462852478, 0.7956120371818542, 0.8002309203147888, 0.7886835932731628]
average for 100 nodes in 1st hidden layer: 0.7736656606197357
[0.754325270652771, 0.7531718611717224, 0.7739331126213074, 0.7658593058586121, 0.7632794380187988, 0.7806004881858826, 0.7436489462852478, 0.7621247172355652, 0.8025404214859009, 0.7678983807563782]
average for 200 nodes in 1st hidden layer: 0.7667381942272187
[0.7520184516906738, 0.7497116327285767, 0.769319474697113, 0.7658593058586121, 0.7655889391899109, 0.7782909870147705, 0.7482678890228271, 0.7725173234939575, 0.7921478152275085, 0.7736720442771912]
average for 300 nodes in 1st hidden layer: 0.7667393863201142
2020-02-28 23:51:40.358188 Creating model
2020-02-28 23:51:40.419026 Compiling model
2020-02-28 23:51:40.484852 Fitting model
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30

In [14]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# read the training doccuments in to a list of sentences
pos_file = open('train_positive_reviews.txt', "rt")
neg_file = open('train_negative_reviews.txt', "rt")
full_sents = [sentence.strip() for sentence in pos_file.readlines() + neg_file.readlines()]
log("created sentence list from files")

2020-02-28 23:52:03.923518 created sentence list from files


In [16]:
# fit the tfidf vectorizer over the sentences
tfidf_vectorizer = TfidfVectorizer()
full_tfidf = tfidf_vectorizer.fit_transform(full_sents)
full_tfidf.shape

(8664, 16675)

In [17]:
# set the parameters to optimize the SVD and neural net over:
nodes = [100, 200, 300]
component_counts = [200, 300, 400]

In [18]:
# Q3.4 PART 1 do the 10-fold cross validation and store the results for each SVD component size
# and store the results in 'q3_3_results'
q3_4_svd_results = []
for component_num in range(len(component_counts)):
    q3_4_svd_results.append([])
    for i in range(10):
        svd = TruncatedSVD(n_components=component_counts[component_num])
        svd_data = svd.fit_transform(full_tfidf)
        training_x = []; training_y = []
        test_x = []; test_y = []
        log("Splitting test-training data (sentence #{} to test set)".format(i+1))
        for j in range(len(svd_data)):
            if i == (j % 10):
                # add when the counter values coincide, add the hot-ones for this sentence to the test set
                test_x.append(np.array(svd_data[j])); test_y.append(Y[j])
            else:
                # add the hot-ones for this sentence to the training set
                training_x.append(np.array(svd_data[j])); training_y.append(Y[j])

        log("Creating model"); q3_4_model = make_model(100, component_counts[component_num])

        log("Compiling model"); q3_4_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        log("Fitting model"); q3_4_model.fit(np.array(training_x), np.array(training_y), epochs=30, batch_size=128)

        log("Evaluating model"); _, accuracy = q3_4_model.evaluate(np.array(test_x), np.array(test_y))
        q3_4_svd_results[component_num].append(accuracy)

        log("Done evaluating, accuracy={}".format(q3_4_svd_results[component_num][i]))

Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2020-02-29 00:04:19.976032 Evaluating model
2020-02-29 00:04:22.449480 Done evaluating, accuracy=0.7424942255020142
2020-02-29 00:04:27.166863 Splitting test-training data (sentence #6 to test set)
2020-02-29 00:04:27.192825 Creating model
2020-02-29 00:04:27.277565 Compiling model
2020-02-29 00:04:27.358382 Fitting model
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2020-02-29 00:04:56.009569 Evaluating model
2020-02-29 00:04:5

In [19]:
# write the results to the answer file
q3_4_results_file = open("Q3_4_Results.txt", "w")
q3_4_results_file.write("PART 1: SVD component size optimization\n")
q3_4_results_file.write("\t1.\tParameter chosen to optimize SVD was the number of components\n")
q3_4_results_file.write("\t2.\tcomponent counts: {}\n".format(str(component_counts)))
q3_4_svd_results_avg = [np.average(q3_4_svd_results[i]) for i in range(len(q3_4_svd_results))]
max_component_count = component_counts[list(q3_4_svd_results_avg).index(np.max(q3_4_svd_results_avg))]
for i in range(3):
    print(str(q3_4_svd_results[i]))
    print("average for {} components in SVD: {}".format(component_counts[i], q3_4_svd_results_avg[i]))
q3_4_results_file.write("\t3.\tAccuracy for Node Count:\n" +\
                        "\t\t\t{} components | avg. accuracy={}\n".format(component_counts[0], q3_4_svd_results_avg[0]) +\
                        "\t\t\t{} components | avg. accuracy={}\n".format(component_counts[1], q3_4_svd_results_avg[1]) +\
                        "\t\t\t{} components | avg. accuracy={}\n".format(component_counts[2], q3_4_svd_results_avg[2]))
q3_4_results_file.write("\t4.\tChosen component count is {} because it had the highest average accuracy in the 10-fold cross validation.\n".format(max_component_count))
q3_4_results_file.close()

[0.6828143000602722, 0.6828143000602722, 0.7104959487915039, 0.6758939027786255, 0.7217090129852295, 0.7205542922019958, 0.6974595785140991, 0.6547344326972961, 0.7043879628181458, 0.7043879628181458]
average for 200 components in SVD: 0.6955251693725586
[0.7139561772346497, 0.6816608905792236, 0.7301037907600403, 0.7393310070037842, 0.7401847839355469, 0.7321016192436218, 0.7136258482933044, 0.7078521847724915, 0.7170900702476501, 0.7297921180725098]
average for 300 components in SVD: 0.7205698490142822
[0.7243368029594421, 0.7035755515098572, 0.7381775975227356, 0.7358708381652832, 0.7424942255020142, 0.7205542922019958, 0.7251732349395752, 0.6997690796852112, 0.7321016192436218, 0.7355658411979675]
average for 400 components in SVD: 0.7257619082927704


In [20]:
# Q3.4 PART 2 do the 10-fold cross validation with optimal SVD components and store the results for 
# each model's nodes in layer 1 in 'q3_4_nn_results'
q3_4_nn_results = []
for node_num in range(len(nodes)):
    q3_4_nn_results.append([])
    for i in range(10):
        svd = TruncatedSVD(n_components=max_component_count)
        svd_data = svd.fit_transform(full_tfidf)
        training_x = []; training_y = []
        test_x = []; test_y = []
        log("Splitting test-training data (sentence #{} to test set)".format(i+1))
        for j in range(len(svd_data)):
            if i == (j % 10):
                # add when the counter values coincide, add the hot-ones for this sentence to the test set
                test_x.append(np.array(svd_data[j])); test_y.append(Y[j])
            else:
                # add the hot-ones for this sentence to the training set
                training_x.append(np.array(svd_data[j])); training_y.append(Y[j])

        log("Creating model"); q3_4_model = make_model(nodes[node_num], max_component_count)

        log("Compiling model"); q3_4_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        log("Fitting model"); q3_4_model.fit(np.array(training_x), np.array(training_y), epochs=30, batch_size=128)

        log("Evaluating model"); _, accuracy = q3_4_model.evaluate(np.array(test_x), np.array(test_y))
        q3_4_nn_results[node_num].append(accuracy)

        log("Done evaluating, accuracy={}".format(q3_4_nn_results[node_num][i]))

33us/step - loss: 0.2988 - accuracy: 0.8918
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2020-02-29 00:36:00.926139 Evaluating model
2020-02-29 00:36:04.818725 Done evaluating, accuracy=0.7667436599731445
2020-02-29 00:36:09.704176 Splitting test-training data (sentence #6 to test set)
2020-02-29 00:36:09.733099 Creating model
2020-02-29 00:36:09.804909 Compiling model
2020-02-29 00:36:09.888684 Fitting model
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2020-02-29 00:36:54.860910 Evaluating mod

In [21]:
# write the results to the answer file
q3_4_results_file = open("Q3_4_Results.txt", "a")
q3_4_results_file.write("PART 2: Neural net node count optimization\n")
q3_4_results_file.write("\t1.\tParameter chosen to optimize model was the number of nodes in 1st hidden layer\n")
q3_4_results_file.write("\t2.\tNode counts: {}\n".format(str(nodes)))
q3_4_nn_results_avg = [np.average(q3_4_nn_results[i]) for i in range(len(q3_4_nn_results))]
max_node_count = nodes[list(q3_4_nn_results_avg).index(np.max(q3_4_nn_results_avg))]
for i in range(3):
    print(str(q3_4_nn_results[i]))
    print("average for {} components in SVD: {}".format(nodes[i], q3_4_nn_results_avg[i]))
q3_4_results_file.write("\t3.\tAccuracy for Node Count:\n" +\
                        "\t\t\t{} nodes | avg. accuracy={}\n".format(nodes[0], q3_4_nn_results_avg[0]) +\
                        "\t\t\t{} nodes | avg. accuracy={}\n".format(nodes[1], q3_4_nn_results_avg[1]) +\
                        "\t\t\t{} nodes | avg. accuracy={}\n".format(nodes[2], q3_4_nn_results_avg[2]))
q3_4_results_file.write("\t4.\tChosen node count is {} because it had the highest average accuracy in the 10-fold cross validation.\n".format(max_node_count))
q3_4_results_file.close()

[0.7370242476463318, 0.7185697555541992, 0.722029983997345, 0.7497116327285767, 0.7621247172355652, 0.7690531015396118, 0.7309468984603882, 0.6974595785140991, 0.7586604952812195, 0.7355658411979675]
average for 100 components in SVD: 0.7381146252155304
[0.7162629961967468, 0.7185697555541992, 0.7197231650352478, 0.7277969717979431, 0.7736720442771912, 0.7436489462852478, 0.7321016192436218, 0.7066974639892578, 0.7459584474563599, 0.7551963329315186]
average for 200 components in SVD: 0.7339627742767334
[0.7254902124404907, 0.70126873254776, 0.7185697555541992, 0.7427912354469299, 0.7667436599731445, 0.7482678890228271, 0.7309468984603882, 0.7124711275100708, 0.7551963329315186, 0.7459584474563599]
average for 300 components in SVD: 0.7347704291343689


In [22]:
# retrain the model on the entire training set and report the accuracy on the training set as an upper-bound for performance on test data
svd = TruncatedSVD(n_components=max_component_count)
svd_data = svd.fit_transform(full_tfidf)

log("Creating model"); q3_4_model = make_model(max_node_count, max_component_count)

log("Compiling model"); q3_4_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

log("Fitting model"); q3_4_model.fit(np.array(svd_data), np.array(Y), epochs=30, batch_size=128)

log("Evaluating model"); q3_4_loss, q3_4_accuracy = q3_4_model.evaluate(np.array(svd_data), np.array(Y))

log("Done evaluating, accuracy={}".format(q3_4_accuracy))

q3_4_results_file = open("Q3_4_Results.txt", "a")
q3_4_results_file.write("PART 3: Upper bound accuracy\n")
q3_4_results_file.write("\t5.\tTrainied model on full data with {} nodes and {} SVD components\n".format(max_node_count, max_component_count))
q3_4_results_file.write("\t\t\taccuracy = {} ; loss = {}\n".format(q3_4_accuracy, q3_4_loss))
q3_4_results_file.close()

2020-02-29 00:40:55.117908 Creating model
2020-02-29 00:40:55.194703 Compiling model
2020-02-29 00:40:55.269633 Fitting model
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
2020-02-29 00:41:38.667811 Evaluating model
2020-02-29 00:41:43.912274 Done evaluating, accuracy=0.9960756897926331


In [23]:
# find the best of the three models and use it to annotate the test data
# with positive/negative predictions

testfile = open("test_reviews.txt", "r")
testfile_sents = [sentence.strip().lower() for sentence in testfile.readlines()]
testfile_data = [[word for word in sentence] for sentence in testfile_sents]
testfile.close()

q3_5_results_file = open("Q3_5_Results.txt", "w")
model_accuracies = [q3_2_accuracy, q3_3_accuracy, q3_4_accuracy]
best_model_num = list(model_accuracies).index(np.max(model_accuracies))
q3_5_results_file.write("Accuracies for Models:\n" +\
    "\t\t- One-Hot = {}\n\t\t- Word2Vec = {}\n\t\t- SVD = {}\n".format(q3_2_accuracy, q3_3_accuracy, q3_4_accuracy))

if best_model_num == 0:
    q3_5_results_file.write("One-Hot Encoding Used:\n")

    log('building one-hot test array')
    onehot_test = np.array([np.array([1 if word in sentence else 0 for word in full_vocab]) for sentence in testfile_data])
    log("one-hot test array done")

    raw_results = q3_2_model.predict(onehot_test)
elif best_model_num == 1:
    q3_5_results_file.write("Word2Vec Encoding Used:\n")

    log('building word2vec test array')
    word2vec_test = []
    for sentence in testfile_data:
        sentence_vec = get_sentence_vector(sentence)
        if np.isnan(sentence_vec).any():
            sentence_vec = np.zeros(300)
        word2vec.append(sentence_vec)
    log("word2vec test array done")

    raw_results = q3_3_model.predict(word2vec_test)
else:
    q3_5_results_file.write("SVD Used:\n")

    log('building svd test array')
    svd = TruncatedSVD(n_components=max_component_count)
    svd_data = svd.fit_transform(full_tfidf)
    log('svd test array done')

    raw_results = q3_4_model.predict(svd_test)

# print the results of the prediction to the file, and close
results = [1 if out>0.5 else 0 for out in raw_results]
for i in range(len(results)):
    q3_5_results_file.write("{} : {}\n".format(results[i], testfile_sents[i]))
q3_5_results_file.close()

2020-02-29 00:41:43.970119 building one-hot test array
2020-02-29 00:42:43.511047 one-hot test array done
