In [4]:
# Load the reviews and their sentiment labels; each file holds one entry per line.
# The context managers close the files even if reading raises, and
# rstrip('\n') removes only a trailing newline, so a final line that lacks one
# keeps its last character (slicing with [:-1] would have dropped it).
with open('reviews.txt', 'r') as g:
    reviews = [line.rstrip('\n') for line in g]

# Labels are lower-cased so they can be compared against 'positive' below.
with open('labels.txt', 'r') as g:
    labels = [line.rstrip('\n').lower() for line in g]

In [5]:
from collections import Counter
import numpy as np

In [6]:
# Word-frequency tallies: one per sentiment label plus one across all reviews.
positive_counts, negative_counts, total_counts = Counter(), Counter(), Counter()

In [7]:
# Tally every space-separated token of each review into the counter that
# matches the review's label, and into the overall counter.
for i, review in enumerate(reviews):
    # Any label other than 'positive' is counted as negative.
    label_counts = positive_counts if labels[i] == 'positive' else negative_counts
    for word in review.split(" "):
        label_counts[word] += 1
        total_counts[word] += 1

In [8]:
# Positive-to-negative usage ratio for every word seen more than 100 times.
pos_neg_ratios = Counter()

for term, cnt in total_counts.most_common():
    if cnt <= 100:
        continue
    # The +1 in the denominator avoids dividing by zero for words that
    # never appear in a negative review.
    pos_neg_ratio = positive_counts[term] / float(negative_counts[term] + 1)
    pos_neg_ratios[term] = pos_neg_ratio

In [9]:
# The vocabulary is the set of distinct tokens seen in any review.
vocab = set(total_counts)
vocab_size = len(vocab)
print("vocab size :", vocab_size)

vocab size : 74074


In [10]:
# Input layer: a single row with one column per vocabulary word.
layer_0 = np.zeros(shape=(1, vocab_size))
layer_0.shape

(1, 74074)

In [11]:
# Map every vocabulary word to a column position in the input layer.
# The positions follow the iteration order of the `vocab` set.
word_map_index = {}
for i,word in enumerate(vocab):
    word_map_index[word] = i

# NOTE: the loop variables `i` and `word` stay defined in the notebook
# namespace after this cell; the `word_map_index[word]` cell reads `word`.

In [12]:
# `word` is not defined in this cell: it is whatever the most recently executed
# loop left behind. When the cells run top to bottom that is the last word
# enumerated while building word_map_index, so this shows the highest index.
word_map_index[word]

74073

In [13]:
def update_input_layer(review):
    """Overwrite the global ``layer_0`` with the word counts of ``review``.

    The review is split on single spaces and, for every token, the column
    given by ``word_map_index`` is incremented by one. A token missing from
    ``word_map_index`` raises ``KeyError``.

    Args:
        review(str) - Text of the review to encode
    """
    global layer_0

    # Clear the counts left over from the previously encoded review.
    layer_0 *= 0

    # layer_0[0] is a view of the only row, so updates land in layer_0 itself.
    counts_row = layer_0[0]
    for token in review.split(" "):
        counts_row[word_map_index[token]] += 1

In [14]:
# Encode the second review (index 1) into layer_0 and display it.
# Each column holds how many times the corresponding vocabulary word occurs in
# that review.
# NOTE(review): the original note said " " appears 18 times in this review;
# the truncated array output does not show that value, so confirm before relying on it.
update_input_layer(reviews[1])
layer_0

array([[5., 0., 0., ..., 0., 0., 0.]])

In [15]:
def get_target_for_label(label):
    """Convert a label into the numeric training target.

    Args:
        label(str) - Label text; only the exact string 'positive' maps to 1

    Returns:
        1 for 'positive', 0 for anything else.
    """
    return 1 if label == 'positive' else 0

In [16]:
# Sanity check: convert the first label into its numeric target.
get_target_for_label(labels[0])

1

In [17]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    """Bag-of-words sentiment classifier with a linear hidden layer and a sigmoid output.

    The vocabulary is filtered by word frequency and by positive-to-negative
    polarity before the network is sized. Training performs one gradient
    descent step per review and only touches the input weights of words that
    occur in that review.
    """

    ## New for Project 6: added min_count and polarity_cutoff parameters
    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimentNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of labels associated with the given reviews; only the
                           exact string 'positive' is treated as the positive class
            min_count(int) - Words should only be added to the vocabulary
                             if they occur more than this many times
            polarity_cutoff(float) - For words that have a polarity score (see
                                     pre_process_data), the absolute value of that
                                     score must be at least this big to be considered.
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training

        """
        # Assign a seed to our random number generator to ensure we get
        # reproducible results during development
        np.random.seed(1)

        # Process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)

        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    ## New for Project 6: added min_count and polarity_cutoff parameters
    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        """Build the filtered vocabulary and the word/label index lookups.

        Sets self.review_vocab, self.label_vocab, their sizes, self.word2index
        and self.label2index.

        Args:
            reviews(list) - Review texts; tokens are obtained with split(" ")
            labels(list) - Labels aligned with reviews
            polarity_cutoff(float) - Minimum absolute polarity score for words that have one
            min_count(int) - A word must occur more than this many times to be kept
        """
        # Count how often each word appears in positive reviews, in all other
        # reviews, and overall.
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            label_counts = positive_counts if labels[i] == 'positive' else negative_counts
            for word in reviews[i].split(" "):
                label_counts[word] += 1
                total_counts[word] += 1

        # Positive-to-negative ratio, only for words seen at least 50 times.
        # The +1 avoids dividing by zero for words never seen in negative reviews.
        pos_neg_ratios = Counter()
        for term, cnt in total_counts.items():
            if cnt >= 50:
                pos_neg_ratios[term] = positive_counts[term] / float(negative_counts[term] + 1)

        # Turn each ratio into a log-scale polarity score. The 0.01 offset keeps
        # the logarithm finite when the ratio is 0.
        for word, ratio in list(pos_neg_ratios.items()):
            if ratio > 1:
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))

        # Populate review_vocab: keep words that occur more than min_count times.
        # Words with a polarity score must also meet polarity_cutoff; words
        # without one (seen fewer than 50 times) are kept unconditionally.
        # Every token of every review is a key of total_counts, so checking each
        # distinct word once selects the same set as checking every occurrence.
        review_vocab = set()
        for word, cnt in total_counts.items():
            if cnt > min_count:
                if word in pos_neg_ratios:
                    if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                        review_vocab.add(word)
                else:
                    review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices.
        # Sorting makes the word -> index assignment independent of set iteration
        # order, which can differ from one interpreter process to the next.
        self.review_vocab = sorted(review_vocab)

        # Collect the distinct labels, again in a deterministic order.
        self.label_vocab = sorted(set(labels))

        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i

        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and allocate the weights.

        Args:
            input_nodes(int) - Number of input nodes (one per vocabulary word)
            hidden_nodes(int) - Number of hidden nodes
            output_nodes(int) - Number of output nodes
            learning_rate(float) - Step size used by train
        """
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                                (self.hidden_nodes, self.output_nodes))

        # The hidden layer, a two-dimensional matrix with shape 1 x hidden_nodes.
        # It is reused (zeroed and refilled) for every review.
        self.layer_1 = np.zeros((1,hidden_nodes))

    def get_target_for_label(self,label):
        """Return 1 when label is exactly 'positive', otherwise 0."""
        if(label == 'positive'):
            return 1
        else:
            return 0

    def sigmoid(self,x):
        """Return the logistic sigmoid 1 / (1 + exp(-x)) of x."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self,output):
        """Return the sigmoid derivative expressed via the sigmoid's own output."""
        return output * (1 - output)

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-review gradient descent and report progress on stdout.

        Args:
            training_reviews_raw(list) - Review texts; tokens are obtained with split(" ")
                                         and are not lower-cased here
            training_labels(list) - Labels aligned with training_reviews_raw

        Raises:
            AssertionError - if the number of reviews and labels differ
        """
        # Pre-process training reviews so we can deal directly with the indices
        # of the vocabulary words each review contains. Each word is used at
        # most once per review; words outside the vocabulary are ignored.
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if word in self.word2index:
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        # make sure we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))

        # Keep track of correct predictions to display accuracy during training
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()

        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):

            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]

            ### Forward pass ###

            # Hidden layer: add in only the weight rows of the words present
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step

            # Only update the weights that were used in the forward pass
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'positive'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'negative'):
                correct_so_far += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the training process.
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the testing_labels to calculate the accuracy of those predictions.
        Progress and running accuracy are written to stdout; nothing is returned.
        """

        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label.
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the prediction process.

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """
        Returns a "positive" or "negative" prediction for the given review.
        The review is lower-cased before it is split into words.
        """
        # Run a forward pass through the network, like in the "train" function.

        # Hidden layer: identify the vocabulary indices used in the review and
        # then add just those weight rows to layer_1
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index:
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]

        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        # Return "positive" for values greater than or equal to 0.5 in the output layer;
        # return "negative" for other values
        if(layer_2[0] >= 0.5):
            return "positive"
        else:
            return "negative"


In [18]:
# Build and train on everything except the final 1000 reviews, which are
# held out for testing.
mlp = SentimentNetwork(
    reviews[:-1000],
    labels[:-1000],
    min_count=20,
    polarity_cutoff=0.05,
    learning_rate=0.01,
)
mlp.train(reviews[:-1000], labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%


Progress:0.00% Speed(reviews/sec):14.71 #Correct:1 #Trained:2 Training Accuracy:50.0%Progress:0.00% Speed(reviews/sec):27.41 #Correct:1 #Trained:3 Training Accuracy:33.3%Progress:0.01% Speed(reviews/sec):35.31 #Correct:1 #Trained:4 Training Accuracy:25.0%Progress:0.01% Speed(reviews/sec):45.47 #Correct:1 #Trained:5 Training Accuracy:20.0%Progress:0.02% Speed(reviews/sec):54.97 #Correct:1 #Trained:6 Training Accuracy:16.6%Progress:0.02% Speed(reviews/sec):64.55 #Correct:1 #Trained:7 Training Accuracy:14.2%Progress:0.02% Speed(reviews/sec):74.51 #Correct:2 #Trained:8 Training Accuracy:25.0%Progress:0.03% Speed(reviews/sec):84.25 #Correct:2 #Trained:9 Training Accuracy:22.2%Progress:0.03% Speed(reviews/sec):91.88 #Correct:3 #Trained:10 Training Accuracy:30.0%Progress:0.04% Speed(reviews/sec):100.0 #Correct:3 #Trained:11 Training Accuracy:27.2%Progress:0.04% Speed(reviews/sec):107.9 #Correct:3 #Trained:12 Training Accuracy:25.0%Progress:0.05% Speed(reviews/sec):116.5 #Correct:3

Progress:10.4% Speed(reviews/sec):543.8 #Correct:1994 #Trained:2501 Training Accuracy:79.7%
Progress:20.8% Speed(reviews/sec):612.5 #Correct:4063 #Trained:5001 Training Accuracy:81.2%
Progress:31.2% Speed(reviews/sec):617.7 #Correct:6176 #Trained:7501 Training Accuracy:82.3%
Progress:41.6% Speed(reviews/sec):613.3 #Correct:8336 #Trained:10001 Training Accuracy:83.3%
Progress:52.0% Speed(reviews/sec):621.9 #Correct:10501 #Trained:12501 Training Accuracy:84.0%
Progress:62.5% Speed(reviews/sec):623.7 #Correct:12641 #Trained:15001 Training Accuracy:84.2%
Progress:72.9% Speed(reviews/sec):618.1 #Correct:14782 #Trained:17501 Training Accuracy:84.4%
Progress:83.3% Speed(reviews/sec):601.5 #Correct:16954 #Trained:20001 Training Accuracy:84.7%
Progress:93.7% Speed(reviews/sec):607.8 #Correct:19143 #Trained:22501 Training Accuracy:85.0%
Progress:99.9% Speed(reviews/sec):601.1 #Correct:20461 #Trained:24000 Training Accuracy:85.2%

In [19]:
# Evaluate on the final 1000 reviews, which were excluded from training.
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):500.6 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1001. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1001. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1334. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):1250. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1200. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1000. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):1000. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):818.7 #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):769.7 #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):786.2 #Correct:11 #Tested:12 Testing Accuracy:91.6%Pr

Progress:24.4% Speed(reviews/sec):1109. #Correct:219 #Tested:245 Testing Accuracy:89.3%Progress:24.5% Speed(reviews/sec):1114. #Correct:220 #Tested:246 Testing Accuracy:89.4%Progress:24.6% Speed(reviews/sec):1113. #Correct:221 #Tested:247 Testing Accuracy:89.4%Progress:24.7% Speed(reviews/sec):1113. #Correct:222 #Tested:248 Testing Accuracy:89.5%Progress:24.8% Speed(reviews/sec):1117. #Correct:223 #Tested:249 Testing Accuracy:89.5%Progress:24.9% Speed(reviews/sec):1122. #Correct:223 #Tested:250 Testing Accuracy:89.2%Progress:25.0% Speed(reviews/sec):1121. #Correct:224 #Tested:251 Testing Accuracy:89.2%Progress:25.1% Speed(reviews/sec):1121. #Correct:225 #Tested:252 Testing Accuracy:89.2%Progress:25.2% Speed(reviews/sec):1125. #Correct:226 #Tested:253 Testing Accuracy:89.3%Progress:25.3% Speed(reviews/sec):1125. #Correct:227 #Tested:254 Testing Accuracy:89.3%Progress:25.4% Speed(reviews/sec):1124. #Correct:228 #Tested:255 Testing Accuracy:89.4%Progress:25.5% Speed(reviews/se

Progress:46.5% Speed(reviews/sec):1064. #Correct:416 #Tested:466 Testing Accuracy:89.2%Progress:46.6% Speed(reviews/sec):1064. #Correct:417 #Tested:467 Testing Accuracy:89.2%Progress:46.7% Speed(reviews/sec):1045. #Correct:418 #Tested:468 Testing Accuracy:89.3%Progress:46.8% Speed(reviews/sec):1020. #Correct:419 #Tested:469 Testing Accuracy:89.3%Progress:46.9% Speed(reviews/sec):1020. #Correct:419 #Tested:470 Testing Accuracy:89.1%Progress:47.0% Speed(reviews/sec):1020. #Correct:420 #Tested:471 Testing Accuracy:89.1%Progress:47.1% Speed(reviews/sec):1020. #Correct:420 #Tested:472 Testing Accuracy:88.9%Progress:47.2% Speed(reviews/sec):1020. #Correct:421 #Tested:473 Testing Accuracy:89.0%Progress:47.3% Speed(reviews/sec):1017. #Correct:422 #Tested:474 Testing Accuracy:89.0%Progress:47.4% Speed(reviews/sec):1017. #Correct:423 #Tested:475 Testing Accuracy:89.0%Progress:47.5% Speed(reviews/sec):1013. #Correct:424 #Tested:476 Testing Accuracy:89.0%Progress:47.6% Speed(reviews/se

Progress:58.7% Speed(reviews/sec):877.9 #Correct:523 #Tested:588 Testing Accuracy:88.9%Progress:58.8% Speed(reviews/sec):878.1 #Correct:524 #Tested:589 Testing Accuracy:88.9%Progress:58.9% Speed(reviews/sec):878.3 #Correct:525 #Tested:590 Testing Accuracy:88.9%Progress:59.0% Speed(reviews/sec):878.5 #Correct:525 #Tested:591 Testing Accuracy:88.8%Progress:59.1% Speed(reviews/sec):878.6 #Correct:526 #Tested:592 Testing Accuracy:88.8%Progress:59.2% Speed(reviews/sec):876.2 #Correct:526 #Tested:593 Testing Accuracy:88.7%Progress:59.3% Speed(reviews/sec):876.4 #Correct:527 #Tested:594 Testing Accuracy:88.7%Progress:59.4% Speed(reviews/sec):875.3 #Correct:528 #Tested:595 Testing Accuracy:88.7%Progress:59.5% Speed(reviews/sec):875.5 #Correct:529 #Tested:596 Testing Accuracy:88.7%Progress:59.6% Speed(reviews/sec):875.7 #Correct:530 #Tested:597 Testing Accuracy:88.7%Progress:59.7% Speed(reviews/sec):875.9 #Correct:531 #Tested:598 Testing Accuracy:88.7%Progress:59.8% Speed(reviews/se

Progress:72.6% Speed(reviews/sec):834.9 #Correct:627 #Tested:727 Testing Accuracy:86.2%Progress:72.7% Speed(reviews/sec):834.2 #Correct:628 #Tested:728 Testing Accuracy:86.2%Progress:72.8% Speed(reviews/sec):834.4 #Correct:629 #Tested:729 Testing Accuracy:86.2%Progress:72.9% Speed(reviews/sec):832.7 #Correct:630 #Tested:730 Testing Accuracy:86.3%Progress:73.0% Speed(reviews/sec):826.2 #Correct:631 #Tested:731 Testing Accuracy:86.3%Progress:73.1% Speed(reviews/sec):825.5 #Correct:632 #Tested:732 Testing Accuracy:86.3%Progress:73.2% Speed(reviews/sec):824.8 #Correct:633 #Tested:733 Testing Accuracy:86.3%Progress:73.3% Speed(reviews/sec):824.1 #Correct:634 #Tested:734 Testing Accuracy:86.3%Progress:73.4% Speed(reviews/sec):823.3 #Correct:635 #Tested:735 Testing Accuracy:86.3%Progress:73.5% Speed(reviews/sec):822.6 #Correct:636 #Tested:736 Testing Accuracy:86.4%Progress:73.6% Speed(reviews/sec):822.8 #Correct:637 #Tested:737 Testing Accuracy:86.4%Progress:73.7% Speed(reviews/se

Progress:84.9% Speed(reviews/sec):772.9 #Correct:729 #Tested:850 Testing Accuracy:85.7%Progress:85.0% Speed(reviews/sec):771.7 #Correct:730 #Tested:851 Testing Accuracy:85.7%Progress:85.1% Speed(reviews/sec):772.0 #Correct:731 #Tested:852 Testing Accuracy:85.7%Progress:85.2% Speed(reviews/sec):772.2 #Correct:732 #Tested:853 Testing Accuracy:85.8%Progress:85.3% Speed(reviews/sec):772.4 #Correct:733 #Tested:854 Testing Accuracy:85.8%Progress:85.4% Speed(reviews/sec):772.6 #Correct:734 #Tested:855 Testing Accuracy:85.8%Progress:85.5% Speed(reviews/sec):772.8 #Correct:735 #Tested:856 Testing Accuracy:85.8%Progress:85.6% Speed(reviews/sec):773.0 #Correct:736 #Tested:857 Testing Accuracy:85.8%Progress:85.7% Speed(reviews/sec):773.2 #Correct:736 #Tested:858 Testing Accuracy:85.7%Progress:85.8% Speed(reviews/sec):773.4 #Correct:737 #Tested:859 Testing Accuracy:85.7%Progress:85.9% Speed(reviews/sec):773.6 #Correct:738 #Tested:860 Testing Accuracy:85.8%Progress:86.0% Speed(reviews/se

In [21]:
# Same training split as above, but with both vocabulary filters set to 0.
mlp_full = SentimentNetwork(
    reviews[:-1000],
    labels[:-1000],
    min_count=0,
    polarity_cutoff=0,
    learning_rate=0.01,
)
mlp_full.train(reviews[:-1000], labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%


Progress:0.00% Speed(reviews/sec):333.5 #Correct:1 #Trained:2 Training Accuracy:50.0%Progress:0.00% Speed(reviews/sec):285.9 #Correct:1 #Trained:3 Training Accuracy:33.3%Progress:0.01% Speed(reviews/sec):230.9 #Correct:1 #Trained:4 Training Accuracy:25.0%Progress:0.01% Speed(reviews/sec):285.9 #Correct:1 #Trained:5 Training Accuracy:20.0%Progress:0.02% Speed(reviews/sec):312.7 #Correct:1 #Trained:6 Training Accuracy:16.6%Progress:0.02% Speed(reviews/sec):353.1 #Correct:1 #Trained:7 Training Accuracy:14.2%Progress:0.02% Speed(reviews/sec):389.1 #Correct:2 #Trained:8 Training Accuracy:25.0%Progress:0.03% Speed(reviews/sec):421.3 #Correct:2 #Trained:9 Training Accuracy:22.2%Progress:0.03% Speed(reviews/sec):409.3 #Correct:2 #Trained:10 Training Accuracy:20.0%Progress:0.04% Speed(reviews/sec):416.9 #Correct:2 #Trained:11 Training Accuracy:18.1%Progress:0.04% Speed(reviews/sec):440.2 #Correct:2 #Trained:12 Training Accuracy:16.6%Progress:0.05% Speed(reviews/sec):461.8 #Correct:2

Progress:0.88% Speed(reviews/sec):472.4 #Correct:148 #Trained:213 Training Accuracy:69.4%Progress:0.88% Speed(reviews/sec):473.6 #Correct:149 #Trained:214 Training Accuracy:69.6%Progress:0.89% Speed(reviews/sec):474.7 #Correct:149 #Trained:215 Training Accuracy:69.3%Progress:0.89% Speed(reviews/sec):472.8 #Correct:150 #Trained:216 Training Accuracy:69.4%Progress:0.9% Speed(reviews/sec):472.9 #Correct:150 #Trained:217 Training Accuracy:69.1%Progress:0.90% Speed(reviews/sec):474.0 #Correct:150 #Trained:218 Training Accuracy:68.8%Progress:0.90% Speed(reviews/sec):473.1 #Correct:150 #Trained:219 Training Accuracy:68.4%Progress:0.91% Speed(reviews/sec):474.3 #Correct:150 #Trained:220 Training Accuracy:68.1%Progress:0.91% Speed(reviews/sec):475.4 #Correct:151 #Trained:221 Training Accuracy:68.3%Progress:0.92% Speed(reviews/sec):475.5 #Correct:152 #Trained:222 Training Accuracy:68.4%Progress:0.92% Speed(reviews/sec):474.6 #Correct:153 #Trained:223 Training Accuracy:68.6%Progress:0

Progress:1.4% Speed(reviews/sec):493.6 #Correct:244 #Trained:337 Training Accuracy:72.4%Progress:1.40% Speed(reviews/sec):492.9 #Correct:244 #Trained:338 Training Accuracy:72.1%Progress:1.40% Speed(reviews/sec):493.0 #Correct:245 #Trained:339 Training Accuracy:72.2%Progress:1.41% Speed(reviews/sec):493.7 #Correct:246 #Trained:340 Training Accuracy:72.3%Progress:1.41% Speed(reviews/sec):494.4 #Correct:247 #Trained:341 Training Accuracy:72.4%Progress:1.42% Speed(reviews/sec):493.7 #Correct:248 #Trained:342 Training Accuracy:72.5%Progress:1.42% Speed(reviews/sec):493.8 #Correct:249 #Trained:343 Training Accuracy:72.5%Progress:1.42% Speed(reviews/sec):494.5 #Correct:250 #Trained:344 Training Accuracy:72.6%Progress:1.43% Speed(reviews/sec):495.2 #Correct:251 #Trained:345 Training Accuracy:72.7%Progress:1.43% Speed(reviews/sec):494.5 #Correct:251 #Trained:346 Training Accuracy:72.5%Progress:1.44% Speed(reviews/sec):495.2 #Correct:252 #Trained:347 Training Accuracy:72.6%Progress:1

Progress:10.4% Speed(reviews/sec):553.8 #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):531.6 #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):523.1 #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):524.0 #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):520.6 #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):520.1 #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):514.7 #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):516.7 #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):511.5 #Correct:19015 #Trained:22501 Training Accuracy:84.5%
Progress:99.9% Speed(reviews/sec):509.8 #Correct:20335 #Trained:24000 Training Accuracy:84.7%

In [22]:
def get_most_similar_words(focus = "horrible"):
    """Rank every vocabulary word of ``mlp_full`` by similarity to ``focus``.

    Similarity is the dot product between a word's row of
    ``mlp_full.weights_0_1`` and the row belonging to ``focus``.

    Args:
        focus(str) - Word to compare against; must be a key of mlp_full.word2index

    Returns:
        List of (word, score) tuples ordered from highest to lowest score.

    Raises:
        KeyError - if focus is not in mlp_full.word2index
    """
    weights = mlp_full.weights_0_1

    # The focus row does not change inside the loop, so look it up once.
    focus_vector = weights[mlp_full.word2index[focus]]

    most_similar = Counter()
    for word, index in mlp_full.word2index.items():
        most_similar[word] = np.dot(weights[index], focus_vector)

    return most_similar.most_common()

In [23]:
# Words whose learned weight rows have the largest dot product with "excellent".
get_most_similar_words("excellent")

[('excellent', 0.1367295075735247),
 ('perfect', 0.12548286087225943),
 ('amazing', 0.09182763392599969),
 ('today', 0.09022366269441419),
 ('wonderful', 0.08935597696221459),
 ('fun', 0.08750446667420683),
 ('great', 0.08714175888229204),
 ('best', 0.08581088561788061),
 ('liked', 0.0776976291238434),
 ('definitely', 0.07662878140696601),
 ('brilliant', 0.07342385876927902),
 ('loved', 0.07328542892812213),
 ('favorite', 0.07278113603616077),
 ('superb', 0.07173620717850507),
 ('fantastic', 0.07092219191626621),
 ('job', 0.06916061720763403),
 ('incredible', 0.0664240779526144),
 ('enjoyable', 0.06563256050288878),
 ('rare', 0.06481921266261505),
 ('highly', 0.06388945335097052),
 ('enjoyed', 0.06212754610181295),
 ('wonderfully', 0.06205517860409016),
 ('perfectly', 0.06109320881188738),
 ('fascinating', 0.06066354793749389),
 ('bit', 0.05965542704565309),
 ('gem', 0.05951085929615678),
 ('outstanding', 0.05886080814708303),
 ('beautiful', 0.05861393470316208),
 ('surprised', 0.05827

In [24]:
# Words whose learned weight rows have the largest dot product with "terrible".
get_most_similar_words("terrible")

[('worst', 0.1696610725904984),
 ('awful', 0.12026847019691243),
 ('waste', 0.11945367265311002),
 ('poor', 0.09275888757443547),
 ('terrible', 0.0914253871977279),
 ('dull', 0.0842092716782236),
 ('poorly', 0.08124154451604203),
 ('disappointment', 0.0800647596213687),
 ('fails', 0.0785997737233375),
 ('disappointing', 0.07733948548032334),
 ('boring', 0.07712785874801288),
 ('unfortunately', 0.07550244970585906),
 ('worse', 0.07060183536419462),
 ('mess', 0.0705642996235904),
 ('stupid', 0.06948482283254304),
 ('badly', 0.06688890366622854),
 ('annoying', 0.06568702190337414),
 ('bad', 0.06309381453757212),
 ('save', 0.06288059749586573),
 ('disappointed', 0.06269235381207287),
 ('wasted', 0.06138718302805128),
 ('supposed', 0.060985452957725166),
 ('horrible', 0.06012177233938011),
 ('laughable', 0.05869840628546764),
 ('crap', 0.05810452866788459),
 ('basically', 0.05721884036963617),
 ('nothing', 0.05715822004303419),
 ('ridiculous', 0.05690548106893144),
 ('lacks', 0.055766565889