# Sentiment-Analysis-Network

by Qiren Sun

## 1. Get reviews and labels

In [2]:
def pretty_print_review_and_label(i):
    """Print the label and the review stored at index ``i``.

    Reads the module-level ``labels`` and ``reviews`` lists and writes one
    line of the form ``<label><TAB>:<TAB><review>``.
    """
    line = '\t:\t'.join((labels[i], reviews[i]))
    print(line)
# Get reviews & labels (one record per line in each file).
# `with` closes each file even if reading fails part-way through.
with open('reviews.txt', 'r') as g:
    # rstrip('\n') drops only the line terminator, so the last record stays
    # intact even when the file does not end with a newline.
    reviews = [line.rstrip('\n') for line in g]
with open('labels.txt', 'r') as g:
    # Labels are upper-cased so they compare equal to 'POSITIVE' / 'NEGATIVE'.
    labels = [line.rstrip('\n').upper() for line in g]
pretty_print_review_and_label(1)

NEGATIVE	:	story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .  


## 2. Building a Neural Network

Note: This part covers training and testing the network, reducing neural noise, choosing a better weight initialization, and improving the training speed.

In [6]:
import time
import sys
import numpy as np
from collections import Counter

class SentimentNetwork:
    """Small feed-forward network that labels a review POSITIVE or NEGATIVE.

    A review is treated as a set of words. Every vocabulary word present in
    the review adds its row of ``weights_0_1`` to a linear hidden layer; a
    single sigmoid output unit turns the hidden layer into a score, and a
    score of at least 0.5 is read as POSITIVE.

    The vocabulary is built from the reviews passed to the constructor.
    Words are lower-cased everywhere (vocabulary, training and prediction)
    so that the same token is used in all three places.
    """

    # A word needs at least this many occurrences before a polarity score is
    # computed for it. Less frequent words have no score and are therefore
    # never removed by ``polarity_cutoff``.
    POLARITY_MIN_COUNT = 100

    def __init__(self, reviews, labels, min_count=10, polarity_cutoff=0.1,
                 hidden_nodes=10, learning_rate=0.1):
        """Build the vocabulary and create the weight matrices.

        Args:
            reviews: Sequence of review strings; words are split on single
                spaces.
            labels: Sequence of labels, one per review. 'POSITIVE' marks a
                positive review; any other value is counted as negative.
            min_count: A word must occur more than this many times to enter
                the vocabulary.
            polarity_cutoff: Words that have a polarity score are kept only
                if the absolute score is at least this value.
            hidden_nodes: Number of hidden units.
            learning_rate: Step size used for the weight updates in ``train``.

        Raises:
            ValueError: If ``reviews`` and ``labels`` differ in length.
        """
        # Fixed seed so the random output weights are the same on every run.
        np.random.seed(1)
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    @staticmethod
    def _check_lengths(reviews, labels):
        """Raise ValueError unless there is exactly one label per review."""
        if len(reviews) != len(labels):
            raise ValueError(
                "reviews and labels must have the same length "
                "(got %d reviews and %d labels)" % (len(reviews), len(labels)))

    @staticmethod
    def _tokenize(review):
        """Split a review into lower-cased words on single spaces.

        Splitting on ' ' (not on arbitrary whitespace) means consecutive
        spaces produce empty-string tokens, which are counted like any
        other word.
        """
        return review.lower().split(' ')

    def _review_to_indices(self, review):
        """Return the distinct vocabulary indices of the words in ``review``."""
        word2index = self.word2index
        return list({word2index[word]
                     for word in self._tokenize(review)
                     if word in word2index})

    def _forward(self, indices):
        """Run a forward pass for the given word indices.

        Overwrites ``self.layer_1`` with the sum of the selected rows of
        ``weights_0_1`` and returns the sigmoid output (shape 1 x output_nodes).
        """
        self.layer_1 *= 0
        # Only the rows of words present in the review contribute.
        for index in indices:
            self.layer_1 += self.weights_0_1[index]
        return self.sigmoid(self.layer_1.dot(self.weights_1_2))

    @staticmethod
    def _write_progress(i, total, start, correct, count_label, accuracy_label):
        """Overwrite the current stdout line with progress statistics.

        Args:
            i: Zero-based index of the item just processed.
            total: Total number of items.
            start: ``time.time()`` value taken before the loop started.
            correct: Number of correct predictions so far.
            count_label: Text shown before the processed-item count.
            accuracy_label: Text shown before the running accuracy.
        """
        elapsed_time = float(time.time() - start)
        reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
        sys.stdout.write("\rProgress:" + str(100 * i/float(total))[:4]
                         + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5]
                         + " #Correct:" + str(correct)
                         + " " + count_label + ":" + str(i+1)
                         + " " + accuracy_label + ":"
                         + str(correct * 100 / float(i+1))[:4] + "%")

    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        """Build the word and label vocabularies and their index lookups.

        Sets ``review_vocab``, ``label_vocab``, ``review_vocab_size``,
        ``label_vocab_size``, ``word2index`` and ``label2index``.

        Args:
            reviews: Sequence of review strings.
            labels: Sequence of labels, one per review.
            polarity_cutoff: Minimum absolute polarity score for words that
                have a score (see ``POLARITY_MIN_COUNT``).
            min_count: A word must occur more than this many times to be kept.

        Raises:
            ValueError: If ``reviews`` and ``labels`` differ in length.
        """
        self._check_lengths(reviews, labels)

        # Count word occurrences overall and per sentiment class.
        positive_count = Counter()
        negative_count = Counter()
        total_count = Counter()
        for review, label in zip(reviews, labels):
            words = self._tokenize(review)
            total_count.update(words)
            if label == 'POSITIVE':
                positive_count.update(words)
            else:
                negative_count.update(words)

        # Polarity score of frequent words: log of the positive-to-negative
        # ratio. The +1 avoids division by zero and the +0.01 avoids log(0).
        pos_neg_ratio = Counter()
        for term, cnt in total_count.items():
            if cnt >= self.POLARITY_MIN_COUNT:
                ratio = positive_count[term] / float(negative_count[term] + 1)
                if ratio > 1:
                    pos_neg_ratio[term] = np.log(ratio)
                else:
                    pos_neg_ratio[term] = -np.log(1 / (ratio + 0.01))

        # Every word of every review is a key of total_count, so the
        # vocabulary can be selected from the counter directly instead of
        # splitting all reviews a second time.
        review_vocab = set()
        for word, cnt in total_count.items():
            if cnt <= min_count:
                continue
            if word in pos_neg_ratio and abs(pos_neg_ratio[word]) < polarity_cutoff:
                continue
            review_vocab.add(word)

        # Sorted so the word -> index mapping is the same on every run
        # (iteration order of a set of strings is not stable across runs).
        self.review_vocab = sorted(review_vocab)
        self.label_vocab = sorted(set(labels))

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Lookups from word / label to its index position.
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and create the weights.

        Args:
            input_nodes: Number of input units (vocabulary size).
            hidden_nodes: Number of hidden units.
            output_nodes: Number of output units.
            learning_rate: Step size used for the weight updates.
        """
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.learning_rate = learning_rate

        # Input -> hidden weights start at zero; hidden -> output weights are
        # drawn from a normal distribution with std hidden_nodes ** -0.5.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.hidden_nodes**-.5,
                                            (self.hidden_nodes, self.output_nodes))

        # The hidden layer, a two-dimensional matrix with shape 1 x hidden_nodes.
        # It is reused (zeroed and refilled) on every forward pass.
        self.layer_1 = np.zeros((1, hidden_nodes))

    def get_target_for_label(self, label):
        """Return 1 for the label 'POSITIVE' and 0 for any other label."""
        return 1 if label == 'POSITIVE' else 0

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x``."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Return the sigmoid derivative, given the sigmoid's *output*."""
        return output * (1 - output)

    def train(self, training_reviews_raw, training_labels):
        """Run one pass over the data, updating the weights after each review.

        Progress and running accuracy are written to stdout.

        Args:
            training_reviews_raw: Sequence of review strings.
            training_labels: Sequence of labels, one per review.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        # Make sure we have a matching number of reviews and labels.
        self._check_lengths(training_reviews_raw, training_labels)

        # Pre-process the reviews so the loop below deals directly with the
        # indices of the non-zero inputs.
        training_reviews = [self._review_to_indices(review)
                            for review in training_reviews_raw]
        total = len(training_reviews)

        # Keep track of correct predictions to display accuracy during training.
        correct_so_far = 0

        # Remember when we started for printing time statistics.
        start = time.time()

        for i, (indices, label) in enumerate(zip(training_reviews, training_labels)):
            # Forward pass (also fills self.layer_1).
            layer_2 = self._forward(indices)

            # Output error: difference between actual output and target,
            # scaled by the sigmoid gradient.
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_error_term = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Error propagated to the hidden layer. It must be computed
            # before weights_1_2 is updated below. The hidden layer is
            # linear, so the error term equals the error.
            layer_1_error = layer_2_error_term.dot(self.weights_1_2.T)
            layer_1_error_term = layer_1_error

            # Update the weights; only rows of words in this review change.
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_error_term) * self.learning_rate
            step = layer_1_error_term[0] * self.learning_rate
            for index in indices:
                self.weights_0_1[index] -= step

            # Keep track of correct predictions.
            if layer_2 >= 0.5 and label == 'POSITIVE':
                correct_so_far += 1
            elif layer_2 < 0.5 and label == 'NEGATIVE':
                correct_so_far += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the training process.
            self._write_progress(i, total, start, correct_so_far,
                                 "#Trained", "Training Accuracy")
            if i % 2500 == 0:
                # Start a new line so this snapshot stays visible.
                print('')

    def test(self, testing_reviews, testing_labels):
        """Predict every review and report running accuracy on stdout.

        Args:
            testing_reviews: Sequence of review strings.
            testing_labels: Sequence of expected labels ('POSITIVE' or
                'NEGATIVE'), one per review.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        self._check_lengths(testing_reviews, testing_labels)

        # Keep track of how many correct predictions we make.
        correct = 0

        # We'll time how many predictions per second we make.
        start = time.time()
        total = len(testing_reviews)

        for i, (review, label) in enumerate(zip(testing_reviews, testing_labels)):
            if self.run(review) == label:
                correct += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the prediction process.
            self._write_progress(i, total, start, correct,
                                 "#Tested", "Testing Accuracy")

    def run(self, review):
        """Return 'POSITIVE' or 'NEGATIVE' for a single review string.

        Words outside the vocabulary are ignored. An output of exactly 0.5
        (for example when no word is known) is reported as 'POSITIVE'.
        """
        layer_2 = self._forward(self._review_to_indices(review))

        if layer_2[0] >= 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

## 3. Training

In [7]:
# Build and fit the network on everything except the last 1000 reviews,
# which are left for the testing cell.
mlp = SentimentNetwork(
    reviews=reviews[:-1000],
    labels=labels[:-1000],
    min_count=20,
    polarity_cutoff=0.2,
    learning_rate=0.01,
)
mlp.train(training_reviews_raw=reviews[:-1000], training_labels=labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1387. #Correct:2027 #Trained:2501 Training Accuracy:81.0%
Progress:20.8% Speed(reviews/sec):1295. #Correct:4090 #Trained:5001 Training Accuracy:81.7%
Progress:31.2% Speed(reviews/sec):1282. #Correct:6235 #Trained:7501 Training Accuracy:83.1%
Progress:41.6% Speed(reviews/sec):1276. #Correct:8381 #Trained:10001 Training Accuracy:83.8%
Progress:52.0% Speed(reviews/sec):1284. #Correct:10537 #Trained:12501 Training Accuracy:84.2%
Progress:62.5% Speed(reviews/sec):1282. #Correct:12683 #Trained:15001 Training Accuracy:84.5%
Progress:72.9% Speed(reviews/sec):1289. #Correct:14839 #Trained:17501 Training Accuracy:84.7%
Progress:83.3% Speed(reviews/sec):1279. #Correct:17030 #Trained:20001 Training Accuracy:85.1%
Progress:93.7% Speed(reviews/sec):1275. #Correct:19230 #Trained:22501 Training Accuracy:85.4%
Progress:99.9% Speed(reviews/sec):1271. #Correct:20554 #Trained:24000 Training Accuracy:85.6%

## 4. Testing

In [9]:
# Evaluate on the last 1000 reviews, which were not used for training.
mlp.test(testing_reviews=reviews[-1000:], testing_labels=labels[-1000:])

Progress:0.0% Speed(reviews/sec):0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):997.6 #Correct:2 #Tested:2 Testing Accuracy:100.%Progress:0.2% Speed(reviews/sec):1995. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1496. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1994. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):2493. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1994. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):2326. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):1994. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):1795. #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):1994. #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):1828. #Correct:11 #Tested:12 Testing Accuracy:91.6%Pr

Progress:41.0% Speed(reviews/sec):1910. #Correct:360 #Tested:411 Testing Accuracy:87.5%Progress:41.1% Speed(reviews/sec):1906. #Correct:361 #Tested:412 Testing Accuracy:87.6%Progress:41.2% Speed(reviews/sec):1911. #Correct:362 #Tested:413 Testing Accuracy:87.6%Progress:41.3% Speed(reviews/sec):1906. #Correct:363 #Tested:414 Testing Accuracy:87.6%Progress:41.4% Speed(reviews/sec):1902. #Correct:364 #Tested:415 Testing Accuracy:87.7%Progress:41.5% Speed(reviews/sec):1898. #Correct:365 #Tested:416 Testing Accuracy:87.7%Progress:41.6% Speed(reviews/sec):1903. #Correct:366 #Tested:417 Testing Accuracy:87.7%Progress:41.7% Speed(reviews/sec):1899. #Correct:367 #Tested:418 Testing Accuracy:87.7%Progress:41.8% Speed(reviews/sec):1903. #Correct:368 #Tested:419 Testing Accuracy:87.8%Progress:41.9% Speed(reviews/sec):1908. #Correct:369 #Tested:420 Testing Accuracy:87.8%Progress:42.0% Speed(reviews/sec):1904. #Correct:370 #Tested:421 Testing Accuracy:87.8%Progress:42.1% Speed(reviews/se

Progress:78.1% Speed(reviews/sec):1937. #Correct:660 #Tested:782 Testing Accuracy:84.3%Progress:78.2% Speed(reviews/sec):1935. #Correct:660 #Tested:783 Testing Accuracy:84.2%Progress:78.3% Speed(reviews/sec):1937. #Correct:661 #Tested:784 Testing Accuracy:84.3%Progress:78.4% Speed(reviews/sec):1935. #Correct:662 #Tested:785 Testing Accuracy:84.3%Progress:78.5% Speed(reviews/sec):1937. #Correct:662 #Tested:786 Testing Accuracy:84.2%Progress:78.6% Speed(reviews/sec):1935. #Correct:663 #Tested:787 Testing Accuracy:84.2%Progress:78.7% Speed(reviews/sec):1933. #Correct:664 #Tested:788 Testing Accuracy:84.2%Progress:78.8% Speed(reviews/sec):1935. #Correct:664 #Tested:789 Testing Accuracy:84.1%Progress:78.9% Speed(reviews/sec):1933. #Correct:665 #Tested:790 Testing Accuracy:84.1%Progress:79.0% Speed(reviews/sec):1935. #Correct:666 #Tested:791 Testing Accuracy:84.1%Progress:79.1% Speed(reviews/sec):1933. #Correct:667 #Tested:792 Testing Accuracy:84.2%Progress:79.2% Speed(reviews/se