In [1]:
with open('data/reviews.txt','r') as file:
    reviews = list(map(lambda x:x[:-1],file.readlines()))
with open ('data/labels.txt') as file:
    labels = list(map(lambda x:x[:-1].upper(),file.readlines()))

In [2]:
import numpy as np
l = np.array(labels)
count = {"POSITIVE":0,"NEGATIVE":0}
for w in l:
    if w =="POSITIVE":
        count[w] +=1
    else: count[w] +=1
count

{'POSITIVE': 12500, 'NEGATIVE': 12500}

In [3]:
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
pos_neg_ratios = Counter()

for i in range(len(reviews)):
    if(labels[i] == 'POSITIVE'):
        for word in reviews[i].split(" "):
            positive_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in reviews[i].split(" "):
            negative_counts[word] += 1
            total_counts[word] += 1
            
rm_words = 100

for term,cnt in list(total_counts.most_common()):
    if(cnt > rm_words):
        pos_neg_ratio = np.log(positive_counts[term] / float(negative_counts[term]+1))
        pos_neg_ratios[term] = pos_neg_ratio
        
new_vocab = []
for word,cnt in total_counts.most_common():
    if cnt>rm_words and word not in (' ,.!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'+" ".join(STOP_WORDS)):
        new_vocab.append(word)

In [4]:
len(reviews)

25000

In [5]:
new_vocab

['br',
 'movie',
 'film',
 'like',
 'good',
 'story',
 'bad',
 'people',
 'great',
 'movies',
 'think',
 'characters',
 'character',
 'watch',
 'films',
 'seen',
 'life',
 'plot',
 'acting',
 'love',
 'little',
 'best',
 'know',
 'better',
 'end',
 'scene',
 'scenes',
 'watching',
 'doesn',
 'old',
 'years',
 'actors',
 'director',
 'work',
 'didn',
 'new',
 'funny',
 'actually',
 'makes',
 'look',
 'find',
 'going',
 'lot',
 'world',
 'cast',
 'want',
 'things',
 'pretty',
 'young',
 'horror',
 'got',
 'fact',
 'big',
 'thought',
 'series',
 'original',
 'action',
 'right',
 'comedy',
 'point',
 'gets',
 'family',
 'role',
 'isn',
 'saw',
 'interesting',
 'bit',
 'music',
 'guy',
 'script',
 'far',
 'making',
 'minutes',
 'feel',
 'performance',
 'girl',
 'probably',
 'woman',
 'kind',
 'tv',
 'away',
 'day',
 'worst',
 'fun',
 'sure',
 'hard',
 'played',
 'found',
 'having',
 'especially',
 'course',
 'believe',
 'screen',
 'looking',
 'trying',
 'set',
 'goes',
 'book',
 'looks',
 '

In [6]:
vocab = set(total_counts)
vocab_size = len(vocab)
word_index={word:index for index,word in enumerate(vocab)}

In [7]:
len(word_index)
vocab_size

74074

In [8]:
def getFeatures(reviews=reviews):
    feature_size = np.zeros((1,vocab_size))
    features = []
    for review in reviews:
        feature_size *= 0
        for word in review:
            if word in word_index.keys():
                feature_size[0][word_index[word]] = 1
        features.append(feature_size[0])
    return features
def getTargets(labels=labels):
    return [ 1 if label == "POSITIVE" else 0 for label in labels]

In [9]:
features = np.array(getFeatures(reviews[:500]))
targets = np.array(getTargets(labels[:500]))

In [18]:
import time
import sys
import numpy as np

class Network:
    def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1,epoch=1,load_model=False):
        np.random.seed(1)
        self.epoch = epoch
        self.pre_process_data(reviews, labels)
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate,load_model)

    def pre_process_data(self, reviews, labels):
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
                
        self.review_vocab = list(review_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate,load_model=False):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.learning_rate = learning_rate
        if load_model:
            self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
            self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,(self.hidden_nodes, self.output_nodes))
        else:
            self.load()
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def train(self, training_reviews_raw, training_labels):
        assert self.epoch >= 1
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        assert(len(training_reviews) == len(training_labels))
        correct_so_far = 0
        start = time.time()

        for _ in range(self.epoch):
            for i in range(len(training_reviews)):

                review = training_reviews[i]
                label = training_labels[i]

                # Hidden layer
                self.layer_1 *= 0
                for index in review:
                    self.layer_1 += self.weights_0_1[index]

                # Output layer
                layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            

                # Output error
                layer_2_error = layer_2 - self.get_target_for_label(label) 
                layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

                # Backpropagated error
                layer_1_error = layer_2_delta.dot(self.weights_1_2.T) 
                layer_1_delta = layer_1_error 
                self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate 
                for index in review:
                    self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate 
                if(layer_2 >= 0.5 and label == 'POSITIVE'):
                    correct_so_far += 1
                elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                    correct_so_far += 1
                elapsed_time = float(time.time() - start)
                reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

                print("Progress:" + str(100 * i/float(len(training_reviews)))[:4] 
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] 
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) 
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%",end="\r",flush=True)
            print("")
            correct_so_far = 0
    def test(self, testing_reviews, testing_labels):
        correct = 0
        start = time.time() 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            print("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] 
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] 
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) 
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%",end="\r",flush=True)
    
    def run(self, review):
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
    def save(self):
        np.savetxt("weights/w01.txt",self.weights_0_1)
        np.savetxt("weights/w12.txt",self.weights_1_2)
    def load(self):
        self.weights_0_1 = np.loadtxt("weights/w01.txt",delimiter=" ")
        self.weights_1_2 = np.loadtxt("weights/w12.txt",delimiter=" ",ndmin=2)


In [22]:

ml = Network(reviews[:-1000],labels[:-1000], learning_rate=0.001,load_model=True,epoch=1)
ml.train(reviews[:-1000],labels[:-1000])

Progress:99.9% Speed(reviews/sec):849.2 #Correct:20173 #Trained:24000 Training Accuracy:84.0%
Progress:99.9% Speed(reviews/sec):425.2 #Correct:21304 #Trained:24000 Training Accuracy:88.7%
Progress:99.9% Speed(reviews/sec):281.4 #Correct:21714 #Trained:24000 Training Accuracy:90.4%
Progress:99.9% Speed(reviews/sec):211.0 #Correct:21970 #Trained:24000 Training Accuracy:91.5%


In [24]:
ml.save()

In [25]:
ml.test(reviews[1000:],labels[1000:])

Progress:11.9% Speed(reviews/sec):1058. #Correct:2660 #Tested:2868 Testing Accuracy:92.7%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress:27.3% Speed(reviews/sec):1089. #Correct:6074 #Tested:6569 Testing Accuracy:92.4%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress:43.0% Speed(reviews/sec):1080. #Correct:9534 #Tested:10321 Testing Accuracy:92.3%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress:58.5% Speed(reviews/sec):1092. #Correct:12975 #Tested:14046 Testing Accuracy:92.3%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress:74.5% Speed(reviews/sec):1102. #Correct:16549 #Tested:17882 Testing Accuracy:92.5%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress:90.9% Speed(reviews/sec):1107. #Correct:20224 #Tested:21819 Testing Accuracy:92.6%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Progress:99.9% Speed(reviews/sec):1105. #Correct:22207 #Tested:24000 Testing Accuracy:92.5%

In [26]:
ml.test(np.array(["not good","it was awesome movie"]),np.array(["NEGATIVE","POSITIVE"]))

Progress:50.0% Speed(reviews/sec):1026. #Correct:2 #Tested:2 Testing Accuracy:100.%

In [28]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
tsne2d = tsne.fit_transform(ml.weights_0_1)
ml.word2index.keys()



In [40]:
np.savetxt("weights/tsne.txt",tsne2d)


['',
 'excrements',
 'cratchitt',
 'cram',
 'spelled',
 'rcc',
 'demanded',
 'animating',
 'stall',
 'mahin',
 'livery',
 'gaddis',
 'letts',
 'radlitch',
 'apperance',
 'jungian',
 'forry',
 'headaches',
 'talkd',
 'bagging',
 'ota',
 'perfetta',
 'slauther',
 'parenthood',
 'incher',
 'premonition',
 'gendarme',
 'valderamma',
 'yevgeniya',
 'mended',
 'groupie',
 'radziwill',
 'divagations',
 'thst',
 'bookended',
 'splice',
 'slowness',
 'sunken',
 'kershner',
 'ooh',
 'pyare',
 'nikkhil',
 'bantering',
 'collectible',
 'colico',
 'contextsnamely',
 'drek',
 'drss',
 'mecha',
 'invulnerable',
 'personia',
 'duke',
 'tronje',
 'kmc',
 'inheritors',
 'cipher',
 'former',
 'filmsadly',
 'surefire',
 'russborrough',
 'wobbly',
 'multy',
 'pinning',
 'ap',
 'dalla',
 'cripes',
 'goobacks',
 'seductress',
 'misconceived',
 'phenoms',
 'respect',
 'pervasive',
 'morals',
 'padget',
 'stalky',
 'renews',
 'prolo',
 'pioneered',
 'bosnians',
 'appereantly',
 'sensationalist',
 'mafia',
 'un

In [43]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file

p=figure()

source = ColumnDataSource(data=dict(x1=tsne2d[:,0],
                                    x2=tsne2d[:,1],
                                    names=list(ml.word2index.keys())))

p.scatter(x="x1", y="x2", size=8, source=source)

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#111000",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)