In [106]:
g = open('reviews.txt','r')
reviews = list(map(lambda x:x[:],g.readlines()))
g.close()

In [107]:
g = open('labels.txt','r') 
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [108]:
import time
import sys
from collections import Counter
import numpy as np
from keras.utils import np_utils
import tensorflow as tf
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten



In [109]:


reviews=reviews[:-1000]
labels=labels[:-1000]
min_count=20
polarity_cutoff=0.05
learning_rate=0.01
np.random.seed(1)

In [111]:
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

for i in range(len(reviews)):
    if(labels[i] == 'POSITIVE'):
        for word in reviews[i].split(" "):
                positive_counts[word] += 1
                total_counts[word] += 1
    else:
         for word in reviews[i].split(" "):
            negative_counts[word] += 1
            total_counts[word] += 1

pos_neg_ratios = Counter()

for term,cnt in list(total_counts.most_common()):
    if(cnt >= 50):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

for word,ratio in pos_neg_ratios.most_common():
    if(ratio > 1):
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
      

In [113]:
len(pos_neg_ratios)

6991

In [114]:
review_vocab = set()
for review in reviews:
    for word in review.split(" "):
        
        if(total_counts[word] > min_count):
            if(word in pos_neg_ratios.keys()):
                if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                    review_vocab.add(word)
                else:
                    review_vocab.add(word)


review_vocab = list(review_vocab)

In [115]:
len(review_vocab)

6991

In [116]:


label_vocab = set()
for label in labels:
    label_vocab.add(label)


label_vocab = list(label_vocab)


In [117]:

review_vocab_size = len(review_vocab)
label_vocab_size = len(label_vocab)

word2index = {}
for i, word in enumerate(review_vocab):
    word2index[word] = i


label2index = {}
for i, label in enumerate(label_vocab):
    label2index[label] = i
     

In [118]:
training_reviews = list()
for review in reviews:
    indices = set()
    for word in review.split(" "):
        if(word in word2index.keys()):
            indices.add(word2index[word])
    
    training_reviews.append(list(indices))

In [119]:

assert(len(training_reviews) == len(labels))


In [120]:
len(training_reviews)

24000

In [121]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=review_vocab_size)

x_train = tokenizer.sequences_to_matrix(training_reviews, mode='binary')

In [122]:
y_train=list()
for i in labels:
    y_train.append(label2index[i])
        

In [123]:
y_train=np_utils.to_categorical(y_train)


In [124]:
y_train[0], labels[0]

(array([0., 1.], dtype=float32), 'POSITIVE')

In [125]:
len(x_train[3])
from keras.layers import Dense, Dropout, Activation

In [126]:
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=review_vocab_size))
model.add(Dropout(0.5))
model.add(Dense(2, activation='sigmoid'))
model.summary()

model.compile(loss="binary_crossentropy", optimizer="adam", metrics = ['accuracy'])
        

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 512)               3579904   
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 1026      
Total params: 3,580,930
Trainable params: 3,580,930
Non-trainable params: 0
_________________________________________________________________


In [127]:
test_reviews=reviews[-1000:]
test_labels=labels[-1000:]

test_re = list()
for review in test_reviews:
    indices = set()
    for word in review.split(" "):
        if(word in word2index.keys()):
            indices.add(word2index[word])
    
    test_re.append(list(indices))
    


In [128]:
x_test = tokenizer.sequences_to_matrix(test_re, mode='binary')

In [129]:
x_test[1]

array([1., 0., 1., ..., 0., 0., 0.])

In [130]:
y_test=list()
for i in test_labels:
    y_test.append(label2index[i])
        

y_test=np_utils.to_categorical(y_test)

In [142]:
# Running and evaluating the model
hist = model.fit(x_train, y_train,
          batch_size=120,
          epochs=10,
          validation_data=(x_test, y_test), 
          verbose=2)

Train on 24000 samples, validate on 1000 samples
Epoch 1/10
 - 14s - loss: 0.0228 - acc: 0.9959 - val_loss: 0.0092 - val_acc: 1.0000
Epoch 2/10
 - 13s - loss: 0.0146 - acc: 0.9981 - val_loss: 0.0060 - val_acc: 1.0000
Epoch 3/10
 - 13s - loss: 0.0103 - acc: 0.9992 - val_loss: 0.0040 - val_acc: 1.0000
Epoch 4/10
 - 14s - loss: 0.0072 - acc: 0.9995 - val_loss: 0.0029 - val_acc: 1.0000
Epoch 5/10
 - 14s - loss: 0.0054 - acc: 0.9998 - val_loss: 0.0019 - val_acc: 1.0000
Epoch 6/10
 - 13s - loss: 0.0039 - acc: 0.9998 - val_loss: 0.0014 - val_acc: 1.0000
Epoch 7/10
 - 13s - loss: 0.0031 - acc: 1.0000 - val_loss: 0.0010 - val_acc: 1.0000
Epoch 8/10
 - 14s - loss: 0.0025 - acc: 0.9999 - val_loss: 7.9820e-04 - val_acc: 1.0000
Epoch 9/10
 - 14s - loss: 0.0021 - acc: 1.0000 - val_loss: 6.7989e-04 - val_acc: 1.0000
Epoch 10/10
 - 15s - loss: 0.0016 - acc: 1.0000 - val_loss: 5.0166e-04 - val_acc: 1.0000


In [143]:
        score = model.evaluate(x_test, y_test)
        print("\nAccuracy: ", score[-1])

        print("\nPredictions:")
        print(model.predict_proba(x_test))
    


Accuracy:  1.0

Predictions:
[[3.9462373e-03 9.9671352e-01]
 [9.9998009e-01 2.1449019e-05]
 [3.5638233e-05 9.9993384e-01]
 ...
 [9.9995756e-01 5.3396532e-05]
 [2.9086731e-03 9.9595046e-01]
 [1.0000000e+00 2.4484756e-08]]


In [144]:
def previsao(review):
    teste= review
    rev=set()
    
    for word in teste.split(' '):
        if(word in word2index.keys()):
            rev.add(word2index[word])
    x_1=list()
    x_1.append(list(rev))


    x = tokenizer.sequences_to_matrix(x_1, mode='binary')


    prediction=model.predict(x)
    print(prediction)
    
    if(prediction[0][0]>prediction[0][1]):
        print('NEGATIVE')
    else:
        print('POSITIVE')

In [149]:
previsao("this movie is great")

[[0.05301136 0.94816864]]
POSITIVE
