### Look on base NLP methods

In [1]:
# Download reviews.txt and labels.txt from here: https://github.com/udacity/deep-learning/tree/master/sentiment-network

def pretty_print_review_and_label(i):
   print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

# training data
g = open('reviews.txt','r')
reviews = list(map(lambda x:x[:-1], g.readlines()))
g.close()

# target
g = open('labels.txt','r')
labels = list(map(lambda x:x[:-1].upper(), g.readlines()))
g.close()

### This is how we can represent words

In [2]:
import numpy as np

onehots = {}
onehots['cat'] = np.array([1,0,0,0])
onehots['the'] = np.array([0,1,0,0])
onehots['dog'] = np.array([0,0,1,0])
onehots['sat'] = np.array([0,0,0,1])

sentence = ['the','cat','sat']
x = onehots[sentence[0]] + \
    onehots[sentence[1]] + \
    onehots[sentence[2]]

print("Sent Encoding:" + str(x))

Sent Encoding:[1 1 0 1]


### Predicting movie reviews

#### First represent reviews in digital data

In [3]:
import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(' ')), raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if len(word) > 0:
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)

#### Write down simple NN

In [4]:
import numpy as np
np.random.seed(1)

# adding activation functions
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# setup network parameters
alpha, iterations = 0.01, 2
hidden_size = 100

# setup weights manualy
weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

# train our network
correct, total = (0, 0)
for iter in range(iterations):
    # train our network on first 24000 objects
    for i in range(len(input_dataset) - 1000):
        # embended data plus forward error propagation
        x, y = input_dataset[i], target_dataset[i]
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        # backpropagation
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        if np.abs(layer_2_delta) < 0.5:
            correct += 1
        total += 1
        if i % 10 == 9:
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(iter)\
                            +' Progress:'+progress[2:4]\
                            +'.'+progress[4:6]\
                            +'% Training Accuracy:'\
                            + str(correct/float(total)) + '%')
    print()

correct, total = (0, 0)
for i in range(len(input_dataset) - 1000, len(input_dataset)):
    
    x = input_dataset[i]
    y = target_dataset[i]

    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

    if np.abs(layer_2 - y) < 0.5:
        correct += 1
    total += 1

print('Test Accuracy:' + str(correct / float(total)))

Iter:0 Progress:95.99% Training Accuracy:0.8339583333333334%%
Iter:1 Progress:95.99% Training Accuracy:0.8669166666666667%
Test Accuracy:0.848


### Compare embended words

In the learning process, the network groups our words by similarity. Let's see how it goes. We can group words with any metric. Let's see on MSE.

In [5]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - weights_0_1[target_index]
        squared_difference = raw_difference ** 2
        scores[word] = -math.sqrt(sum(squared_difference))
    return scores.most_common(10)

In [6]:
print(similar('beautiful'))

[('beautiful', -0.0), ('true', -0.7189083598947267), ('emotions', -0.7504230216456905), ('impressed', -0.7526605206380348), ('episodes', -0.7537834885234282), ('vhs', -0.7587024277058798), ('subtle', -0.7624069735425496), ('delightful', -0.7713142364783331), ('spectacular', -0.7725673780871809), ('ralph', -0.7764145134532926)]


In [7]:
print(similar('terrible'))

[('terrible', -0.0), ('disappointing', -0.7818822901909498), ('annoying', -0.7941784967722754), ('mess', -0.8234147715835306), ('fails', -0.823539417498892), ('boring', -0.8294172614962738), ('worse', -0.8301163411692052), ('disappointment', -0.8442467075338626), ('dull', -0.8496125214783501), ('lacks', -0.85419026512117)]
