In [1]:
import random
import os
import re
import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict
from math import log
from math import exp

In [2]:
data_dir = '../FILIMDB'

In [3]:
def load_data(file_name):
    """
    Reads specified file, returns list of strings
    :param file_name: file name in data_dir folder
    :returns list of strings
    """
    print('Loading %s' % file_name)
    data_path = os.path.join(data_dir, file_name)
    with open(data_path) as input_data:
        lines = input_data.readlines()
        lines = [l.strip() for l in lines]
    
    print('Loaded %d lines' % len(lines))
    return lines

In [4]:
train_texts, train_labels = load_data('train.texts'), load_data('train.labels')
dev_texts, dev_labels = load_data('dev.texts'), load_data('dev.labels')
test_texts = load_data('test.texts')

Loading train.texts
Loaded 25000 lines
Loading train.labels
Loaded 25000 lines
Loading dev.texts
Loaded 25000 lines
Loading dev.labels
Loaded 25000 lines
Loading test.texts
Loaded 10599 lines


In [5]:
def tokenize(text):
    text = text.lower()
    tokens = re.findall('[a-z]+',text)
    return tokens
tokenized_train_texts = [tokenize(r) for r in train_texts]
tokenized_dev_texts = [tokenize(r) for r in dev_texts]

In [6]:
def loadGlove(file):
    f = open(file,'r')
    glove = {}
    for entry in f:
        entry = entry.split()
        word = entry[0]
        vector = np.array([float(val) for val in entry[1:]])
        glove[word] = vector
    return glove

In [7]:
def vectorize(text, glove):
    res = np.zeros(glove['the'].shape)
    count = 0
    for token in text:
        if token in glove:
            res += glove[token]
            count += 1
    return np.concatenate((np.ones(1), res / count))

In [8]:
def sigmoid(x):
    return 1 / (1 + np.exp(np.negative(x)))

def tanh_der(x):
    return 1 - np.power(np.tanh(x), 2)

def initialize_weights(dim1, dim2):
    return np.random.randn(dim1, dim2) * 0.001

In [9]:
def loss(X, y, W1, W2, W3, alpha):
    N, M = X.shape
    a1 = np.tanh(np.dot(W1, X.T))
    a1 = np.concatenate((np.ones((1, a1.shape[1])), a1))
    a2 = np.tanh(np.dot(W2, a1))
    a2 = np.concatenate((np.ones((1, a2.shape[1])), a2))
    h = sigmoid(np.dot(W3, a2)) #[h(x1) h(x2) ... h(xn)]

    tmp = np.multiply(y, np.log(h.T))
    tmp += np.multiply((1 - y), np.log(1 - h.T))
    L = -np.sum(tmp) / N

    reg = (np.sum(np.power(W1[:,1:],2)) + np.sum(np.power(W2[:,1:],2)) + np.sum(np.power(W3[:,1:],2)))
    L += alpha * reg
    return L

In [10]:
def feedforward(x, W1, W2, W3):
    z1 = np.dot(W1, x.T)
    a1 = np.tanh(z1)
    a1 = np.concatenate((np.ones((1, 1)), a1))

    z2 = np.dot(W2, a1)
    a2 = np.tanh(z2)
    a2 = np.concatenate((np.ones((1, 1)), a2))
    return z1, a1, z2, a2, sigmoid(np.dot(W3, a2))

In [11]:
def backpropagation(X, y,  W1, W2, W3, alpha):
    N, M = X.shape
    W1_grad = np.zeros(W1.shape)
    W2_grad = np.zeros(W2.shape)
    W3_grad = np.zeros(W3.shape)
    for k in range(N):
        z1, a1, z2, a2, a3 = feedforward(X[k,:], W1, W2, W3)
        delta_3 = a3 - y[k]
        delta_2 = np.dot(W3.T, delta_3)[1:]
        delta_2 = np.multiply(delta_2, tanh_der(z2))
        delta_1 = np.dot(W2.T, delta_2)[1:]
        delta_1 = np.multiply(delta_1, tanh_der(z1))

        W1_grad +=  np.dot(delta_1, X[k,:]) ## transpose something
        W2_grad += np.dot(delta_2, a1.T)
        W3_grad += np.dot(delta_3, a2.T)
    W1_grad /= N
    W2_grad /= N
    W3_grad /= N

    W1_reg = W1 * alpha * 2
    W1_reg[:, 0] = 0
    W2_reg = W2 * alpha * 2
    W2_reg[:, 0] = 0    
    W3_reg = W3 * alpha * 2
    W3_reg[:, 0] = 0

    W1_grad += W1_reg
    W2_grad += W2_reg
    W3_grad += W3_reg
    return W1_grad, W2_grad, W3_grad

In [12]:
def update_weights(W1, W2, W3, W1_grad, W2_grad, W3_grad, lr):
    return W1 - W1_grad * lr, W2 - W2_grad * lr, W3 - W3_grad * lr

In [13]:
labels = []
for label in train_labels:
    if label == 'pos':
        labels.append(1)
    else:
        labels.append(0)
labels = np.array(labels)

In [14]:
glove = loadGlove('glove.6B.50d.txt')

In [15]:
vectorized_train_texts = [vectorize(x, glove) for x in tokenized_train_texts]
train_set = np.matrix(vectorized_train_texts)

In [16]:
LAYER1_SIZE = 200
LAYER2_SIZE = 100
EMBEDDING_SIZE = 50
REG_PARAM = 0.00001
LEARNING_RATE = 1.0

In [17]:
W1 = initialize_weights(LAYER1_SIZE, EMBEDDING_SIZE + 1)
W2 = initialize_weights(LAYER2_SIZE, LAYER1_SIZE + 1)
W3 = initialize_weights(1, LAYER2_SIZE + 1)
i = 0

In [None]:
while i < 1000:
    if i % 100 == 0:
        print(loss(train_set, labels, W1, W2, W3, REG_PARAM))
    g1, g2, g3 = backpropagation(train_set, labels, W1, W2, W3, REG_PARAM)
    W1, W2, W3 = update_weights(W1, W2, W3, g1, g2, g3, LEARNING_RATE)

In [18]:
res = []
for j, text in enumerate(vectorized_train_texts):
    _, _, _, _, pred = feedforward(train_set[j,:], W1, W2, W3)
    pred = np.sum(pred)
    #print(pred)
    if pred >= 0.5:
        res.append('pos')
    else:
        res.append('neg')
    print(res)
print(np.sum(np.array(res) == np.array(train_labels)))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

