In [1]:
"""
This block generates the train/test dataset.

[seg_id, text, sentiment] # text is replaced with vocabulary ids

TODO:
embedding_lookup?
"""

import os
import csv
import pickle

# Fraction of the segment files that goes into the training set; the rest is
# used for testing.
train_ratio = 0.8
train_data_pkl = 'train_data.pkl'
test_data_pkl = 'test_data.pkl'

segment_dir = './Segmented/'
segment_ext = '.annotprocessed'

# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the train/test split reproducible.  Only segment files are kept, and the
# extension is stripped as a suffix so that stray entries or file names that
# contain extra dots cannot produce a wrong file id.
files = sorted(f for f in os.listdir(segment_dir) if f.endswith(segment_ext))
split_at = int(len(files) * train_ratio)
train_files = [f[:-len(segment_ext)] for f in files[:split_at]]
test_files = [f[:-len(segment_ext)] for f in files[split_at:]]


csv_filename = "processed_labels_no_criteria.csv"

# Maps segment id -> integer sentiment label (first and second CSV columns).
sentiment_dict = {}
with open('../' + csv_filename, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        sentiment_dict[row[0]] = int(row[1])


# voc_ids maps word -> id and voc_ids_rev maps id -> word.  Ids are kept as the
# strings read from the file.
voc_ids = {}
voc_ids_rev = {}

with open('./LanguageVocabulary.txt', 'r') as f:
    for row in f:
        fields = row.split(' ')
        # Lines with fewer than three space-separated fields are skipped.
        if len(fields) < 3:
            continue
        word_id, word = fields[0], fields[1]
        voc_ids[word] = word_id
        voc_ids_rev[word_id] = word


def _load_segments(file_ids):
    """Parse the segment files of the given file ids into dataset rows.

    Each line of a segment file is split on '_DELIM_': the first part is the
    segment number and the second part is the space-separated text.

    Args:
        file_ids: iterable of file names without the '.annotprocessed' suffix.

    Returns:
        A list of [seg_id, text, sentiment] rows, where seg_id is
        '<file id>_<segment number>', text is the list of vocabulary ids of
        the words and sentiment is looked up in sentiment_dict.

    Raises:
        KeyError: if a word is missing from voc_ids or a segment id is missing
            from sentiment_dict.
    """
    samples = []
    for file_id in file_ids:
        with open(segment_dir + file_id + segment_ext, 'r') as s:
            for row in s:
                fields = row.split('_DELIM_')
                seg_id = file_id + '_' + fields[0]
                # Empty tokens (from repeated spaces) are dropped.
                text = [voc_ids[w] for w in fields[1].rstrip().split(' ') if w]
                samples.append([seg_id, text, sentiment_dict[seg_id]])
    return samples


train_data = _load_segments(train_files)
test_data = _load_segments(test_files)


with open(train_data_pkl, 'wb') as f:
    pickle.dump(train_data, f)

with open(test_data_pkl, 'wb') as f:
    pickle.dump(test_data, f)

In [15]:
"""
Defines helper functions for getting batches

labels are in one-hot format
"""

import numpy as np
from random import randint


batchSize = 256       # samples per batch (64 was tried before)
maxSeqLength = 20     # every sequence is truncated / zero-padded to this length
wordVecsLast = 3015   # ids above this value are replaced with 0 in the batches
numClasses = 7        # width of the one-hot label vectors

# train_data / test_data are read from the notebook namespace (built by the
# dataset cell).  TODO: load them from the pickles instead.


def _buildBatch(dataset):
    """Draw batchSize random samples (with replacement) from dataset.

    Args:
        dataset: non-empty sequence of [seg_id, text, sentiment] rows, where
            text is a sequence of vocabulary ids (ints or numeric strings) and
            sentiment is an integer class index.

    Returns:
        (arr, labels): arr is a [batchSize, maxSeqLength] array of ids,
        truncated or right-padded with 0, with ids above wordVecsLast replaced
        by 0; labels is a [batchSize, numClasses] one-hot array.

    Raises:
        ValueError: if dataset is empty.
    """
    if len(dataset) == 0:
        raise ValueError('cannot build a batch from an empty dataset')

    labels = np.zeros([batchSize, numClasses])
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        sample = dataset[randint(0, len(dataset) - 1)]
        labels[i][sample[2]] = 1
        # Convert ids explicitly instead of relying on NumPy to cast a mixed
        # str/int list when it is assigned into the float array.
        ids = [int(k) for k in sample[1][:maxSeqLength]]
        clipped = [k if k <= wordVecsLast else 0 for k in ids]
        arr[i] = clipped + [0] * (maxSeqLength - len(clipped))
    return arr, labels


def getTrainBatch(dataset=None):
    """Return a random (ids, one-hot labels) batch from the training data.

    Args:
        dataset: optional rows to sample from; defaults to the global
            train_data.
    """
    if dataset is None:
        dataset = train_data
    return _buildBatch(dataset)


def getTestBatch(dataset=None):
    """Return a random (ids, one-hot labels) batch from the test data.

    Args:
        dataset: optional rows to sample from; defaults to the global
            test_data.
    """
    if dataset is None:
        dataset = test_data
    return _buildBatch(dataset)

# getTrainBatch()[1].shape

In [22]:
"""
RNN model
"""

import pickle
import numpy

# batchSize, maxSeqLength and numClasses are not redefined here; the values
# set in the batching cell are reused.
lstmUnits = 64
iterations = 100001

# Load the word-vector matrix.  The original author marked both the 'latin1'
# encoding and the float32 cast as important; the graph cells below build
# their tensors as tf.float32.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
wordVectors = None
with open('./glove_300_mosi.pkl', 'rb') as f:
    wordVectors = pickle.load(f, encoding='latin1')
    wordVectors = wordVectors.astype('float32')

# Size of the second axis of the word-vector matrix.
numDimensions = wordVectors.shape[1]

In [23]:
import tensorflow as tf

tf.reset_default_graph()

# Dropout keep probability; feed 1.0 when testing.
keep_prob = tf.placeholder(tf.float32)

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Look the word ids up directly in the word-vector matrix.  No tf.Variable is
# allocated for `data`: the lookup result is what feeds the RNN, so a
# pre-created variable would only sit unused in the graph.
data = tf.nn.embedding_lookup(wordVectors, input_data)


lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)


# NOTE(review): weight and bias use auto-generated variable names, which
# depend on how many unnamed variables the graph creates -- confirm the names
# before restoring checkpoints written by a different version of this graph.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Swap the first two axes so the last time step can be gathered along axis 0.
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


# Lenient metric: a prediction counts as correct when its class index is fewer
# than 3 away from the label's class index.  Exact-match alternative:
# correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
correctPred = tf.less(tf.abs(tf.subtract(tf.argmax(prediction,1), tf.argmax(labels,1))), 3)
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

In [25]:
# Reuse the live session when it belongs to the current default graph;
# otherwise (fresh kernel, or the graph cell was re-run) start a new session
# with freshly initialised variables so the cell does not depend on hidden
# notebook state.
try:
    session_is_stale = sess.graph is not tf.get_default_graph()
except NameError:
    session_is_stale = True
if session_is_stale:
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
# saver = tf.train.Saver()

for i in range(iterations):
    nextBatch, nextBatchLabels = getTrainBatch()
    los, acc, _ = sess.run([loss, accuracy, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 0.6})

    if i % 5000 == 0:
        print('Train', i, ':', los, ',', acc)

    if i % 10000 == 0:
        # Evaluation only: `optimizer` must not be fetched here, otherwise the
        # model takes a gradient step on the test batch and the reported test
        # metrics are contaminated.
        nextBatch, nextBatchLabels = getTestBatch()
        los, acc = sess.run([loss, accuracy], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 1.0})
        print('Test', i, ':', los, ',', acc)

    #Write summary to Tensorboard
#     if (i % 50 == 0):
#         summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
#         writer.add_summary(summary, i)

#     #Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
# writer.close()

Train 0 : 1.061e-05 , 1.0
Test 0 : 3.89239 , 0.9375
Train 5000 : 2.84648e-06 , 1.0
Train 10000 : 8.86367e-06 , 1.0
Test 10000 : 4.50274 , 0.863281
Train 15000 : 1.44885e-05 , 1.0
Train 20000 : 3.30928e-05 , 1.0
Test 20000 : 3.38689 , 0.902344
Train 25000 : 1.4761e-06 , 1.0
Train 30000 : 8.56814e-08 , 1.0
Test 30000 : 3.38189 , 0.921875
Train 35000 : 4.54948e-07 , 1.0
Train 40000 : 2.50604e-06 , 1.0
Test 40000 : 3.24991 , 0.921875
Train 45000 : 7.54365e-07 , 1.0
Train 50000 : 7.87942e-06 , 1.0
Test 50000 : 3.29555 , 0.890625
Train 55000 : 1.43994e-05 , 1.0


KeyboardInterrupt: 

In [28]:
import pandas as pd

# Compare predicted and true classes on one random test batch.
nextBatch, nextBatchLabels = getTestBatch()
# Fetch only `prediction`: fetching `optimizer` as well would run a training
# step on the test batch while "evaluating".
pred = sess.run(prediction, {input_data: nextBatch, keep_prob: 1.0})

true_classes = np.argmax(nextBatchLabels, 1)
pred_classes = np.argmax(pred, 1)

# Exact-match accuracy on this batch, followed by the per-sample comparison.
print(np.mean(true_classes == pred_classes))
print(pd.DataFrame({
    'label' : true_classes,
    'pred'  : pred_classes
}))

0.71484375
     label  pred
0        2     2
1        1     4
2        2     2
3        2     2
4        4     3
5        1     1
6        4     4
7        1     1
8        5     5
9        4     4
10       5     5
11       5     5
12       4     4
13       5     5
14       4     4
15       5     4
16       5     5
17       1     1
18       3     5
19       4     4
20       4     4
21       4     4
22       4     4
23       1     1
24       2     2
25       0     1
26       5     2
27       4     2
28       4     6
29       4     4
..     ...   ...
226      1     1
227      3     3
228      5     5
229      3     3
230      2     2
231      1     4
232      3     3
233      4     4
234      3     5
235      3     3
236      1     1
237      1     1
238      0     4
239      3     4
240      5     5
241      5     5
242      1     1
243      3     5
244      3     3
245      2     2
246      3     2
247      4     4
248      2     2
249      3     3
250      4     5
251      3     1
252

In [None]:
import tensorflow as tf

tf.reset_default_graph()

# Dropout keep probability; feed 1.0 when testing.
keep_prob = tf.placeholder(tf.float32)

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# Look the word ids up directly in the word-vector matrix.  No tf.Variable is
# allocated for `data`: the lookup result is what feeds the RNN, so a
# pre-created variable would only sit unused in the graph.
data = tf.nn.embedding_lookup(wordVectors, input_data)


lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)


# NOTE(review): weight and bias use auto-generated variable names, which
# depend on how many unnamed variables the graph creates -- confirm the names
# before restoring checkpoints written by a different version of this graph.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Swap the first two axes so the last time step can be gathered along axis 0.
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


# Exact-match accuracy: predicted class index must equal the label's index.
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)