In [1]:
"""
This block generates the train/test dataset.

[seg_id, text, sentiment] # text is replace with vocabulary ids

TODO:
embedding_lookup?
"""

import os
import csv
import pickle

train_ratio = 0.8
train_data_pkl = 'train_data.pkl'
test_data_pkl = 'test_data.pkl'


files = os.listdir('./Segmented/')
train_files = [f.split('.')[0] for f in files[:int(len(files) * train_ratio)]]
test_files = [f.split('.')[0] for f in files[int(len(files) * train_ratio):]]


sentiment_dict = {}
with open('../processed_labels_no_criteria.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        sentiment_dict[row[0]] = int(row[1])
        

voc_ids = {}
voc_ids_rev = {}

with open('./LanguageVocabulary.txt', 'r') as f:
    for row in f:
        row = row.split(' ')
        if len(row) < 3:
            continue
        k = row[0]
        v = row[1]
        voc_ids[v] = k
        voc_ids_rev[k] = v
        
        
train_data = []
test_data = []

for f in train_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [voc_ids[w] for w in row[1].rstrip().split(' ') if w]
            train_data.append([seg_id, text, sentiment_dict[seg_id]])
            
for f in test_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [voc_ids[w] for w in row[1].rstrip().split(' ') if w]
            test_data.append([seg_id, text, sentiment_dict[seg_id]])

            
with open(train_data_pkl, 'wb') as f:
    pickle.dump(train_data, f)
    
with open(test_data_pkl, 'wb') as f:
    pickle.dump(test_data, f)

In [2]:
"""
Defines helper functions for getting batches

replace word with its id, for later embedding lookup
"""

import numpy as np
from random import randint


batchSize = 64
maxSeqLength = 20
wordVecsLast = 3015

# load pickle

def getTrainBatch():
    labels = np.zeros([batchSize])
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(0, len(train_data) - 1)
        labels[i] = train_data[num][2]
        data = train_data[num][1]
        if len(data) >= maxSeqLength:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data[:maxSeqLength]]
        else:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data] + [0] * (maxSeqLength - len(data))
    return arr.astype('int32'), labels.astype('float32')


def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(0, len(test_data) - 1)
        labels.append(test_data[num][2])
        data = test_data[num][1]
        if len(data) >= maxSeqLength:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data[:maxSeqLength]]
        else:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data] + [0] * (maxSeqLength - len(data))
    return arr, labels


In [3]:
"""
RNN model
"""

import pickle
import numpy

# batchSize = 64      # already defined
# maxSeqLength = 20
lstmUnits = 64
numClasses = 2
iterations = 100001   # 100000

wordVectors = None
with open('./glove_300_mosi.pkl', 'rb') as f:
    wordVectors = pickle.load(f, encoding='latin1').astype('float32')    # important encoding, type casting

numDimensions = wordVectors.shape[1]

In [4]:
"""
Build graph
"""

num_layers = 1
beta = 0.01     # for regulation


import tensorflow as tf

tf.reset_default_graph()

keep_prob = tf.placeholder(tf.float32)        # keep_prob = 1.0 when testing

labels = tf.placeholder(tf.float32, [batchSize])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])


data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)


# lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
# lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
# value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)


def lstm_cell():
    lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
    lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
    return lstmCell
    
lstmCell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)



weight = tf.Variable(tf.truncated_normal([lstmUnits, 1]))
bias = tf.Variable(tf.constant(0.1))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

loss = tf.reduce_mean(tf.square(tf.subtract(prediction, labels)))
regulizer = tf.nn.l2_loss(weight)         # regulation
loss = tf.reduce_mean(loss + beta * regulizer)

optimizer = tf.train.AdamOptimizer().minimize(loss)

In [5]:
sess = tf.Session()
# saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    nextBatch, nextBatchLabels = getTrainBatch();
    los, _ = sess.run([loss, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 0.6})
    
    
    if (i % 5000 == 0):
        print('Train', i, ':', los)
        
    if (i % 10000 == 0):
        nextBatch, nextBatchLabels = getTestBatch();
        los, _ = sess.run([loss, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 1.0})
        print('Test', i, ':', los)
        
    #Write summary to Tensorboard
#     if (i % 50 == 0):
#         summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
#         writer.add_summary(summary, i)

#     #Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
# writer.close()

Train 0 : 10.5942
Test 0 : 11.2414
Train 5000 : 2.31188
Train 10000 : 2.32023
Test 10000 : 2.33417
Train 15000 : 3.04097
Train 20000 : 2.55794
Test 20000 : 2.37387
Train 25000 : 2.0716
Train 30000 : 2.13644
Test 30000 : 2.48797
Train 35000 : 2.73791
Train 40000 : 3.3782
Test 40000 : 2.46336
Train 45000 : 2.76666


KeyboardInterrupt: 

In [6]:
import pandas as pd





nextBatch, nextBatchLabels = getTestBatch();
pred, _ = sess.run([prediction, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 1.0})
# print(nextBatchLabels.shape, )
print('Test:\n', pd.DataFrame({
    'label' : np.reshape(nextBatchLabels, [-1]),
    'pred'  : np.reshape(pred, [-1])
}))

Test:
     label      pred
0       4  3.027126
1       3  3.014379
2       3  3.021698
3       2  3.020670
4       1  3.039479
5       2  3.034744
6       1  3.030276
7       2  3.013173
8       2  3.034900
9       1  3.027642
10      1  3.032792
11      2  3.028346
12      1  3.036517
13      5  3.054724
14      4  3.027234
15      6  3.037562
16      1  3.061195
17      2  3.042900
18      3  3.042486
19      5  3.019990
20      5  3.027966
21      1  3.035209
22      1  3.031789
23      4  3.034389
24      5  3.042214
25      1  3.038699
26      4  3.047399
27      4  3.037538
28      6  3.021399
29      6  3.020242
..    ...       ...
34      4  3.049342
35      6  3.037520
36      3  3.058545
37      5  3.025038
38      6  3.039189
39      5  3.022470
40      3  3.019446
41      5  3.040059
42      2  3.033320
43      3  3.061849
44      3  3.019408
45      3  3.071329
46      1  3.042590
47      5  3.042694
48      5  3.051822
49      2  3.031372
50      1  3.035062
51      4  3.

In [7]:
nextBatch, nextBatchLabels = getTrainBatch();
pred, _ = sess.run([prediction, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 1.0})
# print(nextBatchLabels.shape, )
print('Train:\n', pd.DataFrame({
    'label' : np.reshape(nextBatchLabels, [-1]),
    'pred'  : np.reshape(pred, [-1])
}))

Train:
     label      pred
0     2.0  3.018102
1     6.0  3.044740
2     1.0  3.017677
3     4.0  3.055537
4     5.0  3.057814
5     6.0  3.061803
6     3.0  3.056198
7     1.0  3.021312
8     5.0  3.054538
9     2.0  3.011974
10    3.0  3.031794
11    4.0  3.040390
12    2.0  3.041854
13    6.0  3.066431
14    4.0  3.045488
15    5.0  3.054424
16    4.0  3.034466
17    4.0  3.057771
18    1.0  3.004414
19    5.0  3.038847
20    1.0  3.013914
21    5.0  3.047146
22    5.0  3.029821
23    5.0  3.033118
24    3.0  3.060281
25    5.0  3.039179
26    4.0  3.031939
27    2.0  3.030317
28    5.0  3.040951
29    0.0  3.034709
..    ...       ...
34    5.0  3.046053
35    5.0  3.066377
36    5.0  3.060128
37    4.0  3.046185
38    4.0  3.035943
39    1.0  3.000264
40    4.0  3.038271
41    0.0  3.020574
42    0.0  3.029723
43    4.0  3.022935
44    6.0  3.052365
45    1.0  3.053542
46    3.0  3.042822
47    1.0  3.035927
48    1.0  3.021806
49    2.0  3.031952
50    5.0  3.051421
51    5.0  3

In [None]:
import tensorflow as tf

tf.reset_default_graph()

keep_prob = tf.placeholder(tf.float32)        # 1.0 when testing

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])


data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)


lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)


weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

In [None]:
"""
Generate data for another pretrained model
"""

import os
import csv
import pickle

train_ratio = 0.8
train_data_pkl = 'train_data_another.pkl'
test_data_pkl = 'test_data_another.pkl'


files = os.listdir('./Segmented/')
train_files = [f.split('.')[0] for f in files[:int(len(files) * train_ratio)]]
test_files = [f.split('.')[0] for f in files[int(len(files) * train_ratio):]]


sentiment_dict = {}
with open('../processed_labels_no_criteria.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        sentiment_dict[row[0]] = int(row[1])
        

voc_ids = {}
voc_ids_rev = {}

with open('./LanguageVocabulary.txt', 'r') as f:
    for row in f:
        row = row.split(' ')
        if len(row) < 3:
            continue
        k = row[0]
        v = row[1]
        voc_ids[v] = k
        voc_ids_rev[k] = v
        
        
train_data = []
test_data = []

for f in train_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [w for w in row[1].rstrip().split(' ') if w]
            train_data.append([seg_id, text, sentiment_dict[seg_id]])
            
for f in test_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [w for w in row[1].rstrip().split(' ') if w]
            test_data.append([seg_id, text, sentiment_dict[seg_id]])


import numpy as np
from random import randint


batchSize = 64
maxSeqLength = 20
wordVecsLast = 3015

# load pickle

def getTrainBatch():
    labels = np.zeros([batchSize])
    arr = [None] * batchSize
    for i in range(batchSize):
        num = randint(0, len(train_data) - 1)
        labels[i] = train_data[num][2]
        arr[i] = train_data[num][1]
    return arr, labels


def getTestBatch():
    labels = []
    arr = [None] * batchSize
    for i in range(batchSize):
        num = randint(0, len(test_data) - 1)
        labels.append(test_data[num][2])
        arr[i] = ' '.join([w.lower() for w in test_data[num][1]])
    return arr, labels

            
# with open(train_data_pkl, 'wb') as f:
#     pickle.dump(train_data, f)


test_data = getTestBatch()

# print(arr[:3], labels[:3])

with open(test_data_pkl, 'wb') as f:
    pickle.dump(test_data, f)
    