In [1]:
"""
This block generates the train/test dataset.

[seg_id, text, sentiment] # text is replace with vocabulary ids

TODO:
embedding_lookup?
"""

import os
import csv
import pickle

train_ratio = 0.8
train_data_pkl = 'train_data.pkl'
test_data_pkl = 'test_data.pkl'


files = os.listdir('./Segmented/')
train_files = [f.split('.')[0] for f in files[:int(len(files) * train_ratio)]]
test_files = [f.split('.')[0] for f in files[int(len(files) * train_ratio):]]


sentiment_dict = {}
with open('../processed_labels_no_criteria.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        sentiment_dict[row[0]] = int(row[1])
        

voc_ids = {}
voc_ids_rev = {}

with open('./LanguageVocabulary.txt', 'r') as f:
    for row in f:
        row = row.split(' ')
        if len(row) < 3:
            continue
        k = row[0]
        v = row[1]
        voc_ids[v] = k
        voc_ids_rev[k] = v
        
        
train_data = []
test_data = []

for f in train_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [voc_ids[w] for w in row[1].rstrip().split(' ') if w]
            train_data.append([seg_id, text, sentiment_dict[seg_id]])
            
for f in test_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [voc_ids[w] for w in row[1].rstrip().split(' ') if w]
            test_data.append([seg_id, text, sentiment_dict[seg_id]])

            
with open(train_data_pkl, 'wb') as f:
    pickle.dump(train_data, f)
    
with open(test_data_pkl, 'wb') as f:
    pickle.dump(test_data, f)

In [2]:
"""
Defines helper functions for getting batches

replace word with its id, for later embedding lookup
"""

import numpy as np
from random import randint


batchSize = 64
maxSeqLength = 20
wordVecsLast = 3015

# load pickle

def getTrainBatch():
    labels = np.zeros([batchSize])
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(0, len(train_data) - 1)
        labels[i] = train_data[num][2]
        data = train_data[num][1]
        if len(data) >= maxSeqLength:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data[:maxSeqLength]]
        else:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data] + [0] * (maxSeqLength - len(data))
    return arr.astype('int32'), labels.astype('float32')


def getTestBatch():
    labels = []
    arr = np.zeros([batchSize, maxSeqLength])
    for i in range(batchSize):
        num = randint(0, len(test_data) - 1)
        labels.append(test_data[num][2])
        data = test_data[num][1]
        if len(data) >= maxSeqLength:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data[:maxSeqLength]]
        else:
            arr[i] = [k if int(k) <= wordVecsLast else 0 for k in data] + [0] * (maxSeqLength - len(data))
    return arr, labels


In [3]:
"""
RNN model
"""

import pickle
import numpy

# batchSize = 64      # already defined
# maxSeqLength = 20
lstmUnits = 64
numClasses = 2
iterations = 100001   # 100000

wordVectors = None
with open('./glove_300_mosi.pkl', 'rb') as f:
    wordVectors = pickle.load(f, encoding='latin1').astype('float32')    # important encoding, type casting

numDimensions = wordVectors.shape[1]

In [6]:
"""
Build graph
"""

num_layers = 3


import tensorflow as tf

tf.reset_default_graph()

keep_prob = tf.placeholder(tf.float32)        # keep_prob = 1.0 when testing

labels = tf.placeholder(tf.float32, [batchSize])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])


data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors, input_data)


# lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
# lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
# value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)


def lstm_cell():
    lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
    lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
    return lstmCell
    
lstmCell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(num_layers)])
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)



weight = tf.Variable(tf.truncated_normal([lstmUnits, 1]))
bias = tf.Variable(tf.constant(0.1))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

loss = tf.reduce_mean(tf.square(tf.subtract(prediction, labels)))        # regulation?
optimizer = tf.train.AdamOptimizer().minimize(loss)

In [8]:
sess = tf.Session()
# saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    nextBatch, nextBatchLabels = getTrainBatch();
    los, _ = sess.run([loss, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 0.6})
    
    
    if (i % 5000 == 0):
        print('Train', i, ':', los)
        
    if (i % 10000 == 0):
        nextBatch, nextBatchLabels = getTestBatch();
        los, _ = sess.run([loss, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 1.0})
        print('Test', i, ':', los)
        
    #Write summary to Tensorboard
#     if (i % 50 == 0):
#         summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
#         writer.add_summary(summary, i)

#     #Save the network every 10,000 training iterations
#     if (i % 10000 == 0 and i != 0):
#         save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
#         print("saved to %s" % save_path)
# writer.close()

Train 0 : 12.2901
Test 0 : 7.7319


In [18]:
import pandas as pd

nextBatch, nextBatchLabels = getTestBatch();
pred, _ = sess.run([prediction, optimizer], {input_data: nextBatch, labels: nextBatchLabels, keep_prob: 1.0})
# print(nextBatchLabels.shape, )
print(pd.DataFrame({
    'label' : np.reshape(nextBatchLabels, [-1]),
    'pred'  : np.reshape(pred, [-1])
}))

    label      pred
0       5  3.021466
1       2  3.056729
2       4  3.041991
3       4  3.024831
4       1  3.021790
5       3  3.026036
6       4  3.024738
7       5  3.030445
8       4  3.022862
9       2  3.022340
10      2  3.015999
11      1  3.021473
12      5  3.022049
13      5  3.021826
14      1  3.041781
15      3  3.024398
16      3  3.023578
17      2  3.021894
18      1  3.041781
19      2  3.021894
20      1  3.009306
21      2  3.030623
22      3  3.032790
23      4  3.022111
24      5  3.014538
25      1  3.023248
26      3  3.022216
27      2  2.989534
28      5  3.053723
29      5  3.052557
..    ...       ...
34      2  3.021382
35      4  3.021699
36      4  3.023288
37      5  3.012693
38      3  3.022768
39      5  3.057554
40      1  3.023263
41      2  3.027979
42      5  3.053298
43      1  3.023126
44      3  3.026036
45      3  3.011357
46      1  3.021340
47      2  3.029730
48      3  3.021776
49      2  3.012153
50      3  3.028682
51      4  3.047035


In [54]:
import tensorflow as tf

tf.reset_default_graph()

keep_prob = tf.placeholder(tf.float32)        # 1.0 when testing

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])


data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)


lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=keep_prob)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)


weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)


correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

In [8]:
"""
Generate data for another pretrained model
"""

import os
import csv
import pickle

train_ratio = 0.8
train_data_pkl = 'train_data_another.pkl'
test_data_pkl = 'test_data_another.pkl'


files = os.listdir('./Segmented/')
train_files = [f.split('.')[0] for f in files[:int(len(files) * train_ratio)]]
test_files = [f.split('.')[0] for f in files[int(len(files) * train_ratio):]]


sentiment_dict = {}
with open('../processed_labels_no_criteria.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        sentiment_dict[row[0]] = int(row[1])
        

voc_ids = {}
voc_ids_rev = {}

with open('./LanguageVocabulary.txt', 'r') as f:
    for row in f:
        row = row.split(' ')
        if len(row) < 3:
            continue
        k = row[0]
        v = row[1]
        voc_ids[v] = k
        voc_ids_rev[k] = v
        
        
train_data = []
test_data = []

for f in train_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [w for w in row[1].rstrip().split(' ') if w]
            train_data.append([seg_id, text, sentiment_dict[seg_id]])
            
for f in test_files:
    with open('./Segmented/' + f + '.annotprocessed', 'r') as s:
        for row in s:
            row = row.split('_DELIM_')
            seg_id = f + '_' + row[0]
            text = [w for w in row[1].rstrip().split(' ') if w]
            test_data.append([seg_id, text, sentiment_dict[seg_id]])


import numpy as np
from random import randint


batchSize = 64
maxSeqLength = 20
wordVecsLast = 3015

# load pickle

def getTrainBatch():
    labels = np.zeros([batchSize])
    arr = [None] * batchSize
    for i in range(batchSize):
        num = randint(0, len(train_data) - 1)
        labels[i] = train_data[num][2]
        arr[i] = train_data[num][1]
    return arr, labels


def getTestBatch():
    labels = []
    arr = [None] * batchSize
    for i in range(batchSize):
        num = randint(0, len(test_data) - 1)
        labels.append(test_data[num][2])
        arr[i] = ' '.join([w.lower() for w in test_data[num][1]])
    return arr, labels

            
# with open(train_data_pkl, 'wb') as f:
#     pickle.dump(train_data, f)


test_data = getTestBatch()

# print(arr[:3], labels[:3])

with open(test_data_pkl, 'wb') as f:
    pickle.dump(test_data, f)
    