In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime
from sklearn.model_selection import train_test_split
import _pickle as pickle

In [2]:
def progbar(curr, total, full_progbar):
    """
    Print a one-line text progress bar for long-running loops.

    Reference: https://geekyisawesome.blogspot.com/2016/07/python-console-progress-bar-using-b-and.html

    Args:
        curr: Number of items processed so far.
        total: Total number of items. A value <= 0 is reported as 0%
            instead of raising ZeroDivisionError.
        full_progbar: Width of the bar in characters.

    Returns:
        None. The bar is written to stdout.
    """
    frac = curr / total if total > 0 else 0.0
    # Clamp only the drawn bar so it always has exactly `full_progbar` cells;
    # the printed percentage still reflects the real fraction.
    filled_progbar = min(max(round(frac * full_progbar), 0), full_progbar)
    print('\r', '#'*filled_progbar + '-'*(
        full_progbar-filled_progbar), f"[{curr}/{total}, {frac:>7.2%}]\n", end='')

In [3]:
# Read the tweet/emotion table from the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="./text_emotion.csv")
df.head(n=5)

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [4]:
# ---- Input geometry ----
num_dimensions = 300    # components kept from each word vector
max_seq_length = 40     # token rows per example; shorter examples are zero-padded

# ---- Model ----
lstm_units = 256        # num_units of the LSTM cell
num_classes = 13        # width of the one-hot label and of the output layer

# ---- Training ----
batch_size = 24         # NOTE(review): not referenced by the visible cells
iterations = 100        # number of optimizer steps run by the training loop

In [None]:
def _float_or_zero(text):
    """Return float(text), or 0.0 when text is not a valid float literal."""
    try:
        return float(text)
    except ValueError:
        return 0.0


def _parse_glove_line(line, dim):
    """Split one line of the embedding text file into (token, vector).

    The last `dim` space-separated fields are taken as the vector and
    everything before them as the token, so a token that itself contains
    whitespace is kept intact instead of leaking into the numeric fields.
    (The previous parser split on any whitespace and needed a fallback for
    non-numeric fields, which suggests such tokens occur - TODO confirm.)

    Returns:
        (token, float32 array of length `dim`), or None when the line has
        fewer than `dim` + 1 fields.
    """
    fields = line.rstrip().split(" ")
    if len(fields) <= dim:
        return None
    token = " ".join(fields[:-dim])
    values = fields[-dim:]
    try:
        vector = np.array(values).astype(np.float32)
    except ValueError:
        # Best effort, as before: zero out only the fields that fail to parse.
        vector = np.array([_float_or_zero(v) for v in values], dtype=np.float32)
    return token, vector


if input("Re-read embedding?[Y/N]>>> ").strip().upper() == "Y":
    glove_path = '/home/ec2-user/data/glove.840B.300d.txt'
    glove_dim = 300  # vector length of this file, per the "300d" in its name

    # First pass only counts lines, so the progress bar has a total without
    # keeping every line of the file in memory at once.
    with open(glove_path, encoding='utf-8') as f:
        n_lines = sum(1 for _ in f)

    total_dict = {}
    # Report roughly 100 times in total instead of once per line.
    report_every = max(1, n_lines // 100)
    with open(glove_path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            if line_no % report_every == 0:
                progbar(line_no, n_lines, 40)
            parsed = _parse_glove_line(line, glove_dim)
            if parsed is None:
                continue
            word, vector = parsed
            total_dict[word] = vector

    # Cache the parsed table so later runs can skip the text parsing.
    with open('dict.pkl', 'wb') as f:
        pickle.dump(total_dict, f)
else:
    # NOTE: unpickling can execute arbitrary code; only load a dict.pkl that
    # was written by the branch above.
    with open('./dict.pkl', 'rb') as f:
        total_dict = pickle.load(f)

Re-read embedding?[Y/N]Y
 ##-------------------------------------- [108288/2196017,   4.93%]------------------------------------- [486/2196017,   0.02%][3954/2196017,   0.18%][4389/2196017,   0.20%] [4801/2196017,   0.22%]---------------------------------------- [5011/2196017,   0.23%][6676/2196017,   0.30%] [7483/2196017,   0.34%][8806/2196017,   0.40%][9210/2196017,   0.42%][9630/2196017,   0.44%]---------------------------------------- [10056/2196017,   0.46%][10464/2196017,   0.48%]---------------------------------------- [11315/2196017,   0.52%]---------------------------------------- [12148/2196017,   0.55%] [12599/2196017,   0.57%][15940/2196017,   0.73%][17292/2196017,   0.79%][19885/2196017,   0.91%][22417/2196017,   1.02%][23655/2196017,   1.08%][24487/2196017,   1.12%][24916/2196017,   1.13%][25252/2196017,   1.15%][25488/2196017,   1.16%][27267/2196017,   1.24%][28116/2196017,   1.28%][28973/2196017,   1.32%][29786/2196017,   1.36%]#--------------------------------------- [

In [None]:
# Label vocabulary: position in this list is the class index of a sentiment.
# Sorted because the iteration order of a set of strings can differ between
# Python processes, which would silently renumber the classes on every
# kernel restart.
senti_lst = sorted(set(df["sentiment"]))
# Same list under the second name that other cells use for the vocabulary.
sentis = senti_lst
print(senti_lst)

In [None]:
# Build the model inputs:
#   X_raw[i] - max_seq_length rows of num_dimensions, one row per token of
#              tweet i, with unused trailing rows left as zeros (padding);
#   y_raw[i] - one-hot row marking the tweet's sentiment in senti_lst.
n_rows = len(df)

# Preallocated as float32 (the dtype of the graph's input placeholder) so
# the zero padding does not upcast the whole tensor to float64.
X_raw = np.zeros([n_rows, max_seq_length, num_dimensions], dtype=np.float32)
y_raw = np.zeros([n_rows, num_classes])

# Words missing from the embedding table are represented by the vector of
# "something", cut to the same length as every other vector.
oov_vector = total_dict["something"][:num_dimensions]
label_index = {senti: k for k, senti in enumerate(senti_lst)}

# Report roughly 100 times in total instead of once per row.
report_every = max(1, n_rows // 100)
for i, (senti, item) in enumerate(zip(df["sentiment"], df["content"])):
    if i % report_every == 0:
        progbar(i, n_rows, 50)

    # Keep at most max_seq_length tokens so every example has the same shape.
    tokens = item.split()[:max_seq_length]
    for pos, word in enumerate(tokens):
        vec = total_dict.get(word)
        X_raw[i, pos] = oov_vector if vec is None else vec[:num_dimensions]

    # ==== Process y ====
    y_raw[i, label_index[senti]] = 1

# Every label row must be exactly one-hot.
assert (y_raw.sum(axis=1) == 1).all()

In [None]:
# Report the dimensions of the feature tensor and of the label matrix.
for _name, _array in (("X", X_raw), ("y", y_raw)):
    print(f"{_name} shape: {_array.shape}")

In [None]:
# Round-trip check: decoding each one-hot row of y_raw through senti_lst
# must give back the sentiment stored in the same row of df.
for (i, truth) in zip(range(y_raw.shape[0]), df["sentiment"]):
    # Position of the single 1 in row i, converted to a plain int for indexing.
    idx = int(np.squeeze(np.where(y_raw[i, :] == 1)))
    assert senti_lst[idx] == truth, (
        f"row {i}: decoded {senti_lst[idx]!r}, expected {truth!r}"
    )

In [None]:
# Fixed seed so the train / validation / test membership is identical on
# every run of the notebook.
split_seed = 42

# Hold out 20% of all examples as the test set.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X_raw, y_raw,
    test_size=0.2,
    shuffle=True,
    random_state=split_seed
)

# Carve 20% of the remaining examples off as the validation set.
(X_train, X_val,
 y_train, y_val) = train_test_split(
    X_train, y_train,
    test_size=0.2,
    shuffle=True,
    random_state=split_seed
)

In [None]:
# Summarise the shape of every array produced by the split.
split_arrays = [
    ("X_train", X_train), ("y_train", y_train),
    ("X_test", X_test), ("y_test", y_test),
    ("X_validation", X_val), ("y_validation", y_val),
]
report_lines = ["Training and testing set generated,"]
report_lines.extend(
    f"{split_name} shape: {split_array.shape}"
    for split_name, split_array in split_arrays
)
print("\n".join(report_lines))

In [None]:
# Shape of the full feature tensor. `X_ary` is not defined in any cell of
# this notebook; the array built by the preprocessing cell is `X_raw`.
X_raw.shape

In [None]:
# Reuse the label vocabulary built earlier instead of recomputing it from
# df, so `sentis` and `senti_lst` cannot disagree on a sentiment's index.
sentis = list(senti_lst)
print(sentis)

In [None]:
# Class index of the first row's sentiment in the label vocabulary.
first_sentiment = df["sentiment"][0]
sentis.index(first_sentiment)

In [None]:
# Sanity check on the first tweet: each token's row in X_raw[0] should be
# the vector looked up for that token, so the summed distance should be 0.
first_tokens = df.content[0].split()
cost = 0
for token, row in zip(first_tokens, X_raw[0]):
    try:
        expected = total_dict[token]
    except KeyError:
        # Words missing from the table were embedded as "something".
        expected = total_dict["something"]
    cost += np.linalg.norm(expected[:num_dimensions] - row)
print(cost)

In [None]:
# Start from an empty default graph before defining the model.
tf.reset_default_graph()

with tf.name_scope("DATA_IO"):
    # Input: max_seq_length word vectors of num_dimensions per example.
    X = tf.placeholder(
        dtype=tf.float32,
        shape=[None, max_seq_length, num_dimensions],
    )
    # Target: one row of num_classes values per example.
    y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes])

with tf.name_scope("RNN"):
    # One LSTM layer whose outputs go through dropout.
    # NOTE(review): output_keep_prob is a constant, so dropout appears to stay
    # active whenever the graph is run, including on validation data - confirm.
    lstm_cell = tf.contrib.rnn.DropoutWrapper(
        cell=tf.contrib.rnn.LSTMCell(num_units=lstm_units),
        output_keep_prob=0.75,
    )
    outputs, state = tf.nn.dynamic_rnn(lstm_cell, X, dtype=tf.float32)

with tf.name_scope("OUTPUT"):
    # Dense projection from the LSTM output to one logit per class.
    weight = tf.Variable(tf.truncated_normal([lstm_units, num_classes]))
    bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))

    # Classify from the RNN output at the final time step.
    # NOTE(review): the preprocessing cell zero-pads short tweets at the end
    # and no sequence_length is passed to dynamic_rnn, so for most examples
    # this step follows a run of padding rows - confirm this is intended.
    last = outputs[:, -1, :]
    pred = tf.matmul(last, weight) + bias

In [None]:
with tf.name_scope("METRICS"):
    # A prediction counts as correct when the largest logit sits at the same
    # position as the largest entry of the label row.
    predicted_class = tf.argmax(pred, axis=1)
    true_class = tf.argmax(y, axis=1)
    correct_pred = tf.equal(predicted_class, true_class)

    # Fraction of correct predictions in the fed batch.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

with tf.name_scope("LOSSES"):
    # Mean softmax cross-entropy between the logits and the labels.
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=pred,
        labels=y,
    )
    loss = tf.reduce_mean(per_example_loss)
    # `optimizer` holds the op returned by minimize(); running it performs
    # one Adam update step.
    optimizer = tf.train.AdamOptimizer().minimize(loss)

# Scalar summaries that merge_all() will pick up.
for summary_tag, summary_tensor in (("Loss", loss), ("Accuracy", accuracy)):
    tf.summary.scalar(summary_tag, summary_tensor)

In [None]:
# Create the session, initialise all variables and train for `iterations`
# steps, writing validation summaries to a time-stamped log directory.
sess = tf.Session()
saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

merged = tf.summary.merge_all()
logdir = "./tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
# Keep a handle on the writer: the loop below needs it to record summaries.
writer = tf.summary.FileWriter(logdir, sess.graph)

try:
    for e in range(iterations):
        # One optimizer step on the whole training set.
        # NOTE(review): batch_size is defined but not used here - confirm
        # that full-batch updates are intended.
        sess.run(
            optimizer,
            feed_dict={
                X: X_train,
                y: y_train
            }
        )

        # Every 5th step, evaluate the summaries on the validation set and
        # log them. Writing happens inside this branch so that only freshly
        # computed summaries are recorded.
        if e % 5 == 0:
            summary = sess.run(
                merged,
                feed_dict={
                    X: X_val,
                    y: y_val
                }
            )
            writer.add_summary(summary, e)
finally:
    # Close once, after training, so events from every logged step are
    # written out even if training is interrupted.
    writer.close()