In [1]:
import io

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing import text, sequence

In [18]:
# --- Vocabulary / embedding ---
# Passed to the tokenizer as num_words and used as the row count of the embedding matrix.
VOCAB_SIZE = 5000
# Width of each word vector; used as the column count of the embedding matrix.
EMBEDDING_DIM = 300
# Placeholder token handed to the tokenizer for out-of-vocabulary words.
OOV_TOKEN = '<OOV>'

# --- Sequence shaping (arguments to pad_sequences) ---
MAX_LEN = 20
PADDING_TYPE = 'post'
TRUNC_TYPE = 'post'

# --- Classifier head ---
# Number of units in the final softmax layer.
LABEL_NUMBER = 6

In [19]:
# Dataset variant to load. The suffix is inserted into the CSV file names under ../data:
#   ''             -> train.csv             / test.csv
#   '_mod'         -> train_mod.csv         / test_mod.csv
#   '_mod_3labels' -> train_mod_3labels.csv / test_mod_3labels.csv
# NOTE(review): LABEL_NUMBER and the `labels` list used for prediction presumably
# have to agree with the number of emotions in the chosen variant -- confirm when switching.
DATASET_VARIANT = '_mod'

train_data = pd.read_csv('../data/train' + DATASET_VARIANT + '.csv')  # for training
test_data = pd.read_csv('../data/test' + DATASET_VARIANT + '.csv')  # for testing

In [20]:
# Fit a word-level tokenizer on the training sentences only, then expose its
# word -> integer id mapping for the embedding lookup.
sentence_tokenizer = text.Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token=OOV_TOKEN,
)
sentence_tokenizer.fit_on_texts(train_data['sentence'].values)
word_index = sentence_tokenizer.word_index

In [21]:
# Parse the pretrained word vectors (text format: one "<word> <v1> ... <vD>" row
# per line) into a {word: float32 vector} lookup.
embeddings_index = {}
with io.open('../pretrained/cc.vi.300.vec', encoding='utf-8') as ft:
    for line in ft:
        values = line.rstrip().split(' ')
        # Keep only rows that carry exactly EMBEDDING_DIM components. This skips the
        # "<word count> <dimension>" header that fastText .vec files start with, as
        # well as any truncated/malformed row, instead of storing a wrong-sized vector.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

# Row i of the matrix holds the vector for the word whose tokenizer id is i.
# Rows without a pretrained vector stay all-zero.
words_not_found = []
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in word_index.items():
    # word_index lists every word seen while fitting, not only the VOCAB_SIZE most
    # frequent ones, so ids can exceed the matrix height. Such ids have no row here,
    # and indexing them would raise IndexError.
    if i >= VOCAB_SIZE:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)

# Count rows that are entirely zero (a row whose components merely sum to zero is not null).
print('number of null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

number of null word embeddings: 1144


In [8]:
def _encode_sentences(sentences):
    """Turn raw sentences into fixed-length rows of integer word ids.

    Each sentence is mapped to word ids with ``sentence_tokenizer`` and then
    padded/truncated to ``MAX_LEN`` using ``PADDING_TYPE`` / ``TRUNC_TYPE``.
    """
    token_ids = sentence_tokenizer.texts_to_sequences(sentences)
    return sequence.pad_sequences(
        token_ids,
        maxlen=MAX_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNC_TYPE,
    )


# Encode both splits with the same tokenizer and padding settings.
train_sentence = _encode_sentences(train_data.sentence.values)
test_sentence = _encode_sentences(test_data.sentence.values)

In [9]:
# One-hot encode the emotion labels (one column per distinct emotion).
# An explicit float dtype keeps the targets numeric regardless of the pandas default.
train_emotion = pd.get_dummies(train_data.emotion.values, dtype='float32')
test_emotion = pd.get_dummies(test_data.emotion.values, dtype='float32')

# get_dummies derives its columns from the labels present in the data it is given,
# so encoding the two splits independently can yield different column sets.
# Fail loudly on labels the model was never trained on ...
unseen_emotions = test_emotion.columns.difference(train_emotion.columns).tolist()
if len(unseen_emotions) > 0:
    raise ValueError(
        'test set contains emotions missing from the training set: %s' % unseen_emotions
    )
# ... and give the test targets exactly the training columns, in the same order
# (an emotion absent from the test split becomes an all-zero column).
test_emotion = test_emotion.reindex(columns=train_emotion.columns, fill_value=0.0)

# The softmax layer is sized by LABEL_NUMBER; catch a mismatch here rather than
# as a shape error in the middle of fit().
if train_emotion.shape[1] != LABEL_NUMBER:
    raise ValueError(
        'found %d distinct emotions in the training set but LABEL_NUMBER is %d'
        % (train_emotion.shape[1], LABEL_NUMBER)
    )

In [15]:
# Bidirectional-LSTM sentence classifier on top of frozen pretrained word vectors.
model = keras.Sequential([
    # Embedding table initialised from embedding_matrix and excluded from training.
    layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,
    ),
    # Element-wise dropout on the embedded sequence
    # (layers.SpatialDropout1D(.5) was tried here as an alternative).
    layers.Dropout(rate=.5),
    layers.Bidirectional(layers.LSTM(units=128)),
    layers.Dense(units=64, activation='sigmoid'),
    # (an extra layers.Dropout(.5) before the output layer is currently disabled)
    layers.Dense(units=LABEL_NUMBER, activation='softmax'),
])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 300)           1500000   
_________________________________________________________________
dropout_6 (Dropout)          (None, 20, 300)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               439296    
_________________________________________________________________
dense_6 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_7 (Dense)              (None, 7)                 455       
Total params: 1,956,199
Trainable params: 456,199
Non-trainable params: 1,500,000
_________________________________________________________________


In [16]:
# Single-output model: metrics are passed as one flat list (the extra level of
# nesting is only meaningful for per-output metric lists of multi-output models).
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

initial_epochs = 30

# Stop once validation loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# Independently keep the weights of the epoch with the best validation accuracy.
# Note the two callbacks watch different quantities, so the checkpointed epoch need
# not be the one with the lowest validation loss.
mc = ModelCheckpoint('LSTMV2.h5', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)

# The last 10% of the training data is held out for validation.
history = model.fit(
    train_sentence,
    train_emotion,
    batch_size=64,
    epochs=initial_epochs,
    validation_split=.1,
    verbose=1,
    callbacks=[es, mc],
)

Train on 4993 samples, validate on 555 samples
Epoch 1/30
Epoch 00001: val_accuracy improved from -inf to 0.32252, saving model to LSTMV2.h5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.32252 to 0.34955, saving model to LSTMV2.h5
Epoch 3/30
Epoch 00003: val_accuracy improved from 0.34955 to 0.39099, saving model to LSTMV2.h5
Epoch 4/30
Epoch 00004: val_accuracy improved from 0.39099 to 0.42703, saving model to LSTMV2.h5
Epoch 5/30
Epoch 00005: val_accuracy improved from 0.42703 to 0.44685, saving model to LSTMV2.h5
Epoch 6/30
Epoch 00006: val_accuracy did not improve from 0.44685
Epoch 7/30
Epoch 00007: val_accuracy improved from 0.44685 to 0.47387, saving model to LSTMV2.h5
Epoch 8/30
Epoch 00008: val_accuracy did not improve from 0.47387
Epoch 9/30
Epoch 00009: val_accuracy improved from 0.47387 to 0.48829, saving model to LSTMV2.h5
Epoch 10/30
Epoch 00010: val_accuracy improved from 0.48829 to 0.49730, saving model to LSTMV2.h5
Epoch 11/30
Epoch 00011: val_accuracy did not i

In [17]:
# Restore the checkpointed weights from LSTMV2.h5 and score them on the held-out test set.
# `result` is the loss followed by the metrics in the order given to compile().
model.load_weights('LSTMV2.h5')
result = model.evaluate(x=test_sentence, y=test_emotion)
print(result)

[1.3289614133160523, 0.5007215, 0.6548043, 0.26551226]


In [12]:
sentence = 'Thật là kinh khủng'

# Class names, indexed by the model's output position.
# NOTE(review): this order presumably has to match the column order of the one-hot
# training targets -- confirm against train_emotion.columns.
# Other dataset variants used:
#   ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Other', 'Sadness', 'Surprise']
#   ['Negative', 'Neutral', 'Positive']
labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Sadness', 'Surprise']

# Encode exactly like the training data; `sentence` keeps the raw text.
sentence_encoded = sentence_tokenizer.texts_to_sequences([sentence])
sentence_encoded = sequence.pad_sequences(sentence_encoded, maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

pred = model.predict(sentence_encoded)

# A model trained on a different label set would otherwise be silently mislabelled
# (or fail with a bare IndexError) when mapping the arg-max back to a name.
if pred.shape[-1] != len(labels):
    raise ValueError(
        'model outputs %d classes but %d labels are defined' % (pred.shape[-1], len(labels))
    )

print(pred)
# Arg-max over the class scores of the single input row (not over the flattened batch).
print(labels[int(np.argmax(pred[0]))])

[[0.00918274 0.0775242  0.55239874 0.01682269 0.06917091 0.27490067]]
Enjoyment
