In [8]:
import tensorflow as tf 
import pandas as pd 
import numpy as np 

from pyvi import ViTokenizer,ViPosTagger
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [9]:
# --- Text-encoding hyperparameters ---------------------------------------
# Tokenizer settings: vocabulary cap and the placeholder string substituted
# for out-of-vocabulary words.
OOV_TOKEN = '<OOV>'
VOCAB_SIZE = 5000

# Padding settings: every sentence becomes exactly MAX_LEN ids, padded and
# truncated at the end ('post').
MAX_LEN = 20
PADDING_TYPE = 'post'
TRUNC_TYPE = 'post'

# Size of each word vector produced by the Embedding layer.
EMBEDDING_DIM = 64

In [10]:
# Labelled corpora; later cells read the `sentence` and `emotion` columns.
train_data, test_data = (
    pd.read_csv('../data/train.csv'),  # used to fit the tokenizer and the model
    pd.read_csv('../data/test.csv'),   # used for held-out evaluation
)

In [11]:
# Learn the vocabulary from the training sentences only, so the test split
# contributes nothing to the word -> id mapping.
sentence_tokenizer = text.Tokenizer(oov_token=OOV_TOKEN, num_words=VOCAB_SIZE)
sentence_tokenizer.fit_on_texts(train_data['sentence'].values)

# word -> integer id mapping built by fit_on_texts.
word_index = sentence_tokenizer.word_index

In [12]:
def _to_padded_ids(sentences):
    """Encode raw sentences as fixed-width integer id sequences.

    Each sentence is mapped to word ids by ``sentence_tokenizer`` and then
    padded/truncated to ``MAX_LEN`` according to ``PADDING_TYPE`` and
    ``TRUNC_TYPE``.
    """
    id_sequences = sentence_tokenizer.texts_to_sequences(sentences)
    return sequence.pad_sequences(
        id_sequences,
        maxlen=MAX_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNC_TYPE,
    )


# Both splits go through the identical encoding.
train_sentence = _to_padded_ids(train_data.sentence.values)
test_sentence = _to_padded_ids(test_data.sentence.values)

In [13]:
# One-hot encode the emotion labels.
#
# The training labels define the class set and the column order. The test
# labels are encoded against that same class list so both frames always have
# identical columns, even when a class is absent from the test file. Calling
# get_dummies independently on each split would yield frames of different
# width (or with columns meaning different classes) in that situation.
train_emotion = pd.get_dummies(train_data.emotion.values)
emotion_classes = list(train_emotion.columns)

# A test label that never occurs in training cannot be represented by the
# training columns; fail loudly instead of encoding it as an all-zero row.
unseen_emotions = set(test_data.emotion.dropna().unique()) - set(emotion_classes)
if unseen_emotions:
    raise ValueError(
        "test data contains emotion labels absent from the training data: "
        f"{sorted(map(str, unseen_emotions))}"
    )

test_emotion = pd.get_dummies(
    pd.Categorical(test_data.emotion.values, categories=emotion_classes)
)
# Use plain label names for the columns, matching train_emotion.
test_emotion.columns = emotion_classes

In [14]:
# The output width follows the one-hot label frame instead of a literal, so
# the classifier head stays in sync with the label encoding if the class set
# changes.
num_classes = train_emotion.shape[1]

# Embedding -> bidirectional LSTM -> softmax over the emotion classes.
model = keras.Sequential([
    # Maps each of the MAX_LEN word ids to an EMBEDDING_DIM-sized vector.
    layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LEN),
    # 128-unit LSTM run in both directions, with dropout on the inputs and
    # on the recurrent state.
    layers.Bidirectional(layers.LSTM(128, dropout=.5, recurrent_dropout=.5)),
    # One probability per emotion class.
    layers.Dense(num_classes, activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 64)            320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               197632    
_________________________________________________________________
dense (Dense)                (None, 7)                 1799      
Total params: 519,431
Trainable params: 519,431
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Multi-class setup: one-hot targets scored against the softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

initial_epochs = 10

# Stop training once validation loss has gone 3 epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)
# Keep on disk only the weights with the best validation accuracy so far.
mc = ModelCheckpoint(
    'LSTMV1.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# validation_split=.1 holds out a tenth of the training arrays for validation.
history = model.fit(
    train_sentence,
    train_emotion,
    batch_size=64,
    epochs=initial_epochs,
    validation_split=.1,
    callbacks=[es, mc],
    verbose=1,
)

Train on 4993 samples, validate on 555 samples
Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.31171, saving model to LSTMV1.h5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.31171 to 0.41441, saving model to LSTMV1.h5
Epoch 3/10
Epoch 00003: val_accuracy improved from 0.41441 to 0.46486, saving model to LSTMV1.h5
Epoch 4/10
Epoch 00004: val_accuracy improved from 0.46486 to 0.50450, saving model to LSTMV1.h5
Epoch 5/10
Epoch 00005: val_accuracy improved from 0.50450 to 0.51892, saving model to LSTMV1.h5
Epoch 6/10
Epoch 00006: val_accuracy improved from 0.51892 to 0.52613, saving model to LSTMV1.h5
Epoch 7/10
Epoch 00007: val_accuracy improved from 0.52613 to 0.54955, saving model to LSTMV1.h5
Epoch 00007: early stopping


In [16]:
# Restore the checkpointed weights before scoring on the test split.
model.load_weights('LSTMV1.h5')
result = model.evaluate(test_sentence, test_emotion)

# `result` is [loss, accuracy] (accuracy is the only metric passed to
# compile); report the accuracy.
test_accuracy = result[1]
print(test_accuracy)

0.5281385


In [20]:
sentence = 'thật là kinh tởm'

# Display names for the model outputs. The order must follow the one-hot
# label columns the model was trained on.
labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Other', 'Sadness', 'Surprise']

# Encode exactly like the training data. The raw string stays in `sentence`
# instead of being rebound to the encoded array.
sentence_ids = sentence_tokenizer.texts_to_sequences([sentence])
sentence_padded = sequence.pad_sequences(
    sentence_ids, maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE
)

# Pass the (1, MAX_LEN) array directly; wrapping it in a list is unnecessary
# for a single-input model.
pred = model.predict(sentence_padded)

# Guard against a label list that no longer matches the classifier head.
if pred.shape[-1] != len(labels):
    raise ValueError(
        f"model outputs {pred.shape[-1]} classes but {len(labels)} labels are defined"
    )

print(pred)
# `pred` has one row per input sentence; take the argmax of the first row
# rather than of the flattened batch.
print(labels[int(np.argmax(pred[0]))])

[[0.2622929  0.65266967 0.00516923 0.02511064 0.033725   0.00861775
  0.01241482]]
Disgust
