In [None]:
import tensorflow as tf 
import pandas as pd 
import numpy as np 

from pyvi import ViTokenizer,ViPosTagger
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# --- Text encoding -----------------------------------------------------------
# Vocabulary cap: handed to the tokenizer as `num_words` and to the embedding
# layer as its input dimension, so the two must stay in sync.
VOCAB_SIZE = 5000
# Token the tokenizer substitutes for words outside the vocabulary.
OOV_TOKEN = '<OOV>'
# Every sentence is padded / truncated to exactly this many token ids.
MAX_LEN = 20
# Pad and truncate at the end of the sequence (passed to pad_sequences).
PADDING_TYPE = 'post'
TRUNC_TYPE = 'post'

# --- Model -------------------------------------------------------------------
# Width of each learned word vector.
EMBEDDING_DIM = 64
# Number of units in the softmax output layer (one per emotion class).
LABEL_NUMBER = 7

In [None]:
# Load the training and testing splits (in that order) from the default CSVs.
# Other train/test file pairs that can be swapped in by editing the two paths:
#   '../data/train_mod.csv'          / '../data/test_mod.csv'
#   '../data/train_mod_3labels.csv'  / '../data/test_mod_3labels.csv'
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [None]:
# Build the tokenizer from the training sentences only, so the test split
# never contributes to the vocabulary.
sentence_tokenizer = text.Tokenizer(oov_token=OOV_TOKEN, num_words=VOCAB_SIZE)
sentence_tokenizer.fit_on_texts(train_data['sentence'].values)

# Word -> integer id mapping learned by the fit above.
word_index = sentence_tokenizer.word_index

In [None]:
def encode_sentences(raw_sentences):
    """Convert raw sentences into a padded matrix of token ids.

    Each sentence is mapped to integer ids with the fitted
    `sentence_tokenizer`, then padded / truncated to `MAX_LEN` entries
    according to `PADDING_TYPE` and `TRUNC_TYPE`.
    """
    token_ids = sentence_tokenizer.texts_to_sequences(raw_sentences)
    return sequence.pad_sequences(
        token_ids,
        maxlen=MAX_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNC_TYPE,
    )


# Same encoding for both splits, so train and test inputs share one layout.
train_sentence = encode_sentences(train_data.sentence.values)
test_sentence = encode_sentences(test_data.sentence.values)

In [None]:
# One-hot encode the emotion labels.
# The training split defines the class columns; their order is the order of the
# model's output units, so every other label matrix must follow it.
train_emotion = pd.get_dummies(train_data.emotion.values, dtype=np.float32)

# Encoding the test labels on their own yields fewer (and shifted) columns
# whenever the test split lacks a class, which misaligns targets and outputs.
# Encode, reject labels the model was never trained on, then align the columns
# to the training layout (missing classes become all-zero columns).
test_emotion = pd.get_dummies(test_data.emotion.values, dtype=np.float32)
unseen_labels = sorted(set(test_emotion.columns) - set(train_emotion.columns))
if unseen_labels:
    raise ValueError(
        f'test set contains emotions never seen in training: {unseen_labels}'
    )
test_emotion = (
    test_emotion
    .reindex(columns=train_emotion.columns, fill_value=0.0)
    .astype(np.float32)  # keep a single numeric dtype after adding filler columns
)

In [None]:
# Bidirectional-LSTM sentence classifier:
#   token ids -> embeddings -> BiLSTM -> softmax over the emotion classes.
model = keras.Sequential([
    # Declare the input explicitly instead of using Embedding(input_length=...):
    # that argument is deprecated in newer Keras releases, and without a known
    # input shape the model would still be unbuilt when summary() is called.
    keras.Input(shape=(MAX_LEN,), dtype='int32'),
    layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    layers.Bidirectional(layers.LSTM(128, dropout=.5, recurrent_dropout=.5)),
    # Optional hidden head, currently disabled:
    # layers.Dense(64, activation='relu'),
    # layers.Dropout(.5),
    layers.Dense(LABEL_NUMBER, activation='softmax'),
])

model.summary()

In [None]:
# Metrics are passed as a flat list: a nested list ([[...]]) is the per-output
# form meant for multi-output models and is not accepted for a single-output
# model by every Keras version.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

initial_epochs = 10

# Stop once validation loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
# Keep only the checkpoint with the best validation accuracy; the evaluation
# cell reloads this file.
mc = ModelCheckpoint('LSTMV1.h5', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)
history = model.fit(
    train_sentence,
    train_emotion,
    batch_size=64,
    epochs=initial_epochs,
    validation_split=.1,  # hold out 10% of the training data for validation
    verbose=1,
    callbacks=[es, mc],
)

In [None]:
# Restore the weights stored in the 'LSTMV1.h5' checkpoint before scoring, so
# the test metrics are not taken from whatever state training ended in.
model.load_weights('LSTMV1.h5')

# `result` holds the loss followed by the compiled metrics; index 1 is the
# first metric passed to compile(), i.e. accuracy.
result = model.evaluate(test_sentence, test_emotion)
test_accuracy = result[1]
print(test_accuracy)

In [None]:
# Class names, index-aligned with the model's output units.
# NOTE(review): this order must match the one-hot column order of the training
# labels -- verify against train_emotion.columns when changing datasets.
labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Other', 'Sadness', 'Surprise']
# Alternative label sets (presumably for the other train/test CSV variants):
# labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Sadness', 'Surprise']
# labels = ['Negative', 'Neutral', 'Positive']

# Keep the raw text under its own name instead of overwriting it with its encoding.
raw_sentence = 'thật là kinh tởm'

# Encode exactly like the training data: token ids, then pad/truncate to MAX_LEN.
token_ids = sentence_tokenizer.texts_to_sequences([raw_sentence])
sentence = sequence.pad_sequences(token_ids, maxlen=MAX_LEN, padding=PADDING_TYPE, truncating=TRUNC_TYPE)

pred = model.predict(sentence)
# Fail loudly if the label list does not match the model's output width;
# otherwise a wrong class name would be printed silently.
if pred.shape[-1] != len(labels):
    raise ValueError(
        f'model outputs {pred.shape[-1]} classes but {len(labels)} labels are defined'
    )
print(pred)
# argmax per row (axis=-1), not over the flattened batch, so the lookup stays
# correct when more than one sentence is predicted at once.
for class_index in np.argmax(pred, axis=-1):
    print(labels[class_index])