In [None]:
import tensorflow as tf 
import pandas as pd 
import numpy as np 
import gensim

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Notebook-wide configuration constants.

# Placeholder vocabulary size; it is overwritten with len(word_index)+1 once the
# tokenizer has been fitted, so this literal is not what the model ends up using.
VOCAB_SIZE = 5000
# Width of each word vector: number of columns of the embedding matrix and the
# `size` requested from the corpus-trained Word2Vec model.
EMBEDDING_DIM = 400
# Every sentence is padded/truncated to this many tokens (pad_sequences maxlen).
MAX_LEN = 20
# 'post' => tokens beyond MAX_LEN are dropped from the end of the sentence.
TRUNC_TYPE = 'post'
# 'post' => padding is appended after the last token of the sentence.
PADDING_TYPE = 'post'
# Token handed to the Keras Tokenizer as `oov_token`.
OOV_TOKEN = '<OOV>'

In [None]:
# Load the train / test splits from CSV (paths are relative to the notebook).
# Later cells read two columns from each frame: `sentence` (raw text) and
# `emotion` (the class label that gets one-hot encoded).
train_data = pd.read_csv('../data/train_mod_3labels.csv') # training split: fits the tokenizer and the model
test_data = pd.read_csv('../data/test_mod_3labels.csv') # held-out split: only used for the final evaluation

In [None]:
# Build the tokenizer and its word -> integer index mapping.
# The tokenizer is fitted on the training sentences only; OOV_TOKEN is passed
# as `oov_token` so words unseen during fitting share one index.
sentence_tokenizer = text.Tokenizer(oov_token=OOV_TOKEN)
sentence_tokenizer.fit_on_texts(train_data.sentence.values)
word_index = sentence_tokenizer.word_index
# Replaces the placeholder VOCAB_SIZE from the config cell. The +1 leaves room
# for index 0, which Keras reserves (it is the default padding value) and never
# assigns to a word, so valid indices are 0..len(word_index).
VOCAB_SIZE = len(word_index)+1

In [None]:
# Turn raw sentences into fixed-length integer sequences.
def _encode_sentences(raw_sentences):
    """Map each sentence to word indices, then pad/truncate to MAX_LEN tokens."""
    index_sequences = sentence_tokenizer.texts_to_sequences(raw_sentences)
    return sequence.pad_sequences(
        index_sequences,
        maxlen=MAX_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNC_TYPE,
    )


# Both splits go through the same tokenizer (fitted on the training data).
train_sentence = _encode_sentences(train_data.sentence.values)
test_sentence = _encode_sentences(test_data.sentence.values)

In [None]:
# One-hot encode the emotion labels.
# Both splits are encoded against the SAME category list (the sorted labels seen
# in the training split). Calling get_dummies independently on each split lets
# the column count/order diverge when the test split lacks a class or contains
# an extra one, which silently misaligns targets with the softmax outputs.
emotion_categories = pd.Categorical(train_data.emotion.values).categories
train_emotion = pd.get_dummies(
    pd.Categorical(train_data.emotion.values, categories=emotion_categories)
)
# A test label never seen in training becomes an all-zero row instead of
# adding a column the model has no output unit for.
test_emotion = pd.get_dummies(
    pd.Categorical(test_data.emotion.values, categories=emotion_categories)
)

In [None]:
# Load the pretrained vectors stored in binary word2vec format.
word_model = gensim.models.KeyedVectors.load_word2vec_format('../pretrained/wiki.vi.model.bin', binary=True)

# Row i of the matrix is the pretrained vector of the word whose tokenizer
# index is i; words missing from the pretrained model keep an all-zero row.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in word_index.items():
    # Membership test and item access are supported by KeyedVectors in both
    # gensim 3.x and 4.x, whereas the `.vocab` attribute was removed in 4.0.
    if word in word_model:
        embedding_matrix[i] = word_model[word]
# Count rows without any non-zero component. Testing `sum == 0` would also
# count a non-zero vector whose components happen to cancel out.
# Rows that the loop never assigns (e.g. the reserved padding index) are included.
print('Null word embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

In [None]:
# Train a Word2Vec model on the lower-cased, whitespace-split training sentences.
documents = train_data.sentence.to_list()
wc_sentences = [document.lower().split() for document in documents]
# NOTE: `size=` and `.wv.vocab` are the gensim < 4 spellings (4.x renamed them
# to `vector_size=` / `.wv.key_to_index`); kept to match the installed version.
wc_model = gensim.models.Word2Vec(sentences=wc_sentences, size=EMBEDDING_DIM)
print(len(wc_sentences))
print(len(list(wc_model.wv.vocab)))

# Use ONE path for writing and reading. Previously the vectors were saved to
# '../pretrained/myword2vec' but read back from '../pretrained/myword2vec.txt',
# which fails (or reads a stale file) on a fresh run.
wc_vectors_path = '../pretrained/myword2vec.txt'
wc_model.wv.save_word2vec_format(wc_vectors_path, binary=False)

# Parse the text-format file into {word: float32 vector}.
embeddings_index = {}
with open(wc_vectors_path, encoding='utf-8') as wf:
    # The first line of the word2vec text format is a "<count> <dim>" header,
    # not a word entry, so it must not end up in the index.
    next(wf, None)
    for line in wf:
        values = line.split()
        if not values:
            continue
        # Explicit dtype: without it the vector would be an array of strings.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# NOTE(review): `embeddings_index` is not read anywhere in the visible cells.
# The matrix below is filled from the pretrained `word_model` again, so it ends
# up identical to the one built in the previous cell. If the corpus-trained
# vectors were meant to be used (e.g. as a fallback for words missing from the
# pretrained model), that lookup still has to be added here -- confirm intent.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in word_index.items():
    # Valid rows are 0..VOCAB_SIZE-1, hence `>=` (the old `>` let i == VOCAB_SIZE through).
    if i >= VOCAB_SIZE:
        continue
    # `in` / `[]` work on KeyedVectors in both gensim 3.x and 4.x.
    if word in word_model:
        embedding_matrix[i] = word_model[word]

In [None]:
# Classifier: frozen embeddings -> dropout -> BiLSTM -> dense -> dropout -> 3-way softmax.
model = keras.Sequential()

# Embedding table initialised from `embedding_matrix` and excluded from training.
model.add(
    layers.Embedding(
        VOCAB_SIZE,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,
    )
)
# model.add(layers.SpatialDropout1D(.5))  # disabled alternative to the Dropout below
model.add(layers.Dropout(.5))

# Bidirectional LSTM encoder with input and recurrent dropout.
recurrent_cell = layers.LSTM(128, dropout=.2, recurrent_dropout=.2)
model.add(layers.Bidirectional(recurrent_cell))

# Classification head; the final layer has one unit per emotion class.
model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dropout(.5))
model.add(layers.Dense(3, activation='softmax'))

model.summary()

In [None]:
# Upper bound on training epochs; early stopping may end the run sooner.
initial_epochs = 30

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once the validation loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)
# Write the model to disk only when the validation accuracy reaches a new maximum.
mc = ModelCheckpoint(
    'LSTMV3.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# validation_split=.1 holds out 10% of the training data for the monitored metrics.
history = model.fit(
    train_sentence,
    train_emotion,
    epochs=initial_epochs,
    batch_size=64,
    validation_split=.1,
    callbacks=[es, mc],
    verbose=1,
)

In [None]:
# Restore the checkpoint written by ModelCheckpoint during training.
model.load_weights('LSTMV3.h5')
# `result` holds the loss followed by the compiled metrics for the test split.
result = model.evaluate(test_sentence, test_emotion)
# Predicted class index per test sentence. Sequential.predict_classes() was
# removed in TensorFlow 2.6; for a softmax output it was defined as the argmax
# over the last axis of predict(), which is what is computed here.
yhat_class = np.argmax(model.predict(test_sentence, verbose=0), axis=-1)
print(result)

In [None]:
# Classify one hand-written sentence with the trained model.

# Alternative label sets (they do not fit the 3-unit softmax head):
# labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Other', 'Sadness', 'Surprise']
# labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Sadness', 'Surprise']
# NOTE(review): the order is assumed to match the one-hot column order of the
# training targets -- TODO confirm against train_emotion.columns.
labels = ['Negative', 'Neutral', 'Positive']

sentence = 'con đĩ mẹ mày'
# Same preprocessing as the training data: word indices, then pad/truncate.
sentence = sequence.pad_sequences(
    sentence_tokenizer.texts_to_sequences([sentence]),
    maxlen=MAX_LEN,
    padding=PADDING_TYPE,
    truncating=TRUNC_TYPE,
)

pred = model.predict([sentence])
print(pred)
# Only one sentence is scored, so the flat argmax is the winning class index.
best_index = np.argmax(pred)
print(labels[best_index])