In [1]:
import tensorflow as tf 
import pandas as pd 
import numpy as np 
import gensim

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [53]:
# --- Text preprocessing -------------------------------------------------
# Sentences are padded / truncated to MAX_LEN token ids; both operations use
# the 'post' mode when passed to pad_sequences.
MAX_LEN = 20
PADDING_TYPE = TRUNC_TYPE = 'post'
OOV_TOKEN = '<OOV>'  # token the tokenizer is configured with for out-of-vocabulary words

# --- Model dimensions ---------------------------------------------------
# VOCAB_SIZE is only a placeholder here: it is reassigned to
# len(word_index) + 1 once the tokenizer has been fitted.
VOCAB_SIZE = 5000
EMBEDDING_DIM = 400  # width of each row of the embedding matrix
LABEL_NUMBER = 3     # number of units in the final softmax layer

In [54]:
# Load the training and testing splits of the 3-label dataset.
# Other dataset pairs that were used previously (swap the paths below to use them):
#   '../data/train.csv'     / '../data/test.csv'
#   '../data/train_mod.csv' / '../data/test_mod.csv'
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_mod_3labels.csv',  # for training
        '../data/test_mod_3labels.csv',   # for testing
    )
)

In [55]:
# Make tokenizer and word_index
# The tokenizer is fitted on the training sentences only; the test sentences
# and any later inference input are converted with this same fitted instance.
sentence_tokenizer = text.Tokenizer(oov_token=OOV_TOKEN)
sentence_tokenizer.fit_on_texts(train_data.sentence.values)
# word -> integer index mapping learned from the training sentences.
word_index = sentence_tokenizer.word_index
# Overrides the VOCAB_SIZE constant defined earlier. The +1 presumably accounts
# for index 0, which is not assigned to any word (used for padding) -- confirm
# against the Keras Tokenizer documentation.
VOCAB_SIZE = len(word_index)+1

In [56]:
# Tokenize sentence
def _encode_sentences(raw_sentences):
    """Turn raw sentences into a padded array of token ids.

    Each sentence is converted to a sequence of integer ids with the fitted
    ``sentence_tokenizer`` and then padded / truncated to ``MAX_LEN`` entries
    using ``PADDING_TYPE`` and ``TRUNC_TYPE``.
    """
    id_sequences = sentence_tokenizer.texts_to_sequences(raw_sentences)  # Convert all word to sequence
    return sequence.pad_sequences(  # Pad each entry
        id_sequences,
        maxlen=MAX_LEN,
        padding=PADDING_TYPE,
        truncating=TRUNC_TYPE,
    )


train_sentence = _encode_sentences(train_data.sentence.values)
test_sentence = _encode_sentences(test_data.sentence.values)

In [57]:
# Categorize emotion
# One-hot encode the labels. The training split defines the set (and order) of
# label columns.
train_emotion = pd.get_dummies(train_data.emotion.values)
# Encoding the test split on its own would silently produce fewer columns if a
# label never occurs in the test file, which no longer matches the
# LABEL_NUMBER outputs of the model. Align the test columns to the training
# columns instead: missing labels become all-zero columns, and labels unseen
# during training are dropped. When both splits contain the same labels this is
# identical to a plain get_dummies call.
test_emotion = pd.get_dummies(test_data.emotion.values).reindex(
    columns=train_emotion.columns, fill_value=0
)

In [29]:
# Load the pretrained word vectors stored in binary word2vec format.
word_model = gensim.models.KeyedVectors.load_word2vec_format('../pretrained/wiki.vi.model.bin', binary=True)

# One row per tokenizer index. Rows of words that are absent from the
# pretrained vocabulary are left as zeros.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
pretrained_hits = (
    (row, token) for token, row in word_index.items() if token in word_model.vocab
)
for row, token in pretrained_hits:
    embedding_matrix[row] = word_model.word_vec(token)

# Count the rows whose components sum to zero (i.e. rows that received no vector).
null_rows = np.sum(embedding_matrix.sum(axis=1) == 0)
print('Null word embeddings: %d' % null_rows)

Null word embeddings: 816


In [58]:
# Train a Word2Vec model on the (lower-cased, whitespace-split) training
# sentences and report the corpus size and the learned vocabulary size.
documents = train_data.sentence.to_list()
wc_sentences = [document.lower().split() for document in documents]
wc_model = gensim.models.Word2Vec(sentences=wc_sentences, size=EMBEDDING_DIM)
print(len(wc_sentences))
print(len(list(wc_model.wv.vocab)))

# Export the vectors in word2vec text format and read them back.
# The export and the read must use the SAME path: previously the vectors were
# written to '../pretrained/myword2vec' but read from
# '../pretrained/myword2vec.txt', so the loop below parsed a different (stale)
# file or failed with FileNotFoundError.
wc_vectors_path = '../pretrained/myword2vec.txt'
wc_model.wv.save_word2vec_format(wc_vectors_path, binary=False)

# word -> float32 vector, parsed from the exported text file.
embeddings_index = {}
with open(wc_vectors_path, encoding='utf-8') as wf:  # closed even if parsing fails
    # The first line of the word2vec text format is a "<vocab size> <vector size>"
    # header, not a word entry; skip it so it does not end up in the index.
    next(wf, None)
    for line in wf:
        values = line.split()
        if not values:
            continue  # ignore blank lines
        word = values[0]
        # Convert the components to numbers; without a dtype they stay strings.
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

# NOTE(review): embeddings_index (the vectors trained above) is never used --
# the matrix is still filled from the pretrained `word_model`, exactly as in
# the previous cell. This is kept unchanged because the recorded training
# results depend on it; confirm whether the self-trained vectors were meant to
# be used here instead.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in word_index.items():
    # Valid row indices are 0 .. VOCAB_SIZE - 1, so VOCAB_SIZE itself must be
    # skipped too (the previous `>` check let it through).
    if i >= VOCAB_SIZE:
        continue
    if word in word_model.vocab:
        embedding_matrix[i] = word_model.word_vec(word)

4527
1488


In [59]:
# Classifier: frozen embedding lookup -> dropout -> bidirectional LSTM ->
# dense hidden layer -> dropout -> softmax over LABEL_NUMBER classes.
model = keras.Sequential()

# Embedding rows are initialised from embedding_matrix and are not trained.
model.add(
    layers.Embedding(
        VOCAB_SIZE,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,
    )
)
# model.add(layers.SpatialDropout1D(.5))
model.add(layers.Dropout(.5))
model.add(layers.Bidirectional(layers.LSTM(128, dropout=.2, recurrent_dropout=.2)))
model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dropout(.5))
model.add(layers.Dense(LABEL_NUMBER, activation='softmax'))

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 20, 400)           1684000   
_________________________________________________________________
dropout_14 (Dropout)         (None, 20, 400)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 256)               541696    
_________________________________________________________________
dense_14 (Dense)             (None, 64)                16448     
_________________________________________________________________
dropout_15 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 3)                 195       
Total params: 2,242,339
Trainable params: 558,339
Non-trainable params: 1,684,000
______________________________________

In [60]:
# Compile for one-hot encoded multi-class targets.
# `metrics` is a flat list: the previous list-of-lists form is the syntax for
# giving each output of a multi-output model its own metrics, which this
# single-output model does not need.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

initial_epochs = 30

# Note that the two callbacks watch different quantities: training stops once
# val_loss has not improved for 3 epochs, while the checkpoint file keeps the
# weights of the epoch with the highest val_accuracy.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('LSTMV3.h5', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)

# 10% of the training data is held out for validation.
history = model.fit(
    train_sentence,
    train_emotion,
    batch_size=64,
    epochs=initial_epochs,
    validation_split=.1,
    verbose=1,
    callbacks=[es, mc],
)

Train on 4074 samples, validate on 453 samples
Epoch 1/30
Epoch 00001: val_accuracy improved from -inf to 0.64901, saving model to LSTMV3.h5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.64901 to 0.72627, saving model to LSTMV3.h5
Epoch 3/30
Epoch 00003: val_accuracy improved from 0.72627 to 0.73510, saving model to LSTMV3.h5
Epoch 4/30
Epoch 00004: val_accuracy improved from 0.73510 to 0.74614, saving model to LSTMV3.h5
Epoch 5/30
Epoch 00005: val_accuracy improved from 0.74614 to 0.77042, saving model to LSTMV3.h5
Epoch 6/30
Epoch 00006: val_accuracy did not improve from 0.77042
Epoch 7/30
Epoch 00007: val_accuracy did not improve from 0.77042
Epoch 8/30
Epoch 00008: val_accuracy did not improve from 0.77042
Epoch 9/30
Epoch 00009: val_accuracy did not improve from 0.77042
Epoch 10/30
Epoch 00010: val_accuracy did not improve from 0.77042
Epoch 11/30
Epoch 00011: val_accuracy did not improve from 0.77042
Epoch 12/30
Epoch 00012: val_accuracy did not improve from 0.77042
Epoch 

In [61]:
# Restore the checkpointed weights written by ModelCheckpoint and score the
# model on the test split. `result` holds the loss followed by the compiled
# metrics.
model.load_weights('LSTMV3.h5')
result = model.evaluate(test_sentence, test_emotion)
# Predicted class index per test sentence. `Sequential.predict_classes` is
# deprecated and has been removed from newer TensorFlow releases; for a softmax
# output, taking the argmax over the last axis of `predict` is the equivalent.
yhat_class = np.argmax(model.predict(test_sentence, verbose=0), axis=-1)
print(result)

[0.675294392497827, 0.7180851, 0.7319778, 0.70212764]


In [None]:
# Class names indexed by the argmax of the model output.
# NOTE(review): this assumes the output order matches the column order of the
# one-hot encoded training labels -- confirm.
# labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Other', 'Sadness', 'Surprise']
# labels = ['Anger', 'Disgust', 'Enjoyment', 'Fear', 'Sadness', 'Surprise']
labels = ['Negative', 'Neutral', 'Positive']

# Classify a single raw sentence: it goes through the same tokenizer and
# padding settings as the training data before being fed to the model.
sentence = 'con đĩ mẹ mày'
sentence = sequence.pad_sequences(
    sentence_tokenizer.texts_to_sequences([sentence]),
    maxlen=MAX_LEN,
    padding=PADDING_TYPE,
    truncating=TRUNC_TYPE,
)
pred = model.predict([sentence])
print(pred)
print(labels[np.argmax(pred)])