## Step 1: Import Libraries and Load the Test Dataset

In [14]:
import json
import pickle

import numpy as np
from keras.models import load_model
from keras.utils import pad_sequences
from sklearn.metrics import classification_report
from tqdm import tqdm

In [15]:
# Load the stop-word list, one entry per line. Multi-word entries are collapsed
# into a single underscore-joined token (e.g. "a b" -> "a_b"); presumably this
# matches how multi-syllable words are tokenised in the dataset -- TODO confirm.
#
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, so non-ASCII stop words could decode differently
# (and silently stop matching) from one machine to another.
# NOTE(review): assumes the file is UTF-8 -- adjust if it was saved otherwise.
with open('archive/stopwords-nlp-vi.txt', encoding='utf-8') as f:
    # str.split() with no argument already ignores leading/trailing whitespace,
    # including the trailing newline of each line.
    stop_word = ['_'.join(line.split()) for line in f]

In [16]:
# Read the test set from disk. The JSON file is decoded as UTF-16 little-endian.
with open("archive/test_dict.json", mode='r', encoding='utf-16-le') as f:
    datatest = json.loads(f.read())

# Basic sanity statistics: number of samples and the distinct topic ids present.
topics = set(datatest['topic_ids'])
n_topics = len(topics)
len_dataset = len(datatest['topic_ids'])

print('length dataset:', len_dataset)
print('n_topics:', n_topics)
print('topics:', topics)

length dataset: 50373
n_topics: 10
topics: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


## Step 2: Preprocess the Dataset

In [17]:
# Remove stop words from every test document and rebuild it as one
# space-separated string, keeping the topic id of each document alongside.
#
# Membership tests against a list are linear in the length of the stop-word
# list and were repeated for every token of every document. Building a set
# once makes each lookup O(1) on average and yields exactly the same result.
stop_word_lookup = set(stop_word)

sentences = []
labels = []

# Each item of datatest['contents'] is iterated token by token, so it is
# assumed to be a list of string tokens -- TODO confirm against the dataset.
for sentence, label in tqdm(zip(datatest['contents'], datatest['topic_ids']), total=len_dataset):
    sentences.append(' '.join(word for word in sentence if word not in stop_word_lookup))
    labels.append(label)

100%|██████████| 50373/50373 [05:28<00:00, 153.21it/s]


In [18]:
# Restore the pickled tokenizer object (presumably the one fitted on the
# training data, so that word ids match the model's embedding -- TODO confirm).
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load 'tokenizer.pickle' if it was produced by a trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [19]:
# Sequence-shaping settings: every document becomes exactly `max_length` token
# ids (this matches the (None, 500, 16) embedding output shown in the model
# summary); longer documents are cut at the end, shorter ones padded at the end.
max_length = 500
padding_type = 'post'
trunc_type = 'post'

# Ground-truth topic ids as a numpy array.
testing_labels = np.array(labels)

# Word -> integer id mapping held by the loaded tokenizer.
word_index = tokenizer.word_index

# Encode the cleaned test sentences as id sequences, then bring them all to
# the same length.
testing_sequences = tokenizer.texts_to_sequences(sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

## Step 3: Load the Embedding LSTM Model and Evaluate It on the Test Set

In [20]:
# Location of the saved model file.
MODEL_PATH = 'my_model.h5'

# Load the saved model and print its layer-by-layer architecture.
model_lstm = load_model(MODEL_PATH)
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 16)           1873232   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               12544     
 l)                                                              
                                                                 
 dense (Dense)               (None, 16)                1040      
                                                                 
 dense_1 (Dense)             (None, 10)                170       
                                                                 
Total params: 1,886,986
Trainable params: 1,886,986
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Run the loaded model over the padded test sequences. Per the model summary,
# the final dense layer has 10 outputs, so each input yields 10 scores
# (one per topic id).
predictions = model_lstm.predict(testing_padded)



In [22]:
# For every sample, take the index of the highest score along the last axis;
# that index is used as the predicted topic id (assumes output position i
# corresponds to topic id i -- TODO confirm against the training notebook).
prediction_labels = np.argmax(predictions, axis=-1)

In [23]:
# Per-topic precision / recall / F1 plus overall accuracy, comparing the true
# labels (first argument) with the predicted labels (second argument).
report = classification_report(testing_labels, prediction_labels)
print(report)

              precision    recall  f1-score   support

           0       0.76      0.81      0.78      7567
           1       0.57      0.48      0.52      2036
           2       0.54      0.66      0.60      2096
           3       0.86      0.80      0.83      5276
           4       0.84      0.85      0.85      3788
           5       0.91      0.91      0.91      5417
           6       0.91      0.86      0.89      6716
           7       0.97      0.97      0.97      6667
           8       0.89      0.90      0.90      6250
           9       0.91      0.90      0.90      4560

    accuracy                           0.85     50373
   macro avg       0.82      0.81      0.81     50373
weighted avg       0.85      0.85      0.85     50373

