## Step 1: Import Library and Testing Dataset

In [1]:
import json
from tqdm import tqdm
import numpy as np
from keras.utils import pad_sequences
from sklearn.metrics import classification_report
from keras.models import load_model

In [2]:
with open('archive/stopwords-nlp-vi.txt') as f:
    stop_word = ['_'.join(word.strip().split()) for word in f.readlines()]

In [3]:
with open("archive/test_dict.json", 'r', encoding = 'utf-16-le') as f:
    datatest = json.load(f)

len_dataset = len(datatest['topic_ids'])
topics = set(datatest['topic_ids'])
n_topics = len(topics)
print('length dataset:', len_dataset)
print('n_topics:', n_topics)
print('topics:', topics)

length dataset: 33759
n_topics: 10
topics: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


## Step 2: Preprocessing dataset

In [4]:
sentences = []
labels = []

for sentence, label in tqdm(zip(datatest['contents'], datatest['topic_ids']), total=len_dataset):
    sentence = ' '.join([word for word in sentence if word not in stop_word])
    sentences.append(sentence)
    labels.append(label)

100%|██████████| 33759/33759 [03:02<00:00, 185.14it/s]


In [5]:
import pickle

# loading the Tokenizer class
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [6]:
max_length = 3000
trunc_type='post'
padding_type='post'

# Generate the word index dictionary
word_index = tokenizer.word_index

# Generate and pad the test sequences
testing_sequences = tokenizer.texts_to_sequences(sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert the labels lists into numpy arrays
testing_labels = np.array(labels)

## Step 3: Loading Embedding LSTM Model to Test and Evaluation

In [7]:
model_lstm = load_model('my_model.h5')
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3000, 16)          160000    
                                                                 
 bidirectional (Bidirectiona  (None, 64)               12544     
 l)                                                              
                                                                 
 dense (Dense)               (None, 24)                1560      
                                                                 
 dense_1 (Dense)             (None, 10)                250       
                                                                 
Total params: 174,354
Trainable params: 174,354
Non-trainable params: 0
_________________________________________________________________


In [8]:
predictions = model_lstm.predict(testing_padded)



In [12]:
prediction_labels = np.argmax(predictions, axis=-1)

In [15]:
print(classification_report(testing_labels, prediction_labels))

              precision    recall  f1-score   support

           0       0.75      0.79      0.77      5219
           1       0.71      0.66      0.69      3159
           2       0.67      0.72      0.70      1820
           3       0.82      0.88      0.85      2552
           4       0.83      0.85      0.84      3868
           5       0.90      0.86      0.88      3384
           6       0.85      0.87      0.86      2898
           7       0.98      0.94      0.96      5298
           8       0.89      0.87      0.88      3080
           9       0.93      0.88      0.90      2481

    accuracy                           0.84     33759
   macro avg       0.83      0.83      0.83     33759
weighted avg       0.84      0.84      0.84     33759

