In [1]:
from os import listdir
from os.path import isfile
import re
import tensorflow.compat.v1 as tf
import numpy as np
import random
tf.disable_eager_execution()
tf.__version__

'2.4.1'

In [6]:
class RNN:
    def __init__(self,
                vocab_size,
                embedding_size,
                lstm_size,
                #pretrained_w2v_path,
                batch_size):
        self._vocab_size = vocab_size
        self._embedding_size = embedding_size
        self._lstm_size = lstm_size
        self._batch_size = batch_size
        
        self._data = tf.placeholder(tf.int32, shape=[batch_size, MAX_DOC_LENGTH])
        self._labels = tf.placeholder(tf.int32, shape=[batch_size, ])
        self._sentence_lengths = tf.placeholder(tf.int32, shape = [batch_size, ])
        self._final_token = tf.placeholder(tf.int32, shape = [batch_size, ])
    def embedding_layer(self, indices):
        pretrained_vectors = []
        pretrained_vectors.append(np.zeros(self._embedding_size))
        
        np.random.seed(2018)
        for _ in range(self._vocab_size + 1):
            pretrained_vectors.append(np.random.normal(loc = 0., scale = 1., size = self._embedding_size))
            
        pretrained_vectors = np.array(pretrained_vectors)
        
        self._embedding_matrix = tf.get_variable(
            name='embedding',
            shape = (self._vocab_size + 2, self._embedding_size),
            initializer = tf.constant_initializer(pretrained_vectors)
        )
        return tf.nn.embedding_lookup(self._embedding_matrix, indices)
    def LSTM_layer(self, embeddings):
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_size)
        zero_state = tf.zeros(shape=(self._batch_size, self._lstm_size))
        initial_state = tf.nn.rnn_cell.LSTMStateTuple(zero_state, zero_state)
        
        lstm_inputs = tf.unstack(
            tf.transpose(embeddings, perm=[1,0,2])
        )
        lstm_outputs, last_state = tf.nn.static_rnn(
            cell = lstm_cell,
            inputs = lstm_inputs,
            initial_state = initial_state,
            sequence_length = self._sentence_lengths
        )
        
        lstm_outputs = tf.unstack(
            tf.transpose(lstm_outputs, perm = [1,0,2])
        )
        lstm_outputs = tf.concat(
            lstm_outputs,
            axis = 0
        )
        
        mask = tf.sequence_mask(
            lengths = self._sentence_lengths,
            maxlen= MAX_DOC_LENGTH,
            dtype = tf.float32
        )
        mask = tf.concat(tf.unstack(mask, axis = 0), axis = 0)
        mask = tf.expand_dims(mask, -1)
        lstm_outputs = mask * lstm_outputs
        lstm_outputs_split = tf.split(lstm_outputs, num_or_size_splits = self._batch_size)
        lstm_outputs_sum = tf.reduce_sum(lstm_outputs_split, axis = 1)
        lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(
            tf.cast(self._sentence_lengths, tf.float32), 
            -1
        )
        return lstm_outputs_average
    def build_graph(self):
        embeddings = self.embedding_layer(self._data)
        lstm_outputs = self.LSTM_layer(embeddings)
        
        weights = tf.get_variable(
            name = 'final_layer_weights',
            shape = (self._lstm_size, NUM_CLASSES),
            initializer = tf.random_normal_initializer(seed=2018)
        )
        
        biases = tf.get_variable(
            name = 'final_layer_biases',
            shape = (NUM_CLASSES),
            initializer = tf.random_normal_initializer(seed = 2018)
        )
        
        logits = tf.matmul(lstm_outputs, weights) + biases
        
        labels_one_hot = tf.one_hot(
            indices = self._labels,
            depth = NUM_CLASSES,
            dtype = tf.float32
        )
        
        loss = tf.nn.softmax_cross_entropy_with_logits(
            labels = labels_one_hot,
            logits = logits
        )
        
        loss = tf.reduce_mean(loss)
        probs = tf.nn.softmax(logits)
        predicted_labels = tf.argmax(probs, axis = 1)
        predicted_labels = tf.squeeze(predicted_labels)
        return (predicted_labels, loss)
    def trainer(self, loss, learning_rate):
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        return train_op

In [7]:
class DataReader:
    def __init__(self, data_path, batch_size):
        self._batch_size = batch_size
        # Đọc dữ liệu vào
        with open(data_path) as f:
            d_lines = f.read().splitlines() 
        
        self._data = []
        self._labels = []
        self._final_tokens = []
        self._sentence_length = []
        
        for data_id, line in enumerate(d_lines):
            features = line.split('<fff>')
            label, doc_id, sentence_length = int(features[0]), int(features[1]), int(features[2])
            tokens = features[3].split()
            
            
            self._data.append(tokens)
            self._final_tokens.append(tokens[-1])
            self._labels.append(label)
            self._sentence_length.append(sentence_length)
        
        self._data = np.array(self._data)
        self._labels = np.array(self._labels)
        self._final_tokens = np.array(self._final_tokens)
        self._sentence_length = np.array(self._sentence_length)
        
        self._num_epoch = 0
        self._batch_id = 0
    def next_batch(self):
        start = self._batch_id * self._batch_size
        end = start + self._batch_size
        self._batch_id += 1
        
        if start + self._batch_size > len(self._data):
            end = len(self._data)
            start = len(self._data) - 50
            self._num_epoch += 1
            self._batch_id = 0
            indices = list(range(len(self._data)))
            random.seed(2018)
            random.shuffle(indices)
            self._data, self._labels, self._final_tokens, self._sentence_length = self._data[indices], self._labels[indices], self._final_tokens[indices], self._sentence_length[indices]
            
        return self._data[start:end], self._labels[start:end], self._sentence_length[start:end], self._final_tokens[start:end]

In [8]:
def train_and_evaluate_RNN():
    with open('dataset/w2v/vocab-raw.txt') as f:
        vocab_size = len(f.read().splitlines())
        
    tf.set_random_seed(2018)
    rnn = RNN(
        vocab_size=vocab_size,
        embedding_size = 300,
        lstm_size = 50,
        batch_size = 50
    )
    predicted_labels, loss = rnn.build_graph()
    train_op = rnn.trainer(loss = loss,learning_rate = 0.01)
    with tf.Session() as sess:
        train_data_reader = DataReader(
            data_path = 'dataset/w2v/20news-train-encoded.txt',
            batch_size = 50
        )
        
        test_data_reader = DataReader(
            data_path = 'dataset/w2v/20news-test-encoded.txt',
            batch_size = 50
        )
        step = 0
        MAX_STEP = 30000
        sess.run(tf.global_variables_initializer())
        
        while step < MAX_STEP:
            next_train_batch = train_data_reader.next_batch()
            train_data, train_labels, train_sentence_lengths, train_final_tokens = next_train_batch
            plabels_eval, loss_eval, _ = sess.run(
                [predicted_labels, loss, train_op],
                feed_dict = {
                    rnn._data: train_data,
                    rnn._labels: train_labels,
                    rnn._sentence_lengths: train_sentence_lengths,
                    #rnn._final_tokens: train_final_tokens
                }
            )
            step += 1
            if step % 100 == 0:
                print ('loss: ', loss_eval)
            if train_data_reader._batch_id == 0:
                num_true_preds = 0
                while True:
                    next_test_batch = test_data_reader.next_batch()
                    test_data, test_labels, test_sentence_lengths, test_final_tokens = next_test_batch
                    
                    test_plabels_eval = sess.run(
                        predicted_labels,
                        feed_dict = {
                            rnn._data: test_data,
                            rnn._labels: test_labels,
                            rnn._sentence_lengths: test_sentence_lengths,
                            #rnn._final_tokens: test_final_tokens
                        }
                    )
                    matches = np.equal(test_plabels_eval, test_labels)
                    num_true_preds += np.sum(matches.astype(float))
                    
                    if test_data_reader._batch_id == 0:
                        break
                print('Epoch: ', train_data_reader._num_epoch)
                print('Accuracy on test data: ', num_true_preds * 100 / len(test_data_reader._data))

In [9]:
NUM_CLASSES = 20
MAX_DOC_LENGTH = 500

train_and_evaluate_RNN()

Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

loss:  1.3821086
loss:  6.10779
Epoch:  1
Accuracy on test data:  6.571959638874137
loss:  1.984223
loss:  1.4214394
Epoch:  2
Accuracy on test data:  71.12320764737122
loss:  0.99488336
loss:  0.7481438
Epoch:  3
Accuracy on test data:  76.07541157727032
loss:  0.309473
loss:  0.3354514
loss:  0.24099119
Epoch:  4
Accuracy on test data:  75.94264471587891
loss:  0.03961974
loss:  0.04133545
Epoch:  5
Accuracy on test data:  76.89856611789698
loss:  0.01980238
loss:  0.014662591
Epoch:  6
Accuracy on test data:  76.95167286245353
loss:  0.006245686
loss:  0.0046249824
Epoch:  7
Accuracy on test data:  77.23048327137546
loss:  0.0025788485
loss:  0.011362626
loss:  0.0032007142
Epoch:  8
Accuracy on test data:  77.16409984067977


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/leo/anaconda3/envs/internship/lib/python3.9/contextlib.py", line 135, in __exit__
    self.gen.throw(type, value, traceback)
  File "/home/leo/anaconda3/envs/internship/lib/python3.9/site-packages/tensorflow/python/framework/ops.py", line 5588, in get_controller
    yield g
  File "<ipython-input-8-acf1b9fa9d8f>", line 31, in train_and_evaluate_RNN
    plabels_eval, loss_eval, _ = sess.run(
  File "/home/leo/anaconda3/envs/internship/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 967, in run
    result = self._run(None, fetches, feed_dict, options_ptr,
  File "/home/leo/anaconda3/envs/internship/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1190, in _run
    results = self._do_run(handle, final_targets, final_fetches,
  File "/home/leo/anaconda3/envs/internship/lib/python3.9/site-packages/tensorflow/python/client/session.py", line 1368, in _do_run
    return self._do_call(_run_fn, feeds, f

TypeError: object of type 'NoneType' has no len()