Đọc dữ liệu từ files và xây dụng từ điển

In [0]:
from collections import defaultdict
import re
import os
def gen_data_and_vocab():
  def collect_data_from(parent_path, newsgroup_list, word_count = 'None'):
    data = []
    for group_id, newsgroup in enumerate(newsgroup_list):
      dir_path = parent_path + '\\' + newsgroup +'\\'
      files = [(filename, dir_path + filename) for filename in os.listdir(dir_path)]  
      files.sort()
      label = group_id
      print("processing: {}-{}".format(group_id, newsgroup))
      for filename, filepath in files:
        with open(filepath) as f:
          text = f.read().lower()
          words = re.split('\W+', text)
          if word_count == 'None':
            for word in words:
              word_count[word] += 1
          content = ' '.join(words)
          assert len(content.splitlines()) == 1
          data.append(str(label) + '<fff>' + filename + '<fff>' + content)

    return data

  word_count = defaultdict(int)
  path = "C:\\Users\\pl\\Downloads\\20news-bydate"
  parts = [path +"\\"+ dir_name for dir_name in os.listdir(path)]
  train_path, test_path = (parts[0], parts[1]) if "train" in parts[0] else (parts[1], parts[0])
  newsgroup_list = [newsgroup for newsgroup in os.listdir(train_path)]
  newsgroup_list.sort()
  
  train_data = collect_data_from(
      parent_path = train_path,
      newsgroup_list = newsgroup_list,
      word_count = word_count
    )
  vocab = [word for word, freq in word_count.items() if freq > 10]
  vocab.sort()
  with open("C:\\Users\\pl\\Downloads\\20news-bydate\\vocab-raw.txt",'w') as f:
    f.write('\n'.join(vocab))
  test_data = collect_data_from(
      parent_path = test_path,
      newsgroup_list = newsgroup_list
    )
  with open("C:\\Users\\pl\\Downloads\\20news-bydate\\20news-train-raw.txt",'w') as f:
    f.write('\n'.join(train_data))
  with open("C:\\Users\\pl\\Downloads\\20news-bydate\\20news-test-raw.txt",'w') as f:
    f.write('\n'.join(test_data))

gen_data_and_vocab()

In [0]:
MAX_DOC_LENGTH = 500
unknown_ID = 0
padding_ID = 1

Encode dữ liệu

Các từ trong từ điển được đánh ID = 2,3,4,...,V+2

Các từ không xuất hiện trong từ điển có ID = 0

Các từ rỗng được thêm vào có ID = 1

In [0]:
MAX_DOC_LENGTH = 500
unknown_ID = 0
padding_ID = 1

def encode_data(data_path, vocab_path):
  with open(vocab_path,encoding='latin-1') as f:
    vocab = dict([(word, word_ID + 2) 
                  for word_ID, word in enumerate(f.read().splitlines())])
  with open(data_path,encoding='latin-1') as f :
    documents = f.read().splitlines()
  encoded_data = []
  for document in documents:
    label, doc_id, text = document.split('<fff>')
    words = text.split()[:MAX_DOC_LENGTH]
    sentence_length = len(words)
    encoded_text = []
    for word in words:
      if word in vocab:
        encoded_text.append(str(vocab[word]))
      else:
        encoded_text.append(str(unknown_ID))
    if len(words) < MAX_DOC_LENGTH:
      num_padding = MAX_DOC_LENGTH - len(words)
      for i in range(num_padding):
        encoded_text.append(str(padding_ID))
    encoded_data.append(str(label) + '<fff>' + str(doc_id) + '<fff>' + 
                        str(sentence_length) + '<fff>' + ' '.join(encoded_text))

  dir_name = '/'.join(data_path.split('/')[:-1])
  file_name = '-'.join(data_path.split('/')[-1].split('-')[:-1]) + '-encoded.txt'
  with open(dir_name + '/' +file_name, 'w') as f:
    f.write('\n'.join(encoded_data))

encode_data(data_path='/content/drive/My Drive/Data_Colab/20news-train-raw.txt',
            vocab_path='/content/drive/My Drive/Data_Colab/vocab-raw.txt')
encode_data(data_path='/content/drive/My Drive/Data_Colab/20news-test-raw.txt',
            vocab_path='/content/drive/My Drive/Data_Colab/vocab-raw.txt')

Get data reader

In [0]:
import numpy as np

class DataReader():
  def __init__(self, data, labels, sentence_lengths):
    self._data = data
    self._labels = labels
    self._sentence_lengths = sentence_lengths
    self._num_epoch = 0
    self._batch_id = 0

  def reset(self):
    self._num_epoch = 0
    self._batch_id = 0

  def next_batch(self, batch_size):
    start = self._batch_id * batch_size
    end = start + batch_size
    self._batch_id += 1
    if end + batch_size > len(self._data):
      self._num_epoch += 1
      self._batch_id = 0
      indices = list(range(len(self._data)))
      np.random.seed(2020)
      np.random.shuffle(indices)
      self._data, self._labels, self._sentence_lengths = self._data[indices], self._labels[indices], self._sentence_lengths[indices]
    return self._data[start:end], self._labels[start:end], self._sentence_lengths[start:end]

def load_data(data_path):
  with open(data_path, encoding = 'latin1') as f:
    d_lines = f.read().splitlines()
  data, labels, sentence_lengths = [], [], []
  for line in d_lines:
    features = line.split('<fff>')
    label, doc_id, sentence_len = int(features[0]), int(features[1]), int(features[2])
    vector = [int(ID) for ID in features[3].split()]
    data.append(vector)
    labels.append(label)
    sentence_lengths.append(sentence_len)
  return np.array(data), np.array(labels), np.array(sentence_lengths)
  
train_data, train_labels, train_sentence_lengths = load_data(
    data_path='/content/drive/My Drive/Data_Colab/20news-train-encoded.txt'
)
test_data, test_labels, test_sentence_lengths = load_data(
    data_path='/content/drive/My Drive/Data_Colab/20news-test-encoded.txt'
)

Xây dựng mô hình

In [0]:
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np

MAX_DOC_LENGTH = 500
NUM_CLASSES = 20

class RNN:
  def __init__(self, vocab_size, embedding_size, lstm_size, batch_size):
    self._vocab_size = vocab_size
    self._embedding_size = embedding_size
    self._lstm_size = lstm_size
    self._batch_size = batch_size

    self._data = tf.placeholder(tf.int32, shape=[batch_size, MAX_DOC_LENGTH])
    self._labels = tf.placeholder(tf.int32, shape=[batch_size, ])
    self._sentence_lengths = tf.placeholder(tf.int32, shape=[batch_size, ])
    self._final_tokens = tf.placeholder(tf.int32, shape=[batch_size, ])

  def embedding_layer(self, indices):
    pretrained_vectors = []
    pretrained_vectors.append(np.zeros(self._embedding_size))
    for _ in range(self._vocab_size + 1):
      pretrained_vectors.append(np.random.normal(loc=0, scale=1, size=self._embedding_size))

    pretrained_vectors = np.array(pretrained_vectors)
    self._embedding_matrix = tf.get_variable(
        name='embedding',
        shape=(self._vocab_size + 2,self._embedding_size),
        initializer=tf.constant_initializer(pretrained_vectors)
    )
    return tf.nn.embedding_lookup(self._embedding_matrix, indices)

  def LSTM_layer(self, embeddings):
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._lstm_size)
    zero_state = tf.zeros(shape=(self._batch_size, self._lstm_size))
    initial_state = tf.nn.rnn_cell.LSTMStateTuple(zero_state, zero_state)

    lstm_inputs = tf.unstack(
        tf.transpose(embeddings, perm=[1,0,2])
    )
    lstm_outputs, last_state = tf.nn.static_rnn(
        cell=lstm_cell,
        inputs=lstm_inputs,
        initial_state=initial_state,
        sequence_length=self._sentence_lengths
    )# [num_docs, lstm_size]
    lstm_outputs = tf.unstack(
        tf.transpose(lstm_outputs, perm=[1,0,2])
    )
    lstm_outputs = tf.concat(
        lstm_outputs, axis=0
    )# [num_docs*MAX_SENT_LENGTH, lstm_size]
    mask = tf.sequence_mask(
        lengths=self._sentence_lengths,
        maxlen=MAX_DOC_LENGTH,
        dtype=tf.float32
    )# [num_docs, MAX_SENTENCE_LENGTH]
    mask = tf.concat(tf.unstack(mask, axis=0), axis=0)
    mask = tf.expand_dims(mask, -1)
    lstm_outputs = mask * lstm_outputs
    lstm_outputs_split = tf.split(lstm_outputs, 
                                  num_or_size_splits=self._batch_size)
    lstm_outputs_sum = tf.reduce_sum(lstm_outputs_split, axis=1)#[num_docs, lstm_size]
    lstm_outputs_average = lstm_outputs_sum / tf.expand_dims(
        tf.cast(self._sentence_lengths, tf.float32), -1)#[num_docs, lstm_size]
    return lstm_outputs_average

  def build_graph(self):
    embeddings = self.embedding_layer(self._data)
    lstm_outputs = self.LSTM_layer(embeddings)

    weights = tf.get_variable(
        name='final_layer_weights',
        shape=(self._lstm_size, NUM_CLASSES),
        initializer=tf.random_normal_initializer(seed=2020)
    )
    biases = tf.get_variable(
        name='final_layer_biases',
        shape=(NUM_CLASSES),
        initializer=tf.random_normal_initializer(seed=2020)
    )
    logits = tf.matmul(lstm_outputs, weights) + biases
    labels_one_hot = tf.one_hot(
        indices=self._labels,
        depth=NUM_CLASSES,
        dtype=tf.float32
    )

    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_one_hot,
        logits=logits
    )
    loss = tf.reduce_mean(loss)
    probs = tf.nn.softmax(logits)
    predicted_labels = tf.argmax(probs, axis=1)
    predicted_labels = tf.squeeze(predicted_labels)
    return predicted_labels, loss

  def trainer(self, loss, learning_rate):
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return train_op

#train and compute accuracy
def train_and_evaluate_RNN(lstm_size, batch_size):
  acc=[]
  with open('/content/drive/My Drive/Data_Colab/vocab-raw.txt',encoding='latin-1') as f:
    vocab_size = len(f.read().splitlines())
  train_data_reader = DataReader(
    data=train_data, 
    labels=train_labels, 
    sentence_lengths=train_sentence_lengths
  )
  test_data_reader = DataReader(
    data=test_data, 
    labels=test_labels, 
    sentence_lengths=test_sentence_lengths
  )
  tf.reset_default_graph()
  tf.set_random_seed(2020)
  rnn = RNN(
      vocab_size=vocab_size,
      embedding_size=300,
      lstm_size=lstm_size,
      batch_size=batch_size
  )
  predicted_labels, loss = rnn.build_graph()
  train_op = rnn.trainer(loss=loss, learning_rate=0.01)
  
  with tf.Session() as sess:
    step = 0
    MAX_STEP = 1e6

    sess.run(tf.global_variables_initializer())
    while step < MAX_STEP:
      next_train_batch = train_data_reader.next_batch(batch_size)
      data, labels, sentence_lengths = next_train_batch
      plabels_eval, loss_eval, _ = sess.run(
          [predicted_labels, loss, train_op],
          feed_dict={
              rnn._data: data,
              rnn._labels: labels,
              rnn._sentence_lengths: sentence_lengths,
          }
      )
      step += 1
      # if step % 20 == 0:
      #   print('loss:', loss_eval)
      if train_data_reader._batch_id == 0:
        num_true_preds = 0
        while True:
          next_test_batch = test_data_reader.next_batch(batch_size)
          data, labels, sentence_lengths = next_test_batch
          test_plabels_eval = sess.run(
              predicted_labels,
              feed_dict={
                  rnn._data: data,
                  rnn._labels: labels,
                  rnn._sentence_lengths: sentence_lengths,
              }
          )
          matches = np.equal(test_plabels_eval, labels)
          num_true_preds += np.sum(matches.astype(float))

          if test_data_reader._batch_id == 0:
            break
        print('Epoch:', train_data_reader._num_epoch)
        print('Accuracy on test data:', num_true_preds*100./len(test_data_reader._data))
        acc.append(num_true_preds*100./len(test_data_reader._data))
  return acc

acc = train_and_evaluate_RNN(
     lstm_size=80, 
     batch_size=10)

acc = [str(i) for i in acc]
with open('/content/drive/My Drive/Data_Colab/RNN_acc.txt') as f:
  f.write('\n'.join(acc))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Epoch: 1
Accuracy on test data: 3.186404673393521
Epoch: 2
Accuracy on test data: 75.7700477960701
Epoch: 3
Accuracy on test data: 76.9383961763144
Epoch: 4
Accuracy on test data: 76.72596919808815
Epoch: 5
Accuracy on test data: 77.72172065852364
Epoch: 6
Accuracy on test data: 77.33669676048858
E

Cross validation tim best lstm size and batch size

In [3]:
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

with open('/content/drive/My Drive/Data_Colab/vocab-raw.txt',encoding='latin-1') as f:
  vocab_size = len(f.read().splitlines())

def get_the_best_parameter(data, labels, sentence_lengths):
  
  def compute_RSS(Y_new,Y_predicted):
    return (1/Y_new.shape[0])*np.sum((Y_new-Y_predicted)**2)

  def cross_validation(num_folds, lstm_size, batch_size):
    row_ids=np.array(range(data.shape[0]))
    valid_ids=np.split(row_ids[:len(row_ids)-len(row_ids)%num_folds],num_folds)
    valid_ids[-1]=np.append(valid_ids[-1],row_ids[len(row_ids)-len(row_ids)%num_folds:])
    train_ids=[[k for k in row_ids if k not in valid_ids[i]] for i in range(num_folds)]
    aver_RSS = 0
    tf.reset_default_graph()
    tf.set_random_seed(2020)
    rnn = RNN(
        vocab_size=vocab_size,
        embedding_size=300,
        lstm_size=lstm_size,
        batch_size=batch_size
    )
    predicted_labels, loss = rnn.build_graph()
    train_op = rnn.trainer(loss=loss, learning_rate=1)
    for i in range(num_folds):
      valid_part={'data':data[valid_ids[i]], 'labels':labels[valid_ids[i]], 'sentence_lengths':sentence_lengths[valid_ids[i]]}
      train_part={'data':data[train_ids[i]], 'labels':labels[train_ids[i]], 'sentence_lengths':sentence_lengths[train_ids[i]]}
      train_data_reader = DataReader(
          data=train_part['data'], 
          labels=train_part['labels'], 
          sentence_lengths=train_part['sentence_lengths']
      )
      valid_data_reader = DataReader(
          data=valid_part['data'], 
          labels=valid_part['labels'], 
          sentence_lengths=valid_part['sentence_lengths']
      )
      with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
          batch_data, batch_labels, batch_sentence_lengths = train_data_reader.next_batch(batch_size)
          plabels_eval, loss_eval, _ = sess.run(
              [predicted_labels, loss, train_op],
              feed_dict={
                  rnn._data: batch_data,
                  rnn._labels: batch_labels,
                  rnn._sentence_lengths: batch_sentence_lengths,
              }
          )
          if train_data_reader._batch_id == 0 and train_data_reader._num_epoch == 2:
            break

        RSS = 0
        while True:
          batch_data, batch_labels, batch_sentence_lengths = valid_data_reader.next_batch(batch_size)
          valid_plabels_eval = sess.run(
              predicted_labels,
              feed_dict={
                  rnn._data: batch_data,
                  rnn._labels: batch_labels,
                  rnn._sentence_lengths: batch_sentence_lengths,
              }
          )
          RSS += compute_RSS(batch_labels, valid_plabels_eval)
          if valid_data_reader._batch_id == 0:
            break
      print('RSS:',RSS)
      aver_RSS += RSS
    print('aver RSS:', aver_RSS)
    return aver_RSS/num_folds
  
  def range_scan(lstm_size_values, batch_size_values):
    best_lstm_size = 50
    min_RSS = 1e8
    for current_lstm_size in lstm_size_values:
      aver_RSS = cross_validation(
          num_folds=5, 
          lstm_size=current_lstm_size, 
          batch_size=50
      )
      if aver_RSS<min_RSS:
        best_lstm_size = current_lstm_size
        min_RSS=aver_RSS

    best_batch_size = 50
    min_RSS = 1e8
    for current_batch_size in batch_size_values:
      aver_RSS = cross_validation(
          num_folds=5, 
          lstm_size=50, 
          batch_size=current_batch_size
      )
      if aver_RSS<min_RSS:
        best_batch_size = current_batch_size
        min_RSS=aver_RSS
        return best_lstm_size, best_batch_size

  lstm_size_values = [i*10 for i in range(1,10)]
  batch_size_values = lstm_size_values
  best_lstm_size, best_batch_size = range_scan(lstm_size_values, batch_size_values)
  return best_lstm_size, best_batch_size

best_lstm_size, best_batch_size = get_the_best_parameter(train_data, train_labels, train_sentence_lengths)
print("best lstm size:", best_lstm_size)
print("best batch size:", best_batch_size)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

RSS: 5189.279999999999
RSS: 2491.3999999999996
RSS: 1685.68
RSS: 2477.38
RSS: 5943.620000000001
aver RSS: 17787.36
RSS: 5139.280000000001
RSS: 2347.4
RSS: 2172.8
RSS: 2511.320000000001
RSS: 4460.38
aver RSS: 16631.18
RSS: 5846.960000000002
RSS: 3090.960000000001
RSS: 1868.9999999999995
RSS: 3144.68