- Đọc dữ liệu từ files, không loại bỏ stop word mà chỉ normalize và loại bỏ những kí tự đặc biệt.

- Xây dụng từ điển, loại bỏ các từ xuất hiện trong ít văn bản (cụ thể ở đây loại bỏ những từ có document frequence <=10).

In [None]:
from collections import defaultdict
import re
import os
def gen_data_and_vocab():
  def collect_data_from(parent_path, newsgroup_list, word_count = 'None'):
    data = []
    for group_id, newsgroup in enumerate(newsgroup_list):
      dir_path = parent_path + '\\' + newsgroup +'\\'
      files = [(filename, dir_path + filename) for filename in os.listdir(dir_path)]
      files.sort()
      label = group_id
      print("processing: {}-{}".format(group_id, newsgroup))
      for filename, filepath in files:
        with open(filepath) as f:
          text = f.read().lower()
          words = re.split('\W+', text)
          if word_count == 'None':
            for word in words:
              word_count[word] += 1
          content = ' '.join(words)
          assert len(content.splitlines()) == 1
          data.append(str(label) + '<fff>' + filename + '<fff>' + content)

    return data

  word_count = defaultdict(int)
  path = "C:\\Users\\pl\\Downloads\\20news-bydate"
  parts = [path +"\\"+ dir_name for dir_name in os.listdir(path)]
  train_path, test_path = (parts[0], parts[1]) if "train" in parts[0] else (parts[1], parts[0])
  newsgroup_list = [newsgroup for newsgroup in os.listdir(train_path)]
  newsgroup_list.sort()
  
  train_data = collect_data_from(
      parent_path = train_path,
      newsgroup_list = newsgroup_list,
      word_count = word_count
    )
  vocab = [word for word, freq in word_count.items() if freq > 10]
  vocab.sort()
  with open("C:\\Users\\pl\\Downloads\\20news-bydate\\vocab-raw.txt",'w') as f:
    f.write('\n'.join(vocab))
  test_data = collect_data_from(
      parent_path = test_path,
      newsgroup_list = newsgroup_list
    )
  with open("C:\\Users\\pl\\Downloads\\20news-bydate\\20news-train-raw.txt",'w') as f:
    f.write('\n'.join(train_data))
  with open("C:\\Users\\pl\\Downloads\\20news-bydate\\20news-test-raw.txt",'w') as f:
    f.write('\n'.join(test_data))

gen_data_and_vocab()

- Mã hóa mỗi từ trong từ điển thành 1 số id tương ứng (từ 2 đến kích thước từ điển - 1), những từ không có trong từ điển sẽ có id là 1.

- Mỗi văn bản sẽ được chuyển thành vector có độ dài cố định (độ dài MAX_DOC_LENGTH = 500 từ) các từ trong văn bản sẽ được giữ nguyên vị trí và được thay thế bằng id của nó. 

- Trong trường hợp số từ trong văn bản nhỏ hơn MAX_DOC_LENGTH sẽ thêm các từ padding có id là 0.

In [None]:
MAX_DOC_LENGTH = 500
unknown_ID = 1
padding_ID = 0

def encode_data(data_path, vocab_path):
  with open(vocab_path,encoding='latin-1') as f:
    vocab = dict([(word, word_ID + 2) 
                  for word_ID, word in enumerate(f.read().splitlines())])
  with open(data_path,encoding='latin-1') as f :
    documents = f.read().splitlines()
  encoded_data = []
  for document in documents:
    label, doc_id, text = document.split('<fff>')
    words = text.split()[:MAX_DOC_LENGTH]
    sentence_length = len(words)
    encoded_text = []
    for word in words:
      if word in vocab:
        encoded_text.append(str(vocab[word]))
      else:
        encoded_text.append(str(unknown_ID))
    if len(words) < MAX_DOC_LENGTH:
      num_padding = MAX_DOC_LENGTH - len(words)
      for i in range(num_padding):
        encoded_text.append(str(padding_ID))
    encoded_data.append(str(label) + '<fff>' + str(doc_id) + '<fff>' + 
                        str(sentence_length) + '<fff>' + ' '.join(encoded_text))

  dir_name = '/'.join(data_path.split('/')[:-1])
  file_name = '-'.join(data_path.split('/')[-1].split('-')[:-1]) + '-encoded.txt'
  with open(dir_name + '/' +file_name, 'w') as f:
    f.write('\n'.join(encoded_data))

encode_data(data_path='/content/drive/My Drive/Data_Colab/20news-train-raw.txt',
            vocab_path='/content/drive/My Drive/Data_Colab/vocab-raw.txt')
encode_data(data_path='/content/drive/My Drive/Data_Colab/20news-test-raw.txt',
            vocab_path='/content/drive/My Drive/Data_Colab/vocab-raw.txt')

In [2]:
import torch
import numpy as np

def load_data(data_path):
  with open(data_path, encoding = 'latin1') as f:
    d_lines = f.read().splitlines()
  data, labels, sentence_lengths = [], [], []
  for line in d_lines:
    features = line.split('<fff>')
    label, doc_id, sentence_len = int(features[0]), int(features[1]), int(features[2])
    vector = [int(ID) for ID in features[3].split()]
    data.append(vector)
    labels.append(label)
    sentence_lengths.append(sentence_len)
  return torch.tensor(data), torch.tensor(labels), torch.tensor(sentence_lengths)
  
train_data, train_labels, train_sentence_lengths = load_data(
    data_path='/content/drive/My Drive/Data_Colab/20news-train-encoded.txt'
)
test_data, test_labels, test_sentence_lengths = load_data(
    data_path='/content/drive/My Drive/Data_Colab/20news-test-encoded.txt'
)
with open('/content/drive/My Drive/Data_Colab/vocab-raw.txt', encoding = 'latin1') as f:
  vocab_size = len(f.read().splitlines())

# Convolutional Neural Network (CNN)

- Đầu vào đi qua embedding layer thu được ma trận word embedding
- Đầu tiên đi qua convolutional layer với 500 5x300 filters, stride 1
- Tiếp theo lấy Maxpool theo cả vector chiều thứ 3 của đầu ra
- Dropout với xác suất 0.5 để làm giảm overfit.
- Cuối cùng đi qua full connected neural network layer đầu ra là số nhãn lớp để thực hiện dự đoán

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import numpy as np

EMBEDDING_SIZE = 300
MAX_DOC_LENGTH = 500
NUM_CLASSES = 20

class CNN(nn.Module):
  def __init__(self, vocab_size, embedding_size, batch_size):
    super(CNN, self).__init__()
    self._vocab_size = vocab_size
    self._embedding_size = embedding_size
    self._batch_size = batch_size
    self.build_graph()

  def build_graph(self):
    self._embedding_layer = nn.Embedding(self._vocab_size+2, self._embedding_size)
    self._convolutional_layer = nn.Sequential(
        nn.Conv2d(1, 500, kernel_size=(5, self._embedding_size)),
        nn.ReLU()
    )
    self._full_connected_layer = nn.Linear(MAX_DOC_LENGTH, NUM_CLASSES)
    self._loss_function = nn.CrossEntropyLoss()
    self._dropout = nn.Dropout(p=0.5)

  def forward(self, inp):
    embeddings = self._embedding_layer(inp).unsqueeze(1)
    outputs = self._convolutional_layer(embeddings).squeeze(3)
    outputs = F.max_pool1d(outputs, kernel_size=outputs.size(2)).squeeze(2)
    outputs = self._dropout((outputs))
    outputs = self._full_connected_layer(outputs)
    return outputs

  def fit(self, train_data, train_labels, max_epochs=50 ,learning_rate=0.01, threshold=1e-3):
    data_set = TensorDataset(train_data, train_labels)
    data_loader = DataLoader(data_set, batch_size = self._batch_size, shuffle = True)
    opt = torch.optim.Adam(params = self.parameters(), lr = learning_rate)
    self.train()
    last_loss = 0
    for epoch in range(max_epochs):
      new_loss = 0
      for data,labels in data_loader:
        opt.zero_grad()
        prediced = self.forward(data)
        loss = self._loss_function(prediced, labels)
        new_loss += loss
        loss.backward()
        opt.step()
      new_loss = new_loss / len(data_loader)
      print('epoch: {}, loss: {}'.format(epoch, new_loss))
      print('test accuracy:',self.predict_and_compute_accuracy(test_data, test_labels))
      if abs(last_loss - new_loss) <= threshold:
        return
      last_loss=new_loss

  def predict_and_compute_accuracy(self, test_data, test_labels):
    data_set = TensorDataset(test_data, test_labels)
    data_loader = DataLoader(data_set, batch_size = self._batch_size, shuffle = False)
    num_true_predict = 0
    for data, labels in data_loader:
      predicted = torch.argmax(self.forward(data), dim = 1)
      num_true_predict += sum((predicted == labels).float())
    return num_true_predict*100./len(test_data)

In [None]:
from time import time

t=time()
cnn = CNN(
    vocab_size=vocab_size, 
    embedding_size=475,  
    batch_size=5
)
cnn.fit(
    train_data = train_data,
    train_labels = train_labels,
    learning_rate=0.01,
    threshold=1e-3
)
print('training time:',time()-t,'s')
print('train accuracy:', cnn.predict_and_compute_accuracy(train_data, train_labels))

# Recurrent Neural Network (RNN)

- Đầu vào đi qua embedding layer thu được ma trận word embedding
- Long Short Term Memory layer, lấy trung bình của các hidden state của các thuộc tính không phải padding
- Dropout với xác suất 0.5 để làm giảm overfit
- Fully connected layer cho đầu ra là số nhãn lớp để chọn ra nhãn lớp có giá trị cao nhất

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import numpy as np

MAX_DOC_LENGTH = 500
NUM_CLASSES = 20

class RNN(nn.Module):
  def __init__(self, vocab_size, embedding_size, lstm_size, batch_size):
    super().__init__()
    self._vocab_size = vocab_size
    self._embedding_size = embedding_size
    self._lstm_size = lstm_size
    self._batch_size = batch_size
    self.build_graph()

  def build_graph(self):
    self._embedding_layer = nn.Embedding(self._vocab_size+2, self._embedding_size)
    self._LSTM_layer = nn.LSTM(self._embedding_size, self._lstm_size, batch_first=True)
    self._full_connected_layer = nn.Linear(self._lstm_size, NUM_CLASSES)
    self._dropout = nn.Dropout(p=0.5)

  def forward(self, data, sentence_lengths):
    h = torch.zeros(1, data.size(0), self._lstm_size)
    c = torch.zeros(1, data.size(0), self._lstm_size)
    embeddings = self._embedding_layer(data)  
    lstm_outputs, (hidden, cell)  = self._LSTM_layer(embeddings, (h, c))
    lstm_outputs = [lstm_outputs[i][:sentence_lengths[i]].mean(0) for i in range(data.size(0))]
    lstm_outputs = torch.stack(lstm_outputs)
    outputs = self._dropout(lstm_outputs)
    outputs = self._full_connected_layer(lstm_outputs)
    return outputs

  def fit(self, train_data, train_labels, train_sentence_lengths, max_epochs=50, learning_rate=0.01, threshold=1e-3):
    data_set = TensorDataset(train_data, train_labels, train_sentence_lengths)
    data_loader = DataLoader(data_set, batch_size = self._batch_size, shuffle = True)
    opt = torch.optim.Adam(params = self.parameters(), lr = learning_rate)
    self._loss_function = nn.CrossEntropyLoss()
    self.train()
    last_loss = 0
    for epoch in range(max_epochs):
      new_loss = 0
      for data, labels, sentence_lengths in data_loader:
        opt.zero_grad()
        prediced = self.forward(data, sentence_lengths)
        loss = self._loss_function(prediced, labels)
        new_loss += loss
        loss.backward()
        opt.step()
      new_loss = new_loss / len(data_loader)
      #print('epoch: {}, loss: {}'.format(epoch, new_loss))
      #print('test accuracy:',self.predict_and_compute_accuracy(test_data, test_labels, test_sentence_lengths))
      if abs(last_loss - new_loss) <= threshold:
        return
      last_loss=new_loss

  def predict_and_compute_accuracy(self, test_data, test_labels, test_sentence_lengths):
    data_set = TensorDataset(test_data, test_labels, test_sentence_lengths)
    data_loader = DataLoader(data_set, batch_size = self._batch_size, shuffle = False)
    num_true_predict = 0
    for data, labels, sentence_lengths in data_loader:
      predicted = torch.argmax(self.forward(data, sentence_lengths), dim = 1)
      num_true_predict += sum((predicted == labels).float())
    return num_true_predict*100./len(test_data)

In [None]:
from time import time

t=time()
rnn = RNN(
    vocab_size=vocab_size, 
    embedding_size=475, 
    lstm_size=90, 
    batch_size=5
)
rnn.fit(
    train_data = train_data,
    train_labels = train_labels,
    train_sentence_lengths = train_sentence_lengths,
    learning_rate=0.01, 
    threshold=1e-3
)
print('training time:',time()-t,'s')
print('train accuracy:', rnn.predict_and_compute_accuracy(train_data, train_labels, train_sentence_lengths))

Cross Validation tìm tham số tối ưu cho RNN

In [4]:
def get_the_best_parameter():
  
  def cross_validation(num_folds,embedding_size, lstm_size, batch_size):
    aver_acc = 0
    data_set = TensorDataset(train_data, train_labels, train_sentence_lengths)
    data_loader = DataLoader(data_set, batch_size = int(train_data.size(0)/num_folds), shuffle = True)
    data_loader = list(data_loader)[:num_folds]
    for data, labels, sentence_lengths in data_loader:
      data_fold_set = TensorDataset(data, labels, sentence_lengths)
      data_fold_loader = DataLoader(data_fold_set, batch_size = int(data.size(0)/2), shuffle = True)
      data_fold_loader = list(data_fold_loader)
      train_data_fold, valid_data_fold = data_fold_loader[0], data_fold_loader[1]
      rnn = RNN(
          vocab_size=vocab_size, 
          embedding_size=embedding_size, 
          lstm_size=lstm_size, 
          batch_size=batch_size
      )
      rnn.fit(
          train_data = train_data_fold[0],
          train_labels = train_data_fold[1],
          train_sentence_lengths = train_data_fold[2],
          max_epochs=2,
          learning_rate=0.01, 
          threshold=1e-3
      )
      acc = rnn.predict_and_compute_accuracy(
          test_data = valid_data_fold[0], 
          test_labels = valid_data_fold[1], 
          test_sentence_lengths = valid_data_fold[2]
      )
      print("acc:",acc)
      aver_acc += acc
    print('aver acc:', aver_acc)
    return aver_acc/num_folds
  
  def range_scan(embedding_size_values, lstm_size_values, batch_size_values):
    best_embedding_size = 300
    max_acc = 0
    for current_embedding_size in embedding_size_values:
      aver_acc = cross_validation(
          num_folds=5, 
          embedding_size=current_embedding_size,
          lstm_size=50, 
          batch_size=50
      )
      if aver_acc>max_acc:
        best_embedding_size = current_embedding_size
        max_acc=aver_acc
    print("best embedding size:", best_embedding_size)
    best_lstm_size = 50
    max_acc = 0
    for current_lstm_size in lstm_size_values:
      aver_acc = cross_validation(
          num_folds=5,
          embedding_size=300,
          lstm_size=current_lstm_size, 
          batch_size=50
      )
      if aver_acc>max_acc:
        best_lstm_size = current_lstm_size
        max_acc=aver_acc
    print("best lstm size:", best_lstm_size)
    best_batch_size = 50
    max_acc = 0
    for current_batch_size in batch_size_values:
      aver_acc = cross_validation(
          num_folds=5, 
          embedding_size=300,
          lstm_size=50, 
          batch_size=current_batch_size
      )
      if aver_acc>max_acc:
        best_batch_size = current_batch_size
        max_acc=aver_acc
    print("best batch size:", best_batch_size)
    return best_embedding_size, best_lstm_size, best_batch_size

  embedding_size_values = [i*25 for i in range(1,20)]
  lstm_size_values = [i*5 for i in range(1,20)]
  batch_size_values = [i*5 for i in range(1,20)]
  best_embedding_size, best_lstm_size, best_batch_size = range_scan(embedding_size_values, lstm_size_values, batch_size_values)
  return best_embedding_size, best_lstm_size, best_batch_size

best_embedding_size, best_lstm_size, best_batch_size = get_the_best_parameter()
print("best embedding size:", best_embedding_size)
print("best lstm size:", best_lstm_size)
print("best batch size:", best_batch_size)

acc: tensor(12.2900)
acc: tensor(11.9363)
acc: tensor(10.4332)
acc: tensor(14.3236)
acc: tensor(11.9363)
aver acc: tensor(60.9195)
acc: tensor(12.8205)
acc: tensor(17.7719)
acc: tensor(14.7657)
acc: tensor(14.5889)
acc: tensor(18.7445)
aver acc: tensor(78.6914)
acc: tensor(19.3634)
acc: tensor(14.7657)
acc: tensor(17.9487)
acc: tensor(16.2688)
acc: tensor(22.1043)
aver acc: tensor(90.4509)
acc: tensor(23.1653)
acc: tensor(18.5676)
acc: tensor(22.2812)
acc: tensor(21.2202)
acc: tensor(24.8453)
aver acc: tensor(110.0796)
acc: tensor(23.8727)
acc: tensor(25.9063)
acc: tensor(24.2263)
acc: tensor(23.5190)
acc: tensor(25.1105)
aver acc: tensor(122.6348)
acc: tensor(26.3484)
acc: tensor(21.3086)
acc: tensor(20.4244)
acc: tensor(20.2476)
acc: tensor(27.6746)
aver acc: tensor(116.0035)
acc: tensor(24.0495)
acc: tensor(25.8179)
acc: tensor(29.6198)
acc: tensor(23.0769)
acc: tensor(22.3696)
aver acc: tensor(124.9337)
acc: tensor(31.1229)
acc: tensor(29.9735)
acc: tensor(33.5102)
acc: tensor(29.0