## TextRNN



### What is TextRNN?

TextRNN 是利用循环神经网络 (RNN) 解决文本分类问题的一种方法。它常用于处理序列数据，比如文本序列。TextRNN 的应用非常广泛，如垃圾邮件分类、情感分析、新闻分类、问句分类等。

### Principle of TextRNN

TextRNN 是基于 RNN 的架构，通常使用 RNN 的变体如 LSTM 或 GRU 来捕捉文本序列中的上下文信息。文本在输入模型之前被分词，处理为固定长度的序列，并通过嵌入层将每个词映射为一个向量。RNN 在每个时间步处理一个词，更新其隐藏状态，直到处理完整个文本序列。最后的隐藏状态或经过某些处理后的状态用于文本分类。

### Architecture

![image.png](attachment:image.png)

### Summary

TextRNN 在文本分类任务中表现优秀，但其训练速度相对较慢。一般来说，两层 RNN 就足够了。可以通过修改 LSTM 单元、加入 Dropout 或 Batch Normalization 等方法来提升模型性能。

## Implementation

### Data Loading

In [6]:
import sys
from collections import Counter
import numpy as np
import tensorflow.keras as kr

In [7]:
# Detect the interpreter's major version once; the helpers below branch on it.
is_py3 = sys.version_info[0] > 2
if not is_py3:
    # Python 2 only: switch the process-wide default string encoding to UTF-8.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    

def native_word(word, encoding='utf-8'):
    """Return ``word`` in the running interpreter's native string form.

    On Python 3 the value is returned unchanged; on Python 2 it is encoded
    with ``encoding``.
    """
    return word if is_py3 else word.encode(encoding)

def native_content(content):
    """Return ``content`` as text.

    On Python 3 the value is returned unchanged; on Python 2 it is decoded
    from UTF-8.
    """
    if is_py3:
        return content
    return content.decode('utf-8')
    
def open_file(filename, mode='r'):
    """Open ``filename`` with ``mode`` and return the file object.

    On Python 3 the file is opened as UTF-8 with undecodable bytes ignored;
    on Python 2 the plain built-in ``open`` is used.
    """
    if not is_py3:
        return open(filename, mode)
    return open(filename, mode, encoding='utf-8', errors='ignore')
    
def read_file(filename):
    """Read a ``label<TAB>content`` file into parallel lists.

    Args:
        filename: Path of the file to read; one sample per line.

    Returns:
        A ``(contents, labels)`` tuple. ``contents[i]`` is the list of
        characters of the i-th kept sample and ``labels[i]`` is its label.

    Lines that do not split into exactly two tab-separated fields, that
    cannot be decoded, or whose content is empty are skipped.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if not content:
                    continue
                # Convert both fields before appending so a failure on either
                # one can never leave the two lists with different lengths.
                chars = list(native_content(content))
                label = native_content(label)
            except ValueError:
                # Wrong field count, or a UnicodeDecodeError (a ValueError
                # subclass) on Python 2: skip the malformed line only.
                continue
            contents.append(chars)
            labels.append(label)
    return contents, labels

def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training file and save it.

    Args:
        train_dir: Path of the training file, read with ``read_file``.
        vocab_dir: Path of the vocabulary file to write, one entry per line.
        vocab_size: Total number of entries including the leading ``<PAD>``.

    The file starts with ``<PAD>`` followed by the ``vocab_size - 1`` most
    frequent characters. An empty training set produces a vocabulary that
    contains only ``<PAD>``.
    """
    data_train, _ = read_file(train_dir)
    counter = Counter()
    for content in data_train:
        counter.update(content)
    # A comprehension (rather than unpacking zip(*pairs)) also works when
    # there are no pairs at all.
    frequent = [word for word, _ in counter.most_common(vocab_size - 1)]
    words = ['<PAD>'] + frequent
    # The context manager guarantees the vocabulary is flushed and closed.
    with open_file(vocab_dir, mode='w') as fp:
        fp.write('\n'.join(words) + '\n')
    
    
def read_vocab(vocab_dir):
    """Load the vocabulary file written by ``build_vocab``.

    Returns:
        A ``(words, word_to_id)`` tuple: the stripped lines in file order and
        a dict mapping each word to its line index.
    """
    with open_file(vocab_dir) as fp:
        words = [native_content(line.strip()) for line in fp]
    word_to_id = {word: index for index, word in enumerate(words)}
    return words, word_to_id

def read_category():
    """Return the fixed category list and its name-to-id mapping.

    Returns:
        A ``(categories, cat_to_id)`` tuple: the ten category names and a
        dict mapping each name to its position in that list.
    """
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    categories = [native_content(x) for x in categories]
    cat_to_id = dict(zip(categories, range(len(categories))))
    # Fixed: the original returned the undefined name ``bat_to_id``.
    return categories, cat_to_id

def to_words(content, words):
    """Turn a sequence of ids back into text by concatenating ``words[id]``."""
    tokens = [words[index] for index in content]
    return ''.join(tokens)

def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a labelled text file into padded id arrays.

    Args:
        filename: Path of the file, read with ``read_file``.
        word_to_id: Mapping from character to vocabulary id; characters that
            are not in the mapping are dropped.
        cat_to_id: Mapping from label to class id.
        max_length: Sequence length passed to Keras ``pad_sequences``.

    Returns:
        ``(x_pad, y_pad)``: the id sequences after ``pad_sequences`` and the
        labels one-hot encoded over ``len(cat_to_id)`` classes.
    """
    contents, labels = read_file(filename)

    data_id = [
        [word_to_id[char] for char in content if char in word_to_id]
        for content in contents
    ]
    label_id = [cat_to_id[label] for label in labels]

    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    """Yield shuffled mini-batches of ``(x, y)``.

    Args:
        x: Array of samples, indexable by an integer index array.
        y: Array of labels with the same length as ``x``.
        batch_size: Maximum number of samples per batch.

    Yields:
        ``(x_batch, y_batch)`` pairs. Every batch holds ``batch_size`` samples
        except possibly the last one. Empty input yields no batches.
    """
    data_len = len(x)
    # One shared permutation keeps every sample aligned with its label.
    indices = np.random.permutation(data_len)
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    # Stepping by batch_size avoids the float-based batch count, which
    # produced a single zero-length batch when the input was empty.
    for start_id in range(0, data_len, batch_size):
        end_id = min(start_id + batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]

    

### TextRNN Model

In [8]:
import tensorflow as tf

class TRNNConfig(object):
    """Hyper-parameters and run settings consumed by TextRNN and train()."""

    # --- input / embedding ---
    embedding_dim = 64      # width of each embedding vector
    seq_length = 600        # fixed length of every input id sequence
    num_classes = 10        # number of output categories
    vocab_size = 5000       # rows of the embedding table

    # --- recurrent stack ---
    num_layers = 2          # number of stacked recurrent cells
    hidden_dim = 128        # units per recurrent cell and in the fc1 layer
    rnn = 'gru'             # 'lstm' selects LSTM cells, anything else GRU

    # --- optimisation ---
    dropout_keep_prob = 0.8  # keep probability fed during training steps
    learning_rate = 0.001    # Adam learning rate
    batch_size = 128         # samples per training batch
    num_epochs = 10          # passes over the training data

    # --- reporting cadence, counted in batches ---
    print_per_batch = 100   # evaluate and print every N batches
    save_per_batch = 10     # write a TensorBoard summary every N batches
    
    
class TextRNN(object):
    """Multi-layer RNN text classifier built as a TensorFlow 1.x graph.

    Constructing an instance creates the placeholders ``input_x``,
    ``input_y`` and ``keep_prob`` and then builds the graph, exposing
    ``logits``, ``y_pred_cls``, ``loss``, ``optim`` and ``acc``.
    """

    def __init__(self, config):
        """Create the input placeholders and build the graph.

        Args:
            config: Object providing the attributes read below (seq_length,
                num_classes, vocab_size, embedding_dim, num_layers,
                hidden_dim, rnn, learning_rate), e.g. a ``TRNNConfig``.
        """
        self.config = config

        # Token ids, one row of seq_length ids per sample.
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        # One-hot labels over num_classes.
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        # Dropout keep probability, fed at run time.
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.rnn()

    def rnn(self):
        """Build embedding, recurrent stack, classifier, loss and accuracy."""

        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)

        def gru_cell():
            return tf.contrib.rnn.GRUCell(self.config.hidden_dim)

        def dropout():
            # One recurrent cell (LSTM or GRU per config) with output dropout.
            if (self.config.rnn == 'lstm'):
                cell = lstm_cell()
            else:
                cell = gru_cell()
            return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("rnn"):
            cells = [dropout() for _ in range(self.config.num_layers)]
            rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

            # Fixed: the result was bound to ``_output`` but then read as
            # ``outputs``, which raised NameError while building the graph.
            outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
            # Keep only the output of the last time step for classification.
            last = outputs[:, -1, :]

        with tf.name_scope("score"):
            fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)

        with tf.name_scope("optimize"):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)

            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        

### Model Training

In [9]:
from __future__ import print_function
import os
import sys
import time
from datetime import timedelta
import numpy as np
import tensorflow as tf
from sklearn import metrics

In [10]:
# Root folder of the data set; it ends with a separator, so the joins below
# produce exactly the same strings as plain concatenation would.
base_path = "/Users/dengkai/workspace/machine-learning/NLP/data_set/nlp/fastText/"
dir_name = "THUCNews/"
path = os.path.join(base_path, dir_name)

# Input files: train / test / validation splits and the vocabulary.
train_dir, test_dir, val_dir, vocab_dir = (
    os.path.join(path, file_name)
    for file_name in ('cnews.train.txt', 'cnews.test.txt', 'cnews.val.txt', 'cnews.vocab.txt')
)

# Folder and path prefix used when saving the best checkpoint.
save_dir = os.path.join(base_path, "textrnn/")
save_path = os.path.join(save_dir, 'best_validation')

def get_time_dif(start_time):
    """Return the time elapsed since ``start_time`` as a ``timedelta``.

    ``start_time`` is a ``time.time()`` timestamp; the elapsed time is
    rounded to whole seconds.
    """
    elapsed = time.time() - start_time
    whole_seconds = int(round(elapsed))
    return timedelta(seconds=whole_seconds)

def feed_data(x_batch, y_batch, keep_prob):
    """Build the feed dict binding one batch to the global model's placeholders."""
    return {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }

def evaluate(sess, x_, y_):
    """Compute mean loss and accuracy of the global model over a data set.

    Args:
        sess: Session in which the model's variables are initialised.
        x_: Input id sequences.
        y_: One-hot labels aligned with ``x_``.

    Returns:
        ``(loss, accuracy)`` averaged over all samples, with each batch
        weighted by its size.
    """
    # Fixed: the original read the undefined names ``x``, ``batch_inter`` and
    # ``btach_len`` and passed the misspelled keyword ``fedd_dict``.
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        # keep_prob 1.0 disables dropout during evaluation.
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len

def train():
    """Train the global model, checkpointing on validation improvement.

    Reads the module-level ``model``, ``config``, ``word_to_id``,
    ``cat_to_id`` and the path variables. Every ``config.save_per_batch``
    batches a summary is written for TensorBoard. Every
    ``config.print_per_batch`` batches the model is evaluated on the
    validation set and saved to ``save_path`` when validation accuracy
    improves. Training stops early after 1000 batches without improvement.
    """
    print("Configuring TensorBoard and Saver...")
    tensorboard_dir = 'tensorboard/textrnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    # Fixed: was the misspelled ``tf.sumary``.
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")

    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    session = tf.Session()
    try:
        session.run(tf.global_variables_initializer())
        writer.add_graph(session.graph)

        print("Training and evaluatin...")
        start_time = time.time()
        total_batch = 0
        # Fixed: this was initialised as ``total_acc_val`` while the loop
        # compared against the undefined ``best_accval``.
        best_acc_val = 0.0
        last_improved = 0
        require_improvement = 1000

        flag = False
        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

                if total_batch % config.save_per_batch == 0:
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)
                if total_batch % config.print_per_batch == 0:
                    # Report metrics with dropout disabled.
                    feed_dict[model.keep_prob] = 1.0
                    loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                    loss_val, acc_val = evaluate(session, x_val, y_val)

                    if acc_val > best_acc_val:
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        improved_str = '*'
                    else:
                        improved_str = ''
                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                        + 'Val loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

                # Restore the training keep probability before the update step.
                feed_dict[model.keep_prob] = config.dropout_keep_prob
                session.run(model.optim, feed_dict=feed_dict)
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break
            if flag:
                break
    finally:
        # Flush pending summaries and release the session's resources even
        # if training is interrupted.
        writer.close()
        session.close()


            
def test():
    """Evaluate the checkpoint at ``save_path`` on the test set.

    Reads the module-level ``model``, ``config``, ``word_to_id``,
    ``cat_to_id`` and the path variables. Prints test loss and accuracy, a
    per-class precision/recall/F1 report, the confusion matrix and the
    elapsed time.
    """
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)
    session = tf.Session()
    try:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)

        print('Testing...')

        # Fixed: the original passed the undefined name ``x_text``.
        loss_test, acc_test = evaluate(session, x_test, y_test)
        # Fixed: the original format string had misplaced braces.
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        batch_size = 128
        data_len = len(x_test)
        num_batch = int((data_len - 1) / batch_size) + 1

        y_test_cls = np.argmax(y_test, 1)
        y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
        # Predict in fixed order (no shuffling) so predictions stay aligned
        # with y_test_cls.
        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
    finally:
        session.close()

    # Category names ordered by class id, derived from the mapping this
    # function already uses for the labels.
    target_names = sorted(cat_to_id, key=cat_to_id.get)

    print("Precision, Recall and F1-Score...")
    # Fixed: the scikit-learn keyword is ``target_names``, not ``target_name``.
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=target_names))

    print("Confusion Matrix...")
    # Fixed: the matrix was bound to ``com`` but printed as ``cm``.
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
    

### Training

In [11]:
# Select what this cell runs: 'train' trains the model, anything else tests it.
type_ = 'train'
print('Configuring RNN model...')
config = TRNNConfig()
# Build the vocabulary file once; later runs reuse it.
if not os.path.exists(vocab_dir):
    build_vocab(train_dir, vocab_dir, config.vocab_size)
# Fixed: the list was bound to the misspelled name ``catefories``.
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
# Size the embedding table to the vocabulary actually loaded.
config.vocab_size = len(words)
model = TextRNN(config)

if type_ == 'train':
    train()
else:
    test()


Configuring RNN model...


FileNotFoundError: [Errno 2] No such file or directory: '/Users/dengkai/workspace/machine-learning/NLP/data_set/nlp/fastText/THUCNews/cnews.train.txt'

In [None]:
# Evaluate the checkpoint stored at save_path on the test split.
test()