### tensorflow 2.0 下 bilstm + attention 实现文本分类 demo

In [1]:
import os
import sys
import warnings
import pickle
import datetime
import tensorflow as tf 
import pandas as pd
import traceback
import time 
import json
import numpy as np 
from tensorflow import keras 
from tensorflow.keras import layers,Input
from tensorflow.keras.layers import Dense,LSTM,Bidirectional,Dropout,Embedding,BatchNormalization
warnings.filterwarnings("ignore")

# Load the training and test splits of the classification data from local CSV files.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
train = pd.read_csv('E:/Pycharm/calss_comment/data/train.csv')
test = pd.read_csv('E:/Pycharm/calss_comment/data/test.csv')

In [2]:
# Load a second dataset (columns 'article' / 'summarization') from the
# text_summary folder.
# NOTE(review): x_train, y_train and x_test bound here are rebound from
# `train` / `test` by the assignments that follow in this file, so these
# values appear to be unused by the rest of the pipeline — confirm.
train_df = pd.read_csv('E:/Pycharm/text_summary/data/train.csv')
test_df = pd.read_csv('E:/Pycharm/text_summary/data/test.csv')

x_train = train_df['article'].values
y_train = train_df['summarization'].values
x_test = test_df['article'].values

In [3]:
# Pick the model inputs ("text") and targets ("label") from the frames read
# from the calss_comment CSV files; the test frame only contributes inputs.
x_train = train["text"]
y_train = train['label']
x_test = test["text"]

In [6]:
def replace_abbreviations(text):
    """Lower-case each string and expand common English contractions.

    Args:
        text: iterable of strings.

    Returns:
        A list with one processed string per input item. After the
        expansions, any remaining "'s" (possessive or unlisted contraction)
        is removed.

    Note:
        Matching is plain substring replacement applied in table order, so a
        contraction is also expanded when it occurs inside a longer token.
    """
    # Order is preserved from the original chained ``replace`` calls.
    expansions = (
        ("it's", "it is"), ("i'm", "i am"), ("he's", "he is"), ("she's", "she is"),
        ("we're", "we are"), ("they're", "they are"), ("you're", "you are"),
        ("that's", "that is"), ("this's", "this is"), ("can't", "can not"),
        ("don't", "do not"), ("doesn't", "does not"), ("we've", "we have"),
        # Previously expanded to " i have" (stray leading space).
        ("i've", "i have"),
        ("isn't", "is not"), ("won't", "will not"), ("hasn't", "has not"),
        ("wasn't", "was not"), ("weren't", "were not"), ("let's", "let us"),
        ("didn't", "did not"), ("hadn't", "had not"),
        # Previously keyed on the typo "waht's", so "what's" was never expanded.
        ("what's", "what is"),
        ("couldn't", "could not"), ("you'll", "you will"), ("you've", "you have"),
    )

    texts = []
    for item in text:
        item = item.lower()
        for short_form, long_form in expansions:
            item = item.replace(short_form, long_form)
        # Drop whatever "'s" is left once the known contractions are expanded.
        texts.append(item.replace("'s", ""))

    return texts

# Remove punctuation and other non-letter characters


def clear_review(text):
    """Strip HTML line breaks and every non-letter character from each string.

    Args:
        text: iterable of strings.

    Returns:
        A list of lower-cased strings containing only the letters a-z, with
        words separated by single spaces and no leading/trailing whitespace.
    """
    # Compiled here rather than at module level because ``re`` is imported in
    # a later cell than the one defining this function.
    br_tag = re.compile(r"<br\s*/?>", re.IGNORECASE)
    texts = []
    for item in text:
        # Replace break tags with a space: deleting them outright glued the
        # neighbouring words together, and a lone "<br />" left a "br" token.
        item = br_tag.sub(" ", item)
        item = re.sub("[^a-zA-Z]", " ", item.lower())
        # split/join collapses the runs of spaces produced above.
        texts.append(" ".join(item.split()))
    return texts

# Remove stop words and lemmatize


def stemed_words(text):
    """Drop English stop words and lemmatize the remaining tokens.

    Args:
        text: iterable of strings; each is split on whitespace.

    Returns:
        A list of strings, one per input item, with the surviving tokens
        lemmatized (``pos='v'``) and re-joined with single spaces.
    """
    # A set gives constant-time membership tests; the NLTK word list would
    # otherwise be scanned once per token.
    stop_words = set(stopwords.words("english"))
    lemma = WordNetLemmatizer()
    return [
        " ".join(
            lemma.lemmatize(w, pos='v') for w in item.split() if w not in stop_words
        )
        for item in text
    ]

# Text preprocessing pipeline


def preprocess(text):
    """Run the full cleaning pipeline over an iterable of raw strings.

    Stages, in order: contraction expansion, removal of non-letter
    characters, then stop-word removal with lemmatization.

    Returns:
        The list of cleaned strings produced by the last stage.
    """
    expanded = replace_abbreviations(text)
    letters_only = clear_review(expanded)
    return stemed_words(letters_only)


In [7]:
import re
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Clean both splits with the preprocessing pipeline defined above.
train_texts = preprocess(x_train)
test_texts = preprocess(x_test)

# Vocabulary cap handed to the tokenizer as num_words.
max_features = 6000
# Train and test texts are concatenated (train first) so one tokenizer and
# one padded matrix cover both.
# NOTE(review): the vocabulary is therefore fitted on test text as well.
texts = train_texts + test_texts
# Tokenize: fit the word index on the combined corpus
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(texts)
# Convert every text to a sequence of word indices
list_tok = tok.texts_to_sequences(texts)

# Fixed length every sequence is padded/truncated to.
maxlen = 1024

seq_tok = pad_sequences(list_tok, maxlen=maxlen)

# The first len(train_texts) rows belong to the training split.
x_train = seq_tok[:len(train_texts)]

In [9]:
def one_hot_encode(raw_y, num_classes):
    """Turn a sequence of integer class ids into a one-hot matrix.

    Args:
        raw_y: sequence of integer class indices.
        num_classes: number of columns in the result; it is given explicitly
            rather than derived from the largest label.

    Returns:
        Array of shape (len(raw_y), num_classes) holding a 1 in column
        ``raw_y[i]`` of row ``i`` and 0 elsewhere.
    """
    labels = np.array(raw_y)
    n_rows = labels.shape[0]
    encoded = np.zeros((n_rows, num_classes))
    row_ids = np.arange(n_rows)
    # Fancy indexing sets exactly one cell per row.
    encoded[row_ids, labels] = 1
    return encoded
def load_sample(fn, max_seq_len, word_dict, num_classes):
    """Read a labelled CSV and encode it as fixed-length id vectors.

    Args:
        fn: path of a CSV file with 'label' and 'text' columns.
        max_seq_len: length of every encoded vector.
        word_dict: mapping from each element of a text to its integer id.
            NOTE(review): the text is indexed element by element, so for
            string cells the lookup is per character — confirm this matches
            how word_dict was built.
        num_classes: number of classes for the one-hot labels.

    Returns:
        (all_x, all_y): all_x stacks one int32 vector of length max_seq_len
        per row; all_y is the one-hot label matrix from one_hot_encode.

    Raises:
        KeyError: if a text element is missing from word_dict.
    """
    text_df = pd.read_csv(fn)
    raw_y = []
    raw_x = []
    # The encoding now runs once per row. Previously it sat outside the row
    # loop, so only the last row was encoded and all_x / all_y disagreed in
    # length.
    for label, text in zip(text_df['label'], text_df['text']):
        raw_y.append(int(label))

        # Keep at most the last max_seq_len elements of an over-long text.
        tail_start = max(len(text) - max_seq_len, 0)
        ids = [word_dict[token] for token in text[tail_start:]]

        # Short texts are left-aligned; the remainder stays zero.
        x = np.zeros(max_seq_len, dtype = np.int32)
        if ids:
            x[:len(ids)] = ids
        raw_x.append(x)

    all_x = np.array(raw_x)
    all_y = one_hot_encode(raw_y, num_classes)
    return all_x, all_y
def batch_iter(x, y, batch_size = 16):
    """Yield shuffled mini-batches of two aligned arrays.

    Both arrays are reordered with the same random permutation, then walked
    in slices of ``batch_size``; the last slice may be shorter.

    Args:
        x: array of samples, indexable by an integer index array.
        y: array of targets aligned with ``x``.
        batch_size: maximum number of samples per batch.

    Yields:
        (batch_index, num_batches, x_batch, y_batch) tuples.
    """
    total = len(x)
    # Ceiling division: a partial final batch still counts.
    n_batches = (total + batch_size - 1) // batch_size
    order = np.random.permutation(np.arange(total))
    shuffled_x, shuffled_y = x[order], y[order]
    for batch_no in range(n_batches):
        begin = batch_no * batch_size
        stop = min(begin + batch_size, total)
        yield batch_no, n_batches, shuffled_x[begin:stop], shuffled_y[begin:stop]

In [10]:
class RnnAttentionLayer(layers.Layer):
  """Additive attention pooling followed by dropout.

  Each position of the input is projected to ``attention_size`` units, passed
  through tanh and scored against a learned vector; the softmax of those
  scores weights a sum of the inputs over axis 1.
  NOTE(review): assumes inputs are (batch, time, features) — confirm.
  """

  def __init__(self, attention_size, drop_rate):
    """Store the projection width and create the output dropout layer."""
    super().__init__()
    self.attention_size = attention_size
    self.dropout = Dropout(drop_rate, name = "rnn_attention_dropout")

  def build(self, input_shape):
    """Create projection matrix, score vector and bias from the last input dim."""
    feature_dim = input_shape[-1]
    self.attention_w = self.add_weight(
      name = "atten_w",
      shape = (feature_dim, self.attention_size),
      initializer = tf.random_uniform_initializer(),
      dtype = "float32",
      trainable = True,
    )
    self.attention_u = self.add_weight(
      name = "atten_u",
      shape = (self.attention_size,),
      initializer = tf.random_uniform_initializer(),
      dtype = "float32",
      trainable = True,
    )
    self.attention_b = self.add_weight(
      name = "atten_b",
      shape = (self.attention_size,),
      initializer = tf.constant_initializer(0.1),
      dtype = "float32",
      trainable = True,
    )
    super().build(input_shape)

  def call(self, inputs, training):
    """Return the attention-weighted sum of ``inputs`` with dropout applied."""
    projected = tf.tanh(tf.tensordot(inputs, self.attention_w, axes = 1) + self.attention_b)
    # One scalar score per position, normalised over the last axis.
    scores = tf.tensordot(projected, self.attention_u, axes = 1)
    alphas = tf.expand_dims(tf.nn.softmax(scores), -1)
    context = tf.reduce_sum(alphas * inputs, axis = 1)
    return self.dropout(context, training = training)

In [11]:
class RnnLayer(layers.Layer):
  """Bidirectional LSTM that returns the full output sequence.

  A forward and a backward LSTM of ``rnn_size`` units each are wrapped in a
  Bidirectional layer whose merge mode concatenates the two outputs.
  """

  def __init__(self, rnn_size, drop_rate):
    """Build the two directional LSTMs and the wrapper around them."""
    super().__init__()
    shared = dict(return_sequences = True, dropout = drop_rate)
    forward = LSTM(rnn_size, go_backwards = False, name = "fwd_lstm", **shared)
    backward = LSTM(rnn_size, go_backwards = True, name = "bwd_lstm", **shared)
    self.bilstm = Bidirectional(
      layer = forward,
      backward_layer = backward,
      merge_mode = "concat",
      name = "bilstm",
    )
    # Alternative kept for reference (single wrapped LSTM with relu activation):
    #self.bilstm = Bidirectional(LSTM(rnn_size, activation= "relu", return_sequences = True, dropout = drop_rate))

  def call(self, inputs, training):
    """Run the bidirectional LSTM, forwarding the training flag."""
    return self.bilstm(inputs, training = training)

In [12]:
class Model(tf.keras.Model):
  """Text classifier: embedding -> BiLSTM -> attention pooling -> softmax.

  Args:
    num_classes: number of output classes (units of the final Dense layer).
    drop_rate: dropout rate shared by the LSTMs and the attention layer.
    vocab_size: input dimension of the embedding table.
    embedding_size: width of each embedding vector.
    rnn_size: total recurrent width; each LSTM direction gets rnn_size // 2
      units.
    attention_size: width of the attention projection.
  """

  def __init__(self, num_classes, drop_rate, vocab_size, embedding_size, rnn_size, attention_size):
    super().__init__()
    self.embedding_layer = Embedding(vocab_size, embedding_size, embeddings_initializer = "uniform", name = "embeding_0")
    # Each sub-layer is now built exactly once. The original created an
    # RnnLayer(rnn_size) and an attention layer and then immediately replaced
    # both; only the rnn_size // 2 pair was ever used, so that pair is kept.
    self.rnn_layer = RnnLayer(rnn_size//2, drop_rate)
    self.attention_layer = RnnAttentionLayer(attention_size, drop_rate)
    self.dense_layer = Dense(num_classes, activation = "softmax", kernel_regularizer=keras.regularizers.l2(0.001), name = "dense_1")

  def call(self, input_x, training):
    """Return class probabilities for a batch of token-id sequences.

    NOTE(review): input_x is assumed to be an integer array of shape
    (batch, sequence_length) — confirm against the caller.
    """
    x = self.embedding_layer(input_x)
    x = self.rnn_layer(x, training = training)
    x = self.attention_layer(x, training = training)
    x = self.dense_layer(x)
    return x
  

In [13]:
def early_stop(patience=0, min_delta=0, monitor='val_loss'):
    '''
    Build the callback list that stops training once the monitored metric
    stops improving.

    :param patience: forwarded to EarlyStopping as ``patience`` (how long to
        keep training without improvement before stopping)
    :param min_delta: forwarded to EarlyStopping as ``min_delta`` (smallest
        change in the metric that counts as an improvement)
    :param monitor: name of the metric to watch, 'val_loss' by default
    :return: a list holding one verbose keras EarlyStopping callback
    '''
    stopper = keras.callbacks.EarlyStopping(
        monitor=monitor,
        min_delta=min_delta,
        patience=patience,
        verbose=1
    )
    return [stopper]

In [14]:
# Build the classifier with 11 output classes; vocab_size repeats the 6000
# used as max_features for the tokenizer earlier in this file.
model = Model(11, drop_rate = 0.05, vocab_size = 6000, 
              embedding_size = 256, rnn_size = 128, attention_size = 128)
# The sparse loss takes integer class ids, so y_train is passed without
# one-hot encoding.
# NOTE(review): assumes the 'label' column holds ids in [0, 11) — confirm.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
# early_stop() is called with its defaults (patience=0, monitor='val_loss').
callbacks = early_stop()
# 30% of the training data is held out for validation, which provides the
# val_loss the early-stopping callback monitors.
model.fit(x_train, y_train, batch_size=32,epochs=5,validation_split=0.3,callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 00004: early stopping


<tensorflow.python.keras.callbacks.History at 0x17f61149208>

In [15]:
def predict_classes(x):
    """Convert model output scores into class predictions.

    # Arguments
        x: array-like of prediction scores, e.g. the output of
            ``model.predict``. The last axis holds the per-class scores, or
            a single score for a one-unit output.

    # Returns
        A numpy array of class predictions: the argmax over the last axis
        when it has more than one entry, otherwise ``x > 0.5`` cast to
        int32 (same shape as the input).
    """
    # Accept lists and other array-likes as well as numpy arrays.
    x = np.asarray(x)
    if x.shape[-1] > 1:
        return x.argmax(axis=-1)
    return (x > 0.5).astype('int32')

In [16]:
# Rows after the training portion of the padded matrix are the test samples
# (train and test texts were concatenated in that order before tokenizing).
x_test = seq_tok[len(train_texts):]
# Model outputs for every test sample (softmax scores from the final Dense layer).
y_pred = model.predict(x_test)


In [17]:
# Collapse the per-class scores into one predicted class per sample.
y_pred = predict_classes(y_pred)

In [19]:
# Write the predicted classes to a CSV file, one row per test sample.
result = pd.DataFrame()
result['result'] = y_pred
# NOTE(review): header=None presumably suppresses the header row while the
# row index is still written as the first column — confirm the expected format.
result.to_csv('lstm_att1.csv', header=None)