In [1]:
import time
start_time = time.time()
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
np.random.seed(32)
os.environ["OMP_NUM_THREADS"] = "4"
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer

import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a fixed hold-out set.

    At the end of every epoch whose zero-based index is a multiple of
    ``interval``, the attached model predicts on ``X_val`` and the score from
    ``roc_auc_score(y_val, predictions)`` is printed.

    Parameters
    ----------
    validation_data : tuple
        Two-item ``(X_val, y_val)`` pair. The empty default cannot be unpacked
        and raises ``ValueError``, so a pair must always be supplied.
    interval : int
        Evaluate every ``interval`` epochs (1 = every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class, not the parent: super(Callback, self) starts the MRO
        # lookup *after* Callback and therefore skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the ROC-AUC for this epoch if it falls on the interval.

        ``logs`` is accepted for compatibility with the callback signature but
        is not read; it defaults to ``None`` rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based; report it one-based like the Keras progress log.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))
            

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Pretrained word vectors in text format, read later when building the embedding matrix.
EMBEDDING_FILE = '../inputs/wiki.zh.vec'

# Training and validation tables are tab-separated.
train = pd.read_csv("../inputs/train.tsv", sep='\t')
test = pd.read_csv("../inputs/vali.tsv", sep='\t')

# Text column: missing entries become the placeholder "无", then everything is lower-cased.
X_train = train["内容"].fillna("无").str.lower()
X_test = test["内容"].fillna("无").str.lower()

# Label column as a plain NumPy array.
y_train = train["标签"].values

# The training frame is not needed once its text and labels are extracted.
del train

In [3]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the upper
# bound on the number of rows in the embedding matrix.
max_features=100000
# Fixed sequence length: every tokenised text is padded/truncated to this many
# tokens, and it is also the model's input length.
maxlen=800
# Width of each word vector (number of columns in the embedding matrix).
embed_size=300

In [4]:
from keras.utils.np_utils import to_categorical
# Map each raw label to an integer id; lookupTable[id] recovers the original label.
lookupTable, y_train = np.unique(y_train, return_inverse=True)
# One-hot encode the ids; the number of classes is inferred from the data.
y_train = to_categorical(y_train)

In [5]:
# Tokenise on the given punctuation set. The backslash before ']' is written as
# '\\' so the literal no longer relies on an invalid escape sequence; the
# resulting runtime string is unchanged.
tok=Tokenizer(num_words=max_features, filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n——！，。？、~@#￥%……&*（）：；《）《》“”()»〔〕-]+")
# The vocabulary is fitted on both the training and the validation texts.
tok.fit_on_texts(list(X_train)+list(X_test))
X_train=tok.texts_to_sequences(X_train)
X_test=tok.texts_to_sequences(X_test)
# Pad/truncate every sequence to exactly maxlen token ids.
x_train=pad_sequences(X_train,maxlen=maxlen)
x_test=pad_sequences(X_test,maxlen=maxlen)

# Load the pretrained vectors: one "<token> <v1> ... <v_embed_size>" row per line.
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        # Split from the right at most embed_size times so that a token which
        # itself contains a space stays in one piece as values[0].
        values = line.rstrip().rsplit(' ', embed_size)
        # Skip rows that do not carry exactly embed_size coefficients, e.g. the
        # "<vocab_size> <dim>" header line at the top of fastText .vec files.
        if len(values) != embed_size + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

word_index = tok.word_index
# Prepare the embedding matrix: row i holds the vector of the token with index i.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    # Indices at or beyond the cap have no row in the matrix.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index keep their all-zero row.
        embedding_matrix[i] = embedding_vector

In [6]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

# Where the best weights seen so far are written.
file_path = "best_model_bigru_cnn.hdf5"

# Save the model only when the validation loss reaches a new minimum.
check_point = ModelCheckpoint(
    file_path,
    monitor = "val_loss",
    mode = "min",
    save_best_only = True,
    verbose = 1,
)

# Stop training after 5 consecutive epochs without a lower validation loss.
early_stop = EarlyStopping(
    monitor = "val_loss",
    mode = "min",
    patience = 5,
)


In [7]:
def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0):
    """Build, train and reload the two-branch BiGRU / BiLSTM + Conv1D classifier.

    Both branches read the same frozen, spatially-dropped-out embedding; each is
    followed by a width-2 Conv1D and by global average and max pooling. The four
    pooled vectors are concatenated and fed to a softmax layer.

    The function also trains the model on the notebook-level ``x_train`` /
    ``y_train`` (5% held out for validation) using the notebook-level
    ``check_point`` and ``early_stop`` callbacks, then returns the checkpoint
    reloaded from ``file_path``, not the in-memory model from the last epoch.

    Parameters
    ----------
    lr : float
        Adam learning rate.
    lr_d : float
        Adam learning-rate decay.
    units : int
        Number of recurrent units per direction in the GRU and the LSTM.
    dr : float
        Spatial dropout rate applied to the embedded sequence.

    Returns
    -------
    The model loaded back from ``file_path`` after training.
    """
    inp = Input(shape = (maxlen,))
    # Size the embedding table from the pretrained matrix itself. The matrix has
    # min(max_features, vocabulary size + 1) rows, so hard-coding max_features
    # mismatches the supplied weights whenever the vocabulary is smaller.
    vocab_size, embed_dim = embedding_matrix.shape
    x = Embedding(vocab_size, embed_dim, weights = [embedding_matrix], trainable = False)(inp)
    x1 = SpatialDropout1D(dr)(x)

    # Branch 1: bidirectional GRU followed by a width-2 convolution.
    x = Bidirectional(GRU(units, return_sequences = True))(x1)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)

    # Branch 2: bidirectional LSTM followed by a width-2 convolution.
    y = Bidirectional(LSTM(units, return_sequences = True))(x1)
    y = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(y)

    avg_pool1 = GlobalAveragePooling1D()(x)
    max_pool1 = GlobalMaxPooling1D()(x)

    avg_pool2 = GlobalAveragePooling1D()(y)
    max_pool2 = GlobalMaxPooling1D()(y)

    x = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])

    # One output unit per one-hot label column instead of a hard-coded 4.
    n_classes = y_train.shape[1]
    x = Dense(n_classes, activation = "softmax")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    # NOTE(review): check_point is a shared notebook-level ModelCheckpoint; its
    # best-so-far value may persist across calls -- confirm before calling
    # build_model more than once per kernel session.
    model.fit(x_train, y_train, batch_size = 128, epochs = 30, validation_split = 0.05,
              verbose = 1, callbacks = [check_point, early_stop])
    # Return the best checkpoint written during training.
    model = load_model(file_path)
    return model
    

In [None]:
# Train with Adam at lr=1e-3 and no decay, 128 recurrent units per direction,
# and a spatial dropout rate of 0.2 on the embeddings.
model = build_model(
    lr = 1e-3,
    lr_d = 0,
    units = 128,
    dr = 0.2,
)

Train on 131365 samples, validate on 6914 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.18886, saving model to best_model_bigru_cnn.hdf5
Epoch 2/30

Epoch 00002: val_loss improved from 0.18886 to 0.16499, saving model to best_model_bigru_cnn.hdf5
Epoch 3/30

Epoch 00003: val_loss improved from 0.16499 to 0.15101, saving model to best_model_bigru_cnn.hdf5
Epoch 4/30

Epoch 00004: val_loss improved from 0.15101 to 0.12825, saving model to best_model_bigru_cnn.hdf5
Epoch 5/30

Epoch 00005: val_loss did not improve from 0.12825
Epoch 6/30

Epoch 00006: val_loss did not improve from 0.12825
Epoch 7/30

Epoch 00007: val_loss did not improve from 0.12825
Epoch 8/30

Epoch 00008: val_loss improved from 0.12825 to 0.11701, saving model to best_model_bigru_cnn.hdf5
Epoch 9/30

Epoch 00009: val_loss did not improve from 0.11701
Epoch 10/30

Epoch 00010: val_loss did not improve from 0.11701
Epoch 11/30