In [2]:
import time
start_time = time.time()
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
np.random.seed(32)
os.environ["OMP_NUM_THREADS"] = "12"
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras import backend as K
from keras.engine import InputSpec, Layer

import logging
from sklearn.metrics import roc_auc_score
from tensorflow.keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints ROC-AUC on a held-out set every `interval` epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``X_val`` is passed to
            ``self.model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: evaluate when ``epoch % interval == 0`` (``epoch`` is the
            zero-based index Keras passes to ``on_epoch_end``).

    Raises:
        ValueError: if ``validation_data`` is not a two-item ``(X_val, y_val)`` pair.

    NOTE(review): the prediction is handed to ``roc_auc_score`` as-is, so this
    presumably expects a single-output model — confirm before attaching it to a
    model with several output heads.
    """

    def __init__(self, validation_data=(), interval=1):
        # The original `super(Callback, self).__init__()` started the MRO lookup
        # *after* Callback and therefore never ran Callback.__init__.
        super().__init__()

        if len(validation_data) != 2:
            raise ValueError("validation_data must be an (X_val, y_val) pair")

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print ROC-AUC for the stored validation data.

        `logs` is accepted for Keras API compatibility and is not used.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))
            

Using TensorFlow backend.


In [3]:
# Location of the pre-trained word vectors (machine-specific absolute path).
EMBEDDING_FILE = '/home/xq/data/embed/wiki.zh.vec'


def _prepare_content(frame):
    """Return the frame's "content" column with missing values replaced by "无" and lower-cased."""
    text = frame["content"]
    text = text.fillna("无")
    return text.str.lower()


# Raw splits, read in the same order as before: train, test, validation.
train = pd.read_csv("../inputs/train.csv")
test = pd.read_csv("../inputs/testa.csv")
val = pd.read_csv("../inputs/vali.csv")

# Cleaned text for each split.
X_train = _prepare_content(train)
x_val = _prepare_content(val)
X_test = _prepare_content(test)

In [4]:
# Names of the label columns: every column of `train` after the first two.
# NOTE(review): presumably the first two columns are an id and "content" — confirm against the CSV.
columns = train.columns.tolist()[2:]

In [6]:
# Use the tf.keras utility so this cell agrees with the rest of the notebook,
# which imports everything else from tensorflow.keras.
from tensorflow.keras.utils import to_categorical
from typing import List
def get_y(df:pd.DataFrame, cols:List[str]=columns, label_offset:int=2, num_classes:int=4) -> List[np.ndarray]:
    """Build one one-hot target array per label column.

    Args:
        df: frame holding the label columns.
        cols: label column names to encode; defaults to the module-level
            ``columns`` list as it was when this function was defined.
        label_offset: added to the raw labels before encoding so they become
            class indices. The default of 2 together with 4 classes implies raw
            labels in -2..1 — TODO confirm against the data.
        num_classes: width of each one-hot encoding.

    Returns:
        A list with one array per entry of ``cols``, in the same order, each
        the ``to_categorical`` encoding of that column's shifted labels.
    """
    return [
        to_categorical(df[col].values + label_offset, num_classes=num_classes)
        for col in cols
    ]

In [7]:
# One one-hot target array per label column, built from the training frame.
y_train = get_y(train)

In [8]:
# Same encoding for the validation frame.
# NOTE(review): the visible training call uses validation_split on the training
# data and does not pass y_val to model.fit.
y_val = get_y(val)

In [9]:
# Vocabulary cap: passed to Tokenizer(num_words=...) and used to bound the embedding matrix.
max_features=100000
# Every tokenized text is padded/truncated to this many ids (also the model's input length).
maxlen=200
# Width of each embedding row; assumed to equal the vector size stored in EMBEDDING_FILE — TODO confirm.
embed_size=300

In [10]:
# Fit the vocabulary on the train and test text together, capped by max_features.
# NOTE(review): Keras' Tokenizer splits on whitespace by default; presumably
# "content" is already word-segmented — confirm.
tok=Tokenizer(num_words=max_features)
tok.fit_on_texts(list(X_train)+list(X_test))
# X_train / X_test are rebound from raw text to integer sequences here, so
# re-running this cell without re-running the loading cell would feed
# sequences, not text, back into the tokenizer.
X_train=tok.texts_to_sequences(X_train)
X_test=tok.texts_to_sequences(X_test)
# Pad/truncate each sequence to maxlen ids.
x_train=pad_sequences(X_train,maxlen=maxlen)
x_test=pad_sequences(X_test,maxlen=maxlen)
# NOTE(review): x_val is neither tokenized nor padded in this cell.

In [11]:
# Parse the pre-trained vectors into {token: float32 vector of length embed_size}.
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line_no, line in enumerate(f):
        values = line.rstrip().split(' ')
        # Text-format vector files (presumably fastText here, given the .vec
        # name) may start with a "<vocab_size> <dim>" header. The original loop
        # stored it as a one-element "vector", which would be broadcast across
        # a whole matrix row if that token appeared in the vocabulary.
        if line_no == 0 and len(values) == 2 and values[1].isdigit():
            if int(values[1]) != embed_size:
                raise ValueError(
                    "{} holds {}-d vectors but embed_size is {}".format(
                        EMBEDDING_FILE, values[1], embed_size))
            continue
        # A usable row is a token followed by embed_size numbers; skip anything shorter.
        if len(values) <= embed_size:
            continue
        # Take the vector from the right so a token that itself contains a
        # space no longer breaks the float conversion.
        word = ' '.join(values[:-embed_size])
        coefs = np.asarray(values[-embed_size:], dtype='float32')
        embeddings_index[word] = coefs
if not embeddings_index:
    # Fail loudly instead of silently training on an all-zero embedding matrix.
    raise ValueError("no {}-d vectors could be read from {}".format(embed_size, EMBEDDING_FILE))
word_index = tok.word_index
# Prepare the embedding matrix: row i holds the vector for the word with tokenizer id i.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    # Ids beyond the matrix get no row (equivalent to the max_features cap).
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index stay all-zeros.
        embedding_matrix[i] = embedding_vector

In [12]:
from  tensorflow.keras.optimizers import Adam, RMSprop
from  tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from  tensorflow.keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

# Where the best model (lowest val_loss) is written during training.
file_path = "../ckpt/best_model_bigru_cnn.hdf5"
# Create the checkpoint directory up front: depending on the Keras version the
# save does not create it, and the failure would only surface after a full
# epoch of training.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
# Stop once val_loss has failed to improve for 5 consecutive epochs.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)


In [13]:
def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0, classes=20):
    """Build, train and reload the BiGRU/BiLSTM + Conv1D multi-head classifier.

    Despite its name this function also trains: it fits on the module-level
    ``x_train`` / ``y_train`` with the module-level ``check_point`` and
    ``early_stop`` callbacks, then returns the model reloaded from ``file_path``
    (the checkpoint with the best ``val_loss``).

    Args:
        lr: Adam learning rate.
        lr_d: Adam learning-rate decay.
        units: hidden units per direction in the GRU and LSTM layers.
        dr: SpatialDropout1D rate applied to the embedded input.
        classes: number of 4-way softmax output heads; should equal the number
            of arrays in ``y_train``.

    Returns:
        The model loaded from ``file_path`` after training.

    NOTE(review): validation uses ``validation_split=0.05`` of the training
    data; the separately loaded validation set is not used here.
    """
    # Size the embedding layer from the matrix itself. The matrix has
    # min(max_features, vocab_size + 1) rows, so hard-coding max_features made
    # the pre-trained weights mismatch the layer for small vocabularies.
    vocab_rows, embed_dim = embedding_matrix.shape

    inp = Input(shape = (maxlen,))
    embedded = Embedding(vocab_rows, embed_dim, weights = [embedding_matrix], trainable = False)(inp)
    embedded = SpatialDropout1D(dr)(embedded)

    # Two parallel recurrent branches over the same embedded sequence, each
    # followed by a width-2 convolution.
    gru_feat = Bidirectional(GRU(units, return_sequences = True))(embedded)
    gru_feat = Conv1D(128, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(gru_feat)

    lstm_feat = Bidirectional(LSTM(units, return_sequences = True))(embedded)
    lstm_feat = Conv1D(128, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(lstm_feat)

    # Average- and max-pool each branch over time, then join the four vectors.
    pooled = concatenate([
        GlobalAveragePooling1D()(gru_feat),
        GlobalMaxPooling1D()(gru_feat),
        GlobalAveragePooling1D()(lstm_feat),
        GlobalMaxPooling1D()(lstm_feat),
    ])

    # One independent 4-way softmax head per label column.
    outputs = [Dense(4, activation = "softmax")(pooled) for _ in range(classes)]

    model = Model(inputs = inp, outputs = outputs)
    model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    model.fit(x_train, y_train, batch_size = 128, epochs = 30, validation_split = 0.05,
              verbose = 1, callbacks = [check_point, early_stop])
    # Return the best checkpoint rather than the weights from the last epoch.
    model = load_model(file_path)
    return model
    

In [None]:
# Build and train with Adam lr=1e-3, no decay, 128 recurrent units per direction
# and 0.2 spatial dropout; `model` is the checkpoint reloaded from disk.
model = build_model(lr = 1e-3, lr_d = 0, units = 128, dr = 0.2)

Train on 99750 samples, validate on 5250 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 13.71723, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 2/30

Epoch 00002: val_loss improved from 13.71723 to 11.98983, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 3/30

Epoch 00003: val_loss improved from 11.98983 to 11.46877, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 4/30



Epoch 00004: val_loss improved from 11.46877 to 11.23324, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 5/30

Epoch 00005: val_loss improved from 11.23324 to 11.09223, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 6/30

Epoch 00006: val_loss improved from 11.09223 to 10.75307, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 7/30

Epoch 00007: val_loss improved from 10.75307 to 10.66794, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 8/30



Epoch 00008: val_loss improved from 10.66794 to 10.56068, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 9/30

Epoch 00009: val_loss improved from 10.56068 to 10.47887, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 10/30

Epoch 00010: val_loss did not improve from 10.47887
Epoch 11/30

Epoch 00011: val_loss improved from 10.47887 to 10.34028, saving model to ../ckpt/best_model_bigru_cnn.hdf5
Epoch 12/30
 3456/99750 [>.............................] - ETA: 12:08 - loss: 10.1762 - dense_loss: 0.3454 - dense_1_loss: 0.4185 - dense_2_loss: 0.4251 - dense_3_loss: 0.3871 - dense_4_loss: 0.6442 - dense_5_loss: 0.1610 - dense_6_loss: 0.3493 - dense_7_loss: 0.7649 - dense_8_loss: 0.4791 - dense_9_loss: 0.6942 - dense_10_loss: 0.5635 - dense_11_loss: 0.4894 - dense_12_loss: 0.6346 - dense_13_loss: 0.5214 - dense_14_loss: 0.7361 - dense_15_loss: 0.6084 - dense_16_loss: 0.6374 - dense_17_loss: 0.3758 - dense_18_loss: 0.5258 - dense_19_loss: 0.4151 - dense_acc: 0.8819 - dense_1_a