In [2]:
import numpy as np
np.random.seed(42)
import pandas as pd
import string
import re

import gensim
from collections import Counter
import pickle

import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNLSTM, CuDNNGRU
from keras.preprocessing import text, sequence

from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda

import warnings
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords

import os
os.environ['OMP_NUM_THREADS'] = '4'

import gc
from keras import backend as K
from sklearn.model_selection import KFold

import time

# Set of NLTK English stop words.
# NOTE(review): not referenced by any other cell visible in this notebook, and
# the text columns read below are Chinese — confirm it is still needed.
eng_stopwords = set(stopwords.words("english"))

In [2]:
# Path of the pre-trained word-vector text file (parsed line by line further down).
EMBEDDING_FILE = '../inputs/wiki.zh.vec'

# Training and validation/submission frames, both tab-separated.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ("../inputs/train.tsv", "../inputs/vali.tsv")
)

In [3]:
# Raw string labels of the training set (integer-encoded in a later cell).
y_train = train["标签"].values

# Text column of each frame: missing entries replaced by the placeholder "无",
# then lower-cased.
X_train, X_test = (
    frame["内容"].fillna("无").str.lower()
    for frame in (train, test)
)

In [4]:
# Text-pipeline hyper-parameters:
#   max_features - vocabulary cap passed to the tokenizer (num_words)
#   maxlen       - length every sequence is padded/truncated to
#   embed_size   - dimensionality of the word vectors
max_features, maxlen, embed_size = 100000, 800, 300

In [5]:
import numpy as np
# Integer-encode the string labels.
#   lookupTable - sorted array of the distinct label strings
#   y_train     - for every training row, the index of its label in lookupTable
# The encoding is derived from the untouched label column rather than from
# y_train itself, so re-running this cell gives the same result instead of
# re-encoding the integer codes and losing the label strings.
lookupTable, y_train = np.unique(train["标签"].values, return_inverse=True)

In [6]:
lookupTable

array(['人类作者', '机器作者', '机器翻译', '自动摘要'], dtype=object)

In [7]:
# NOTE(review): this cell repeats the hyper-parameter cell that appears earlier
# in the notebook with the same values; one of the two can probably be dropped.
embed_size = 300        # dimensionality of the word vectors
maxlen = 800            # padded/truncated sequence length
max_features = 100000   # vocabulary cap passed to the tokenizer

In [8]:
# Build the vocabulary over train + test text and turn every document into a
# fixed-length integer sequence.
#
# X_train / X_test are deliberately NOT overwritten: they keep holding the raw
# text, so this cell can be re-run safely (previously a second run would have
# fitted the tokenizer on lists of integers) and each name keeps one meaning.
#
# NOTE(review): the Keras Tokenizer splits on whitespace by default — this
# presumably relies on the Chinese text being pre-segmented with spaces; confirm.
tok = text.Tokenizer(num_words=max_features)
tok.fit_on_texts(list(X_train) + list(X_test))

# Padded/truncated integer matrices with `maxlen` columns.
x_train = sequence.pad_sequences(tok.texts_to_sequences(X_train), maxlen=maxlen)
x_test = sequence.pad_sequences(tok.texts_to_sequences(X_test), maxlen=maxlen)

In [9]:
# Parse the word-vector text file into {token: float32 vector}.
#
# Each data line is "<token> <v1> ... <v_embed_size>". Splitting from the right
# with at most `embed_size` splits keeps a token intact even if it contains a
# space. Lines that do not yield exactly `embed_size` values are skipped; this
# covers the "<vocab_size> <dim>" header that .vec files start with (previously
# stored as a bogus one-element vector) and any truncated row.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ', embed_size)
        if len(values) != embed_size + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [10]:
word_index = tok.word_index

# Embedding matrix: one row per token index, capped at max_features rows.
# Rows start as zeros and stay zero for tokens that have no pre-trained vector.
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for token, row in word_index.items():
    if row < max_features:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

In [11]:
# Model: frozen pre-trained embedding -> spatial dropout -> bidirectional GRU
# -> 1-D convolution -> concatenated global average/max pooling -> softmax.

# One output unit per distinct label instead of a hard-coded 4.
num_classes = len(lookupTable)

sequence_input = Input(shape=(maxlen, ))
# input_dim is read from the matrix itself: the matrix has
# min(max_features, vocabulary size + 1) rows, so passing max_features would
# mismatch the supplied weights whenever the vocabulary is smaller than the cap.
x = Embedding(embedding_matrix.shape[0], embed_size,
              weights=[embedding_matrix], trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
preds = Dense(num_classes, activation="softmax")(x)
model = Model(sequence_input, preds)
# `Adam` was never imported on its own; reach it through the already imported
# `optimizers` module so the cell runs on a fresh kernel.
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Adam(lr=1e-3),
              metrics=['accuracy'])

In [13]:
from keras.utils.np_utils import to_categorical

# One-hot encode the integer class ids; the number of classes is inferred
# from the data (num_classes is left at its default of None).
# NOTE(review): y_train is overwritten in place, so this cell should only be
# run once per kernel session.
y_train = to_categorical(y_train)

In [14]:
y_train

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [21]:
# Training-loop settings.
batch_size, epochs = 128, 30

# Hold out 10% of the padded training data for validation (fixed seed).
split = train_test_split(x_train, y_train, train_size=0.9, random_state=233)
X_tra, X_val, y_tra, y_val = split



In [25]:
from sklearn.metrics import roc_auc_score
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair. ``X_val`` is fed to
            ``self.model.predict`` and ``y_val`` is compared with the
            predictions via ``roc_auc_score``. It must be supplied: the empty
            default cannot be unpacked into two values.
        interval: evaluate only on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Start the lookup at this class so Callback.__init__ actually runs;
        # ``super(Callback, self)`` begins *after* Callback in the MRO and
        # therefore skipped the base-class initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the ROC-AUC on the stored validation data.

        Args:
            epoch: zero-based epoch index supplied by Keras.
            logs: metrics dict supplied by Keras; not used here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Reported epoch number is one-based.
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [26]:
# ModelCheckpoint and EarlyStopping are not imported anywhere else in the
# notebook; import them here so this cell runs on a fresh kernel.
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Where the best weights (by validation accuracy) are written.
filepath = "../inputs/weights_base.best.hdf5"

# Save the weights only when val_acc improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Stop once val_acc has not improved for 5 consecutive epochs.
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
# Despite its name, this callback reports ROC-AUC on the validation split.
f1_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

callbacks_list = [f1_val, checkpoint, early]

In [27]:
# Train on the 90% split and validate on the held-out 10% after each epoch;
# the callbacks handle ROC-AUC reporting, checkpointing and early stopping.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
)

Train on 124451 samples, validate on 13828 samples
Epoch 1/30

 ROC-AUC - epoch: 1 - score: 0.997784

Epoch 00001: val_acc improved from -inf to 0.96854, saving model to ../inputs/weights_base.best.hdf5
Epoch 2/30

 ROC-AUC - epoch: 2 - score: 0.997582

Epoch 00002: val_acc improved from 0.96854 to 0.96898, saving model to ../inputs/weights_base.best.hdf5
Epoch 3/30

 ROC-AUC - epoch: 3 - score: 0.998180

Epoch 00003: val_acc improved from 0.96898 to 0.97700, saving model to ../inputs/weights_base.best.hdf5
Epoch 4/30

 ROC-AUC - epoch: 4 - score: 0.998162

Epoch 00004: val_acc did not improve from 0.97700
Epoch 5/30

 ROC-AUC - epoch: 5 - score: 0.998489

Epoch 00005: val_acc improved from 0.97700 to 0.97932, saving model to ../inputs/weights_base.best.hdf5
Epoch 6/30

 ROC-AUC - epoch: 6 - score: 0.998528

Epoch 00006: val_acc improved from 0.97932 to 0.98062, saving model to ../inputs/weights_base.best.hdf5
Epoch 7/30

 ROC-AUC - epoch: 7 - score: 0.998137

Epoch 00007: val_acc did 

<keras.callbacks.History at 0x7f2a0d079400>

In [28]:
# Restore the checkpointed weights before scoring the test set.
model.load_weights(filepath)

print('Predicting....')
# Per-class probabilities for every test document.
y_pred = model.predict(
    x_test,
    batch_size=1024,
    verbose=1,
)

Predicting....


In [32]:
# Index of the highest-scoring class for each test row.
y_p = y_pred.argmax(axis=1)

In [31]:
lookupTable

array(['人类作者', '机器作者', '机器翻译', '自动摘要'], dtype=object)

In [33]:
y_p.shape

(58567,)

In [35]:
# Map each integer class id to its label string (position in lookupTable -> label).
my_dict = dict(enumerate(lookupTable))

In [36]:
my_dict

{0: '人类作者', 1: '机器作者', 2: '机器翻译', 3: '自动摘要'}

In [42]:
# Translate predicted class ids back to label strings by indexing the label
# array directly. This yields the same labels as looking each id up in the
# id -> label dict (which is built from lookupTable), without a per-element
# Python call, and it also works when there are no predictions, where
# np.vectorize cannot infer an output type.
test['标签'] = lookupTable[y_p]

In [44]:
# Write the submission: id and predicted label only, no header row, no index.
test.to_csv(
    '../inputs/sub_bilistmcnn_1.csv',
    columns=['id', '标签'],
    header=False,
    index=False,
)