In [2]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import gc
import re
import pickle
import warnings
warnings.filterwarnings('ignore')

from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from keras import optimizers
from keras.layers.normalization import BatchNormalization
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from keras.models import Sequential, Model
from keras.layers import Input, Dense, CuDNNLSTM, Bidirectional, Embedding, CuDNNGRU, Conv1D, MaxPooling1D
from keras.layers import Flatten, PReLU, Dropout, BatchNormalization, SpatialDropout1D, concatenate
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.engine.topology import Layer 
from keras.utils.training_utils import multi_gpu_model

# Expose only the GPU with index 1 to TensorFlow.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
# Grow GPU memory on demand instead of reserving all of it up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# Register the session with Keras; without this Keras lazily creates its own
# default session and the allow_growth option above is never applied.
K.set_session(session)

Using TensorFlow backend.


In [1]:
# Directory locations used throughout the notebook (all relative to the
# notebook's working directory).
WEIGHT_PATH = './weights/'              # per-fold model checkpoints
FEAT_PATH = './features/'               # generated feature files
DATA_PATH = '../data/data_set_0926/'    # raw competition tables

In [3]:
# Labelled invitation records (they carry the answered/not-answered column).
invite_info = pd.read_table(os.path.join(DATA_PATH, 'invite_info_0926.txt'), header=None)
invite_info.columns = ['问题ID','用户ID','邀请创建时间','邀请是否被回答']

# Unlabelled invitation records, evaluation set 1.
invite_info_evaluate_A = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_1_0926.txt'), header=None)
invite_info_evaluate_A.columns =  ['问题ID','用户ID','邀请创建时间']

# Unlabelled invitation records, evaluation set 2.
invite_info_evaluate_B = pd.read_table(os.path.join(DATA_PATH, 'invite_info_evaluate_2_0926.txt'), header=None)
invite_info_evaluate_B.columns =  ['问题ID','用户ID','邀请创建时间']

# Question table.
question_info = pd.read_table(os.path.join(DATA_PATH, 'question_info_0926.txt'), header=None)
question_info.columns = ['问题ID',  '问题创建时间' , '问题标题的单字编码序列' , '问题标题的切词编码序列' , '问题描述的单字编码序列',  '问题描述的词编码序列' , '问题绑定的话题ID']

# Member (user) table.
member_info = pd.read_table(os.path.join(DATA_PATH, 'member_info_0926.txt'), header=None)
member_info.columns = ['用户ID','性别','创作关键词的编码序列','创作数量等级','创作热度等级','注册类型','注册平台','访问频率','用户二分类特征A','用户二分类特征B','用户二分类特征C','用户二分类特征D','用户二分类特征E','用户分类特征A','用户分类特征B','用户分类特征C','用户分类特征D','用户分类特征E','用户的盐值分数','用户关注的话题','用户感兴趣的话题']

# Split the interest column on ',' and ':'; even positions go to the *_T
# column and odd positions to the *_score column.
# NOTE(review): presumably the raw format is 'topic:score,topic:score,...' — confirm.
tmp = member_info['用户感兴趣的话题'].apply(lambda x: re.split('[,:]',x))
member_info['用户感兴趣的话题_T'] = tmp.apply(lambda x : ','.join(x[::2]))
member_info['用户感兴趣的话题_score'] = tmp.apply(lambda x : ','.join(x[1::2]))

# When True, evaluation set 1 is appended after evaluation set 2.
oversample = False

if oversample:
    data = pd.concat([invite_info, invite_info_evaluate_B, invite_info_evaluate_A], axis=0)
else:
    data = pd.concat([invite_info, invite_info_evaluate_B], axis=0)

# Attach question and member attributes to every invitation row.
data = data.merge(question_info, on='问题ID', how='left').merge(member_info, on='用户ID', how='left')

# Free the source tables; only `data` and `invite_info` are kept.
del question_info ,member_info, tmp, invite_info_evaluate_A, invite_info_evaluate_B
gc.collect()

In [9]:
# `time` is not part of the notebook's import cell, so import it here; the
# timing calls below would otherwise raise NameError on a fresh kernel.
import time


def _parse_id_series(raw):
    """Convert a comma-separated string of prefixed ids into a list of ints.

    Every token has its first character dropped before being passed to `int`.
    """
    # NOTE(review): the first character is dropped unconditionally, so a
    # placeholder token such as '-1' would be parsed as 1 — confirm how
    # missing values are encoded in the raw tables.
    return [int(token[1:]) for token in raw.split(',')]


tic = time.time()
data_new = pd.DataFrame()
data_new['author_id'] = data['用户ID']
data_new['question_id'] = data['问题ID']
data_new['label'] = data['邀请是否被回答']
data_new['title_w_series'] = data['问题标题的切词编码序列'].apply(_parse_id_series)
data_new['desc_w_series'] = data['问题描述的词编码序列'].apply(_parse_id_series)
data_new['topic_attent'] = data['用户关注的话题'].apply(_parse_id_series)
data_new['topic_interest'] = data['用户感兴趣的话题_T'].apply(_parse_id_series)
data_new['topic'] = data['问题绑定的话题ID'].apply(_parse_id_series)
# Drop the wide merged frame before rebinding `data` to the compact one.
del data
gc.collect()
data = data_new
print("Used time: %d s" % (time.time()-tic))

CPU times: user 7min 44s, sys: 30.9 s, total: 8min 14s
Wall time: 8min 21s


In [13]:
def _read_vector_table(file_name):
    """Load a two-column vector file from DATA_PATH as an ('id', 'embed') frame.

    The id loses its first character and becomes an int; the vector string is
    split on single spaces into a list of floats.
    """
    table = pd.read_table(os.path.join(DATA_PATH, file_name), header=None)
    table.columns = ['id','embed']
    table['id'] = table['id'].apply(lambda raw_id: int(raw_id[1:]))
    table['embed'] = table['embed'].apply(lambda raw_vec: [float(part) for part in raw_vec.split(' ')])
    return table


# Pre-trained vectors for word tokens and for topics.
word = _read_vector_table('word_vectors_64d.txt')
topic = _read_vector_table('topic_vectors_64d.txt')

In [16]:
# Upper bound on the number of ids kept per sequence.
max_doc_size = 128
# Sequence-valued columns fed to the model, in input order.
seq_feat = ['topic_attent', 'topic_interest', 'title_w_series', 'desc_w_series', 'topic']

# Only rewrite a column when at least one of its sequences is over the cap;
# the leading `max_doc_size` ids are the ones kept.
for feat in seq_feat:
    longest = data[feat].apply(len).max()
    if longest > max_doc_size:
        data[feat] = data[feat].apply(lambda seq: seq[:max_doc_size])

In [17]:
# One (name, in_dim, seq_length) triple per sequence feature. in_dim is the
# row count of the matching vector table plus one extra row for index 0.
seq_embed_cnt = []
for feat in seq_feat:
    vocab_rows = len(topic) if 'topic' in feat else len(word)
    seq_embed_cnt.append((feat, vocab_rows + 1, data[feat].apply(len).max()))
print(seq_embed_cnt)

# Embedding matrices: an all-zero row at index 0 followed by the table rows
# in file order.
# NOTE(review): this assumes row k of each table belongs to id k+1 — confirm
# the vector files are sorted and contiguous.
embed_weights = {
    name: np.array([[0] * 64] + [list(vec) for vec in table['embed'].values])
    for name, table in (('word', word), ('topic', topic))
}

del word, topic
gc.collect()

[('topic_attent', 100001, 100),
 ('topic_interest', 100001, 10),
 ('title_w_series', 1762830, 38),
 ('desc_w_series', 1762830, 128),
 ('topic', 100001, 13)]

In [21]:
# The labelled invitations were concatenated first, so they occupy the
# leading rows; everything after them is the evaluation part.
n_labelled = len(invite_info)
train, test = data[:n_labelled], data[n_labelled:]
print(train.shape)
print(test.shape)

del data,data_new, invite_info
gc.collect()

(9489162, 8)
(1141718, 8)


In [23]:
def _frame_to_arrays(frame):
    """Return {column name: numpy array} for a DataFrame."""
    columns = frame.to_dict(orient='list')
    # Replace the lists one at a time so each list can be released as soon
    # as its array exists.
    for name in columns.keys():
        columns[name] = np.array(columns[name])
    return columns


# {key:array}
train = _frame_to_arrays(train)
test = _frame_to_arrays(test)

In [24]:
## padding
# Bring every sequence feature to its fixed length in both splits.
for name, _, width in seq_embed_cnt:
    print(name)
    for split in (train, test):
        split[name] = pad_sequences(split[name], maxlen=width)
    gc.collect()

topic_attent
topic_interest
title_w_series
desc_w_series
topic
CPU times: user 12min 2s, sys: 34.4 s, total: 12min 37s
Wall time: 12min 35s


In [25]:
import collections
import collections.abc


class DictWrapper(collections.abc.Sequence):
    """Sequence view over a dict of index-aligned, equally long arrays.

    `len()` reports the length of the first value in the dict, and indexing
    applies the same index to every value, returning a new DictWrapper.

    The base class is `collections.abc.Sequence`; the old `collections.Sequence`
    alias was removed in Python 3.10.
    """

    def __init__(self, d):
        # d: mapping from column name to an indexable array-like.
        self.d = d

    def __len__(self):
        # All values are assumed to share one length, so the first one is
        # representative; an empty mapping has length 0.
        first = next(iter(self.d.values()), None)
        return 0 if first is None else len(first)

    def __getitem__(self, position):
        # `position` may be anything the wrapped arrays accept as an index
        # (an int, a slice, an index array, ...).
        return DictWrapper({k: v[position] for k, v in self.d.items()})
    
# Wrap both column dicts so they can be measured with len() and indexed
# with an index array as a unit.
train, test = DictWrapper(train), DictWrapper(test)

## DNN_LSTM

In [27]:
class Attention(Layer):
    """Attention pooling over the step axis of a 3-D input.

    Each step is scored as tanh(x . W + b); the scores are exponentiated,
    optionally masked, normalised to sum to one over the steps, and used to
    take a weighted sum of the input over axis 1.

    Arguments:
        step_dim: number of steps (size of axis 1) of the input.
        W_regularizer, b_regularizer: optional regularizers for W and b.
        W_constraint, b_constraint: optional constraints for W and b.
        bias: whether to add the per-step bias b.

    Input shape:  (batch, steps, features)
    Output shape: (batch, features)
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (one weight per feature) and, if enabled, b (one per step)."""
        assert len(input_shape) == 3

        # Keyword arguments are used so the call does not depend on the
        # legacy positional (shape-first) form of add_weight.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zeros',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The step axis is reduced away, so no mask is passed downstream.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (batch * steps, features), project onto W, then fold
        # back to (batch, steps) to get one raw score per step.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite when every step is masked out.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [28]:
class Position_Embedding(Layer):
    """Sinusoidal position encoding that is added to or concatenated with the input.

    Arguments:
        size: width of the position encoding. Ignored in 'sum' mode (and when
            None), where the width of the input's last axis is used instead.
        mode: 'sum' adds the encoding to the input; 'concat' places the
            encoding in front of the input along the last axis.

    Input shape: (batch, steps, features)
    """

    def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size
        self.mode = mode
        super(Position_Embedding, self).__init__(**kwargs)

    def call(self, x):
        # Only the size defaulting belongs under this condition. Previously
        # the whole body was nested here, so mode='concat' with an explicit
        # size fell through and returned None.
        if (self.size is None) or (self.mode == 'sum'):
            self.size = int(x.shape[-1])

        # Frequencies 1 / 10000^(2j / size) for j in [0, size // 2).
        # `//` keeps the count an integer; it matches `/` for even sizes.
        position_j = 1. / K.pow(10000.,
                                2 * K.arange(self.size // 2, dtype='float32') / self.size)
        position_j = K.expand_dims(position_j, 0)
        # Step index 0, 1, 2, ... for every sample, built from a cumulative
        # sum of ones so it follows the dynamic batch and step sizes.
        position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)

        if self.mode == 'sum':
            return position_ij + x
        elif self.mode == 'concat':
            return K.concatenate([position_ij, x], 2)
        raise ValueError("mode must be 'sum' or 'concat', got %r" % (self.mode,))

    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2]+self.size)

In [29]:
def DNN_LSTM(seq_embed_cnt, seq_embed_size=64, embed_weights=None):
    """Build the multi-input sequence model ending in one sigmoid output.

    Each sequence feature gets its own branch: embedding -> position encoding
    -> spatial dropout -> bidirectional CuDNNGRU, followed by an attention
    pooling and a set of Conv1D + max-pool branches. Despite the function
    name, the recurrent layer is a GRU.

    Arguments:
        seq_embed_cnt: iterable of (feature name, embedding input dim,
            sequence length) triples, one per model input.
        seq_embed_size: embedding width.
        embed_weights: dict with 'word' and 'topic' embedding matrices, used
            frozen; features whose name contains 'topic' use the 'topic'
            matrix. When None, embeddings are randomly initialised and
            trainable.

    Returns:
        An uncompiled keras Model whose inputs follow the order of
        `seq_embed_cnt`.
    """

    # sequence embedding
    inp_seq_embed = []
    out_seq_embed = []
    for feat_name, inp_embed_dim, seq_length in seq_embed_cnt:
        inp = Input(shape=(seq_length,))
        inp_seq_embed.append(inp)
        if embed_weights is None:
            # No pre-trained vectors supplied: learn the embedding instead of
            # failing on a None lookup.
            x = Embedding(inp_embed_dim, seq_embed_size)(inp)
        else:
            weights = (embed_weights['topic'] if 'topic' in feat_name else embed_weights['word'])
            x = Embedding(inp_embed_dim, seq_embed_size, weights=[weights], trainable=False)(inp)
        x = Position_Embedding()(x)
        x = SpatialDropout1D(0.1)(x)
        x = Bidirectional(CuDNNGRU(32, return_sequences=True))(x)
        atten_1 = Attention(seq_length)(x)
        convs = []
        # NOTE(review): with the default 'valid' padding the conv output has
        # seq_length - fsz + 1 steps, so seq_length must be at least the
        # largest kernel size (10).
        filter_sizes = [2, 4, 6, 10]
        for fsz in filter_sizes:
            l_conv = Conv1D(filters=seq_length, kernel_size=fsz, activation='relu')(x)
            # Pool over the full conv output length, i.e. max over time.
            l_pool = MaxPooling1D((seq_length - fsz + 1,))(l_conv)
            l_pool = Flatten()(l_pool)
            convs.append(l_pool)
        text_cnn = concatenate(convs, axis=1)
        out_seq_embed.extend([atten_1, text_cnn])

    # concat
    conc = concatenate(out_seq_embed)
    conc = Dense(256)(conc)
    conc = BatchNormalization()(conc)
    conc = PReLU()(conc)
    conc = Dropout(0.2)(conc)
    conc = Dense(128)(conc)
    conc = BatchNormalization()(conc)
    out = Dense(1, activation="sigmoid")(conc)
    model = Model(inputs=inp_seq_embed, outputs=out)

    return model

In [30]:
class DataSequence(Sequence):
    """Keras Sequence that serves consecutive mini-batches of the sequence features.

    `x` must expose a `.d` mapping from feature name to array and support
    `len()`; `y` must accept an index array. Each item is
    ([one array per feature in `seq_embed_feat`], labels).
    """

    def __init__(self, x, y, seq_embed_feat, batch_size=128):
        self.x, self.y = x, y
        self.seq_embed_feat = seq_embed_feat
        self.batch_size = batch_size
        # Resolve the feature arrays once, in model-input order.
        self.x_seq_embed = []
        for feat in seq_embed_feat:
            self.x_seq_embed.append(x.d[feat])

    def __len__(self):
        # Number of batches, counting a final partial batch.
        n_batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(n_batches)

    def __getitem__(self, idx):
        first = idx * self.batch_size
        last = min((idx + 1) * self.batch_size, len(self.x))
        rows = np.arange(first, last)
        features = [column[rows] for column in self.x_seq_embed]
        labels = self.y[rows]
        return features, labels

In [31]:
class MetricsCallback(Callback):
    """Print train/validation ROC-AUC after each epoch and checkpoint the best model.

    Weights are written to WEIGHT_PATH/save_name whenever the validation AUC
    beats the best value seen so far (starting from 0.5).

    Arguments:
        trn_x, y_trn: training features (wrapper exposing `.d`) and labels.
        val_x, y_val: validation features and labels.
        batch_size: batch size used for the evaluation passes.
        save_name: file name of the checkpoint inside WEIGHT_PATH.
    """

    def __init__(self, trn_x, y_trn,val_x, y_val, batch_size=128, save_name='weight.h5'):
        super(MetricsCallback, self).__init__()
        self.trn_generator = DataSequence(trn_x, y_trn, seq_feat, batch_size=batch_size)
        self.val_generator = DataSequence(val_x, y_val, seq_feat, batch_size=batch_size)
        self.y_trn = y_trn
        self.y_val = y_val
        self.save_name = save_name
        self.best_score = 0.5

    def _auc(self, generator, y_true):
        """Predict over `generator` with the current model and score against `y_true`."""
        y_pred = self.model.predict_generator(generator,
                                              max_queue_size=10,
                                              workers=1,
                                              use_multiprocessing=False,
                                              verbose=0)
        return roc_auc_score(y_true, y_pred)

    def on_epoch_end(self, epoch, logs=None):
        # eval train
        roc = self._auc(self.trn_generator, self.y_trn)
        # eval valid
        roc_val = self._auc(self.val_generator, self.y_val)
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')

        if roc_val > self.best_score:
            self.best_score = roc_val
            # Make sure the target directory exists so a finished epoch is
            # not lost to a missing-folder error.
            os.makedirs(WEIGHT_PATH, exist_ok=True)
            self.model.save_weights(os.path.join(WEIGHT_PATH, self.save_name))

        return

In [49]:
BATCH_SIZE = 1024
train_x, train_y = train, train.d['label']

# NOTE(review): the checkpoint name only encodes the fold index, so adding
# more seeds to this list would overwrite earlier seeds' weights.
for seeds in [42]:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seeds)
    for index, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
        print('*' * 30)
        X_train, y_train, X_valid, y_valid = train_x[tr_idx], train_y[tr_idx], train_x[va_idx], train_y[va_idx]
        trn_generator = DataSequence(X_train, y_train, seq_feat, batch_size=BATCH_SIZE)
        val_generator = DataSequence(X_valid, y_valid, seq_feat, batch_size=BATCH_SIZE)
        # Start every fold from a fresh graph. clear_session() also discards
        # the current Keras session, so register a new one built from the
        # allow_growth config to keep GPU memory allocation on demand.
        K.clear_session()
        K.set_session(tf.Session(config=config))
        model = DNN_LSTM(seq_embed_cnt, seq_embed_size=64, embed_weights=embed_weights)
        model.compile(loss ='binary_crossentropy', optimizer='Adam')  #logloss
        # The callback evaluates AUC with a 4x larger batch and checkpoints
        # the best epoch of this fold.
        history = model.fit_generator(generator=trn_generator,
                                      validation_data=val_generator,
                                      epochs=15,
                                      verbose=1,
                                      callbacks=[MetricsCallback(X_train, y_train,
                                                   X_valid, y_valid,
                                                   batch_size=BATCH_SIZE*4,
                                                   save_name='lstm_fold_%d.h5' % index)],
                                      max_queue_size=10,
                                      workers=1,
                                      use_multiprocessing=False)  #0.6660

        del X_train, y_train, X_valid, y_valid
        gc.collect()

******************************
Epoch 1/15
roc-auc: 0.6751 - roc-auc_val: 0.6706                                                                                                    
Epoch 2/15
roc-auc: 0.6878 - roc-auc_val: 0.6794                                                                                                    
Epoch 3/15
roc-auc: 0.6977 - roc-auc_val: 0.6864                                                                                                    
Epoch 4/15
roc-auc: 0.7089 - roc-auc_val: 0.6931                                                                                                    
Epoch 5/15
roc-auc: 0.7096 - roc-auc_val: 0.6918                                                                                                    
Epoch 6/15
roc-auc: 0.7165 - roc-auc_val: 0.6976                                                                                                    
Epoch 7/15
roc-auc: 0.7245 - roc-auc_val: 0.7023                           

roc-auc: 0.7108 - roc-auc_val: 0.6943                                                                                                    
Epoch 5/15
roc-auc: 0.7167 - roc-auc_val: 0.6979                                                                                                    
Epoch 6/15
roc-auc: 0.7212 - roc-auc_val: 0.6999                                                                                                    
Epoch 7/15
roc-auc: 0.7275 - roc-auc_val: 0.7028                                                                                                    
Epoch 8/15
roc-auc: 0.7299 - roc-auc_val: 0.7038                                                                                                    
Epoch 9/15
roc-auc: 0.7337 - roc-auc_val: 0.7067                                                                                                    
Epoch 10/15
roc-auc: 0.7373 - roc-auc_val: 0.7082                                                                    

In [32]:
BATCH_SIZE = 1024
# Same splitter settings as the training loop so fold i here matches the
# fold whose weights were saved as lstm_fold_i.h5.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kfold.get_n_splits()
test_x = test
train_x, train_y = train, train.d['label']
# The labels passed for the test generator are dummies; only the features
# are used for prediction.
test_generator = DataSequence(test_x, np.zeros(len(test_x)), seq_feat, batch_size=BATCH_SIZE)

# Out-of-fold predictions for train, fold-averaged predictions for test.
stack_test = np.zeros((len(test_x), 1))
stack_train = np.zeros((len(train_x), 1))

for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):

    print('-'*100)
    print('Fold %d' % i)
    # Only the held-out rows are needed here; the training part of the fold
    # is not materialised, which saves a full copy of the training arrays.
    X_valid, y_valid = train_x[va_idx], train_y[va_idx]

    # clear_session() drops the Keras session, so re-register one built from
    # the allow_growth config.
    K.clear_session()
    K.set_session(tf.Session(config=config))
    model = DNN_LSTM(seq_embed_cnt, seq_embed_size=64, embed_weights=embed_weights)
    model.compile(loss='binary_crossentropy', optimizer='adam',)

    val_generator = DataSequence(X_valid, y_valid, seq_feat, batch_size=BATCH_SIZE)

    model.load_weights(os.path.join(WEIGHT_PATH, 'lstm_fold_%d.h5' % i))

    stack_train[va_idx] = model.predict_generator(val_generator, verbose=1)
    stack_test += model.predict_generator(test_generator, verbose=1) / n_folds

    print(roc_auc_score(y_valid, stack_train[va_idx]))

    del X_valid, y_valid
    gc.collect()
    print('Predict Done.')

----------------------------------------------------------------------------------------------------
Fold 0
0.7117546495794349
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 1
0.7122995297749857
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 2
0.7119206606310954
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 3
0.7124045318508603
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 4
0.711623117109101
Predict Done.


In [33]:
# Train rows first, then test rows, as a single-column feature frame.
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame({'lstm_enc_feat': stack[:, 0]})

In [34]:
# Create the feature directory if needed so the write cannot fail on a
# missing folder.
os.makedirs(FEAT_PATH, exist_ok=True)
df_stack.to_pickle(os.path.join(FEAT_PATH, 'lstm_enc_feat.pickle'))
print("Feature saved, shape:",df_stack.shape)

Feature saved, shape: (10630880, 1)
