In [1]:
import gc
import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

import keras.backend as K
import keras.backend.tensorflow_backend as KTF
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation,normalization, Input,concatenate,Dense, Embedding,Flatten
from keras.layers.advanced_activations import PReLU
from keras.callbacks import ModelCheckpoint
from keras import activations
from keras.engine.topology import Layer, InputSpec
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from keras import optimizers
from keras.utils import Sequence
from keras.layers.normalization import BatchNormalization
from keras.callbacks import Callback
from sklearn.metrics import roc_auc_score

# Restrict this process to the CUDA device with index 1. This must run before
# the first TensorFlow session is created below.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Ask TensorFlow to grow its GPU memory allocation on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
# Create the configured session and register it as the session Keras uses.
sess = tf.Session(config=config)
KTF.set_session(sess)

Using TensorFlow backend.


# Data Loading

In [2]:
# Directory holding the raw invite tables read in the data-loading cell.
DATA_PATH = '../data/data_set_0926/'
# Directory of precomputed feature files; the stacked NN feature is written here too.
FEAT_PATH = './features/'
# Directory where per-fold model weights are saved and later reloaded.
WEIGHT_PATH = './weights/'

In [2]:
def _read_invite_table(filename, columns):
    """Read a headerless tab-separated invite file from DATA_PATH and name its columns."""
    table = pd.read_table(os.path.join(DATA_PATH, filename), header=None)
    table.columns = columns
    return table


# Labelled invitations (the last column is the target).
invite_info = _read_invite_table('invite_info_0926.txt',
                                 ['问题ID','用户ID','邀请创建时间','邀请是否被回答'])

# The two evaluation sets carry no target column.
invite_info_evaluate_A = _read_invite_table('invite_info_evaluate_1_0926.txt',
                                            ['问题ID','用户ID','邀请创建时间'])
invite_info_evaluate_B = _read_invite_table('invite_info_evaluate_2_0926.txt',
                                            ['问题ID','用户ID','邀请创建时间'])

oversample = False

# Labelled rows come first, then evaluation set B; set A is appended only
# when `oversample` is switched on.
invite_frames = [invite_info, invite_info_evaluate_B]
if oversample:
    invite_frames.append(invite_info_evaluate_A)
data = pd.concat(invite_frames, axis=0)

14

# Feature Engineering

In [5]:
tic = time.time()

# (feature file, join key) pairs, left-joined onto `data` in this exact order.
w2v_feature_files = [
    ('ques_describe_W_sum_64.h5', '问题ID'),
    ('ques_describe_SW_sum_64.h5', '问题ID'),
    ('ques_topic_W_sum_64.h5', '问题ID'),
    ('ques_topic_SW_sum_64.h5', '问题ID'),
    ('user_watched_topic_w2v_sum_64.h5', '用户ID'),
    ('user_fav_topic_w2v_sum_64.h5', '用户ID'),
    ('ques_attach_topic_w2v_sum_64.h5', '问题ID'),
]

for feat_file, join_key in w2v_feature_files:
    # `tmp` deliberately stays bound after the loop; a later cell deletes it.
    tmp = pd.read_hdf(os.path.join(FEAT_PATH, feat_file), key='data')
    data = data.merge(tmp, on=join_key, how='left')

print("Used time: %d s" % (time.time()-tic))

CPU times: user 2min 43s, sys: 1min 58s, total: 4min 41s
Wall time: 4min 54s


In [6]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Columns whose dtype name starts with ``int`` are cast to the narrowest of
      int8/int16/int32/int64 whose range contains the column's min and max.
    * Every other column is treated as floating point and cast to float16 or
      float32 when its min and max fit, otherwise to float64.

    The range checks are inclusive, so a value equal to a dtype's limit
    (e.g. 127 or -128 for int8) still selects that dtype.  The float check
    looks at range only, so casting to float16/float32 can lose precision.
    If min/max are NaN (e.g. an all-NaN column) no range check passes: integer
    columns are left untouched and other columns become float64.

    Progress and the before/after memory usage are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    tic = time.time()
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidates ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: boundary values are representable.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN): use float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    toc = time.time()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    print('Used time: %d s'%(toc-tic))
    return df

In [7]:
# Downcast the merged frame in place before modelling (the recorded run
# shrank it from 36741.55 MB to 9385.92 MB and took 1258 s).
data = reduce_mem_usage(data)

Memory usage of dataframe is 36741.55 MB
Memory usage after optimization is: 9385.92 MB
Decreased by 74.5%
Used time: 1258 s


In [8]:
# The last feature table read during merging is no longer needed; drop the
# reference and trigger a garbage-collection pass.
del tmp
gc.collect()

14

# Model

In [9]:
def _numbered_columns(template):
    """Expand a '{}' column-name template into the 64 names numbered 1..64."""
    return [template.format(i) for i in range(1,65)]


ques_describe_W_feat = _numbered_columns('ques_describe_W_w2v_sum_{}')
ques_describe_SW_feat = _numbered_columns('ques_describe_SW_w2v_sum_{}')
ques_topic_W_feat = _numbered_columns('ques_topic_W_w2v_sum_{}')
# NOTE(review): unlike its siblings this template has no 'ques_topic_' prefix;
# presumably it matches the column names stored in the feature file — confirm.
ques_topic_SW_feat = _numbered_columns('SW_w2v_sum_{}')
user_watched_topic_feat = _numbered_columns('用户关注的话题_w2v_sum_{}')
user_fav_topic_feat = _numbered_columns('用户感兴趣的话题_w2v_sum_{}')
ques_attach_topic_feat = _numbered_columns('问题绑定的话题ID_w2v_sum_{}')

# All dense input columns, concatenated group by group in a fixed order.
num_feat = [
    name
    for group in (
        ques_describe_W_feat,
        ques_describe_SW_feat,
        ques_topic_W_feat,
        ques_topic_SW_feat,
        user_watched_topic_feat,
        user_fav_topic_feat,
        ques_attach_topic_feat,
    )
    for name in group
]

In [10]:
feature = num_feat

# Rows from the evaluation file have no label after the concat, so a missing
# target identifies the test rows; everything else is training data.
test_index = np.isnan(data['邀请是否被回答'])
train_index = ~test_index

# Select rows and columns in one step, then renumber rows from 0.
train_x = data.loc[train_index, feature].reset_index(drop=True)
train_y = data.loc[train_index, '邀请是否被回答'].reset_index(drop=True)
test_x = data.loc[test_index, feature].reset_index(drop=True)
print('All features: train shape {}, test shape {}'.format(train_x.shape, test_x.shape))

All features: train shape (9489162, 448), test shape (1141718, 448)


In [11]:
# train_x / train_y / test_x now hold everything needed; drop the merged frame
# and trigger a garbage-collection pass.
del data
gc.collect()

28

In [12]:
class DataSequence(Sequence):
    """Keras ``Sequence`` that serves consecutive row batches of a DataFrame.

    Each batch's inputs are a list: one array per column in
    ``single_embed_feat`` followed by one 2-D array holding the ``dense_feat``
    columns.  Rows are served in their stored order.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; must contain every column named in
        ``single_embed_feat`` and ``dense_feat``.
    y : sliceable
        Targets aligned row-for-row with ``x``; sliced with ``y[start:stop]``.
    single_embed_feat : list of str
        Columns delivered as separate per-column inputs (may be empty).
    dense_feat : list of str
        Columns delivered together as the final dense input.
    batch_size : int, default 128
        Number of rows per batch; the last batch may be shorter.
    """

    def __init__(self, x, y, single_embed_feat, dense_feat, batch_size=128):
        self.x = x
        self.y = y
        self.single_embed_feat = single_embed_feat
        self.dense_feat = dense_feat
        # Previously this attribute was read from a notebook-level global of
        # the same name; it now reflects the columns this instance was
        # actually given.  The attribute name is kept for existing readers.
        self.num_feat = dense_feat
        self.batch_size = batch_size
        # Materialise the arrays once so __getitem__ only has to slice.
        self.x_single_embed = [x[f].values for f in single_embed_feat]
        self.x_dense = x[dense_feat].values

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` for batch number ``idx``."""
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size
        batch_x = [xf[start:stop] for xf in self.x_single_embed] \
                + [self.x_dense[start:stop]]
        batch_y = self.y[start:stop]
        return batch_x, batch_y

In [13]:
class MetricsCallback(Callback):
    """Report train/validation ROC-AUC after each epoch and checkpoint the best.

    After every epoch the model predicts on both sets, the two ROC-AUC scores
    are printed, and the weights are written to
    ``os.path.join(WEIGHT_PATH, save_name)`` whenever the validation score
    beats the best score seen so far (initially 0.5).

    Parameters
    ----------
    trn_x, y_trn : training features and labels.
    val_x, y_val : validation features and labels.
    batch_size : int, default 128
        Batch size used for the prediction passes.
    save_name : str, default 'weight.h5'
        File name of the checkpoint inside ``WEIGHT_PATH``.
    dense_feat : list of str, optional
        Feature columns fed to the model.  When omitted, the notebook-level
        ``num_feat`` list is used, which is what this class always used before.
    """

    def __init__(self, trn_x, y_trn,val_x, y_val, batch_size=128, save_name='weight.h5',
                 dense_feat=None):
        # Let the Keras base class set up its own attributes first.
        super().__init__()
        if dense_feat is None:
            dense_feat = num_feat
        self.trn_generator = DataSequence(trn_x, y_trn, [], dense_feat, batch_size=batch_size)
        self.val_generator = DataSequence(val_x, y_val, [], dense_feat, batch_size=batch_size)
        self.y_trn = y_trn
        self.y_val = y_val
        self.save_name = save_name
        # 0.5 is the ROC-AUC of a random ranking; only better models are saved.
        self.best_score = 0.5

    def on_epoch_end(self, epoch, logs=None):
        """Score both sets and save the weights if validation ROC-AUC improved."""
        # train
        y_pred = self.model.predict_generator(self.trn_generator,
                                              max_queue_size=10,
                                              workers=1,
                                              use_multiprocessing=False,
                                              verbose=0)
        roc = roc_auc_score(self.y_trn, y_pred)
        # valid
        y_pred_val = self.model.predict_generator(self.val_generator,
                                              max_queue_size=10,
                                              workers=1,
                                              use_multiprocessing=False,
                                              verbose=0)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        # '\r' plus the trailing padding overwrites the Keras progress-bar line.
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')

        if roc_val > self.best_score:
            self.best_score = roc_val
            # Create the target directory on first use so saving cannot fail
            # just because it is missing.
            os.makedirs(WEIGHT_PATH, exist_ok=True)
            self.model.save_weights(os.path.join(WEIGHT_PATH, self.save_name))

        return

In [14]:
def DNN(input_dim=None, hidden_units=(512, 256, 128, 64)):
    """Build the (uncompiled) fully connected binary classifier.

    The network is a stack of ``Dense -> BatchNormalization -> PReLU`` blocks,
    one per entry of ``hidden_units``, followed by a single sigmoid unit.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features.  When omitted, it is taken from the
        notebook-level ``X_train`` frame (``X_train.shape[1]``), which is how
        this function always determined it before; pass it explicitly to
        avoid depending on that global.
    hidden_units : sequence of int, default (512, 256, 128, 64)
        Width of each hidden block, from input to output.

    Returns
    -------
    keras.models.Model
        Model mapping one ``(input_dim,)`` input to one sigmoid output.
    """
    if input_dim is None:
        # Backward-compatible fallback for existing ``DNN()`` calls.
        input_dim = X_train.shape[1]

    input1 = Input(shape=(input_dim,))

    hidden = input1
    for units in hidden_units:
        hidden = Dense(units)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = PReLU()(hidden)

    out = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs=input1, outputs = out)

    return model

In [12]:
##Train
BATCH_SIZE = 1024

# One 5-fold stratified split per seed.  The weight file name only encodes the
# fold number, so adding more seeds would overwrite earlier checkpoints.
for seeds in [42]:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seeds)
    for index, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y.values)):
        print('*' * 30)

        # Start every fold from an empty graph so models built for earlier
        # folds do not accumulate (the prediction cell does the same).
        K.clear_session()

        # Positional selection for both features and labels.
        X_train, X_valid = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        y_train, y_valid = train_y.iloc[tr_idx], train_y.iloc[va_idx]

        trn_generator = DataSequence(X_train, y_train, [], num_feat, batch_size=BATCH_SIZE)
        val_generator = DataSequence(X_valid, y_valid, [], num_feat, batch_size=BATCH_SIZE)

        # DNN() reads the input width from the X_train assigned just above.
        model = DNN()
        model.compile(loss ='binary_crossentropy', optimizer='Adam')

        # The callback predicts with a 4x larger batch and writes the weights
        # of the best validation epoch to 'mlp_fold_<index>.h5'.
        history = model.fit_generator(generator=trn_generator,
                                      validation_data=val_generator,
                                      epochs=20,
                                      verbose=1,
                                      callbacks=[MetricsCallback(X_train, y_train,X_valid, y_valid,
                                                 batch_size=BATCH_SIZE*4, save_name='mlp_fold_%d.h5' % index)],
                                      max_queue_size=10,
                                      workers=1,
                                      use_multiprocessing=False)  #0.6830

        del X_train, y_train, X_valid, y_valid
        gc.collect()

******************************
Epoch 1/20
roc-auc: 0.6608 - roc-auc_val: 0.6578                                                                                                    
Epoch 2/20
roc-auc: 0.6737 - roc-auc_val: 0.6682                                                                                                    
Epoch 3/20
roc-auc: 0.6819 - roc-auc_val: 0.6733                                                                                                    
Epoch 4/20
roc-auc: 0.6887 - roc-auc_val: 0.6771                                                                                                    
Epoch 5/20
roc-auc: 0.6958 - roc-auc_val: 0.6804                                                                                                    
Epoch 6/20
roc-auc: 0.7 - roc-auc_val: 0.682                                                                                                    
Epoch 7/20
roc-auc: 0.7066 - roc-auc_val: 0.6846                               

roc-auc: 0.7458 - roc-auc_val: 0.6938                                                                                                    
Epoch 16/20
roc-auc: 0.7487 - roc-auc_val: 0.6934                                                                                                    
Epoch 17/20
roc-auc: 0.753 - roc-auc_val: 0.6942                                                                                                    
Epoch 18/20
roc-auc: 0.7572 - roc-auc_val: 0.6953                                                                                                    
Epoch 19/20
roc-auc: 0.7571 - roc-auc_val: 0.6971                                                                                                    
Epoch 20/20
roc-auc: 0.7624 - roc-auc_val: 0.696                                                                                                    
******************************
Epoch 1/20
roc-auc: 0.6619 - roc-auc_val: 0.6595                                   

roc-auc: 0.7175 - roc-auc_val: 0.688                                                                                                    
Epoch 10/20
roc-auc: 0.7231 - roc-auc_val: 0.6888                                                                                                    
Epoch 11/20
roc-auc: 0.7276 - roc-auc_val: 0.6908                                                                                                    
Epoch 12/20
roc-auc: 0.7318 - roc-auc_val: 0.691                                                                                                    
Epoch 13/20
roc-auc: 0.7375 - roc-auc_val: 0.6921                                                                                                    
Epoch 14/20
roc-auc: 0.7409 - roc-auc_val: 0.6936                                                                                                    
Epoch 15/20
roc-auc: 0.7444 - roc-auc_val: 0.6935                                                                 

In [15]:
##valid & Test
BATCH_SIZE = 1024

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_generator = DataSequence(test_x, np.zeros(test_x.shape[0]), [], num_feat, batch_size=BATCH_SIZE)

stack_test = np.zeros((test_x.shape[0], 1))
stack_train = np.zeros((train_x.shape[0], 1))

for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y.values)):
    print('-'*100)
    print('Fold %d' % i)
    X_train, y_train, X_valid, y_valid = train_x.iloc[tr_idx], train_y[tr_idx], train_x.iloc[va_idx], train_y[va_idx]

    K.clear_session()
    model = DNN()
    model.compile(loss='binary_crossentropy', optimizer='adam',)

    val_generator = DataSequence(X_valid, y_valid, [], num_feat, batch_size=BATCH_SIZE)

    model.load_weights(os.path.join(WEIGHT_PATH, 'mlp_fold_%d.h5' % i))

    stack_train[va_idx] = model.predict_generator(val_generator, verbose=1)
    stack_test += model.predict_generator(test_generator, verbose=1) / 5
    
    print(roc_auc_score(y_valid, stack_train[va_idx]))
    
    del X_train, y_train, X_valid, y_valid
    gc.collect()
    print('Predict Done.')

----------------------------------------------------------------------------------------------------
Fold 0
0.6831091556916525
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 1
0.6832178829122443
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 2
0.6833419535485171
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 3
0.6828106345053794
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 4
0.6820476438130859
Predict Done.


In [16]:
# Training-row predictions first, test-row predictions after, as one column.
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame({'w2v_sum_nn': stack[:,0]})

In [17]:
# Persist the stacked predictions next to the other feature files.
stack_feature_path = os.path.join(FEAT_PATH, 'w2v_sum_nn.pkl')
df_stack.to_pickle(stack_feature_path)
print("Feature saved, shape:",df_stack.shape)

Feature saved, shape: (10630880, 1)
