## LSTM

In [1]:
import os
import numpy as np
import pandas as pd
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

from keras.preprocessing.sequence import pad_sequences

##### 获取数据

In [2]:
# Load the train/test sample CSVs.
train_df = pd.read_csv('../data/ml-1m/train_df.csv')
test_df = pd.read_csv('../data/ml-1m/test_df.csv')
# Stack train on top of test so both splits go through identical preprocessing;
# train rows stay first so they can be separated again by position.
# `DataFrame.append` was removed in pandas 2.0, so `pd.concat` is used instead;
# `ignore_index=True` gives the combined frame a unique 0..n-1 index rather
# than repeating the index values of the two source frames.
data = pd.concat([train_df, test_df], ignore_index=True)
print('train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape))
data.head()

train_df.shape: (1956191, 14), test_df.shape: (12078, 14)


Unnamed: 0,user_id,hist_item_id,hist_s1,hist_s2,item_id,label,rating,click_timestamp,hist_len,gender,age,item_date,item_title,item_cate_id
0,1741,"2904,2008,230,1169,2064,3475,1952,1228,943,275...","898,948,3022,1888,2891,1080,2204,618,1372,852,...","3722,1244,3023,1594,3348,288,323,36,1226,1254,...",3266,1,4,974711543,50,1,4,29,456645670000000000000,6
1,1880,"1280,1213,1075,1274,1135,3292,2902,2008,3000,3...","3410,2666,3723,1109,2046,2078,1891,3639,2874,2...","3101,2671,1271,2406,3616,1158,2483,2858,2861,2...",3266,0,0,0,50,1,4,29,456645670000000000000,6
2,3292,"1895,1016,2030,2981,3008,2068,998,1002,1006,28...","3438,1948,3692,1963,1964,1178,1360,2134,3027,2...","2171,2972,3031,3176,1110,2041,2711,1059,3554,3...",3266,1,5,968098376,50,2,6,29,456645670000000000000,6
3,566,"2789,2069,1013,613,3657,3677,774,1965,3219,164...","1248,590,897,605,2118,2694,110,3483,2751,1024,...","1644,643,3377,1557,1599,2058,159,2270,50,1074,...",3266,1,2,976210413,50,1,3,29,456645670000000000000,6
4,1088,"52,1394,2851,2643,2909,416,1665,733,984,2266,1...","1233,2772,1815,539,3431,2761,838,1225,11,3029,...","2703,886,1253,1261,2365,3087,941,305,3061,711,...",3266,1,5,1023534057,50,2,1,29,456645670000000000000,6


##### 数据预处理

行为序列padding

    pad_sequences([[1,2,3], [2, 3]], maxlen=5, value=0) =》
    array([[0, 0, 1, 2, 3],
           [0, 0, 0, 2, 3]], dtype=int32)

In [3]:
# Treat the candidate item as the final element of each behaviour sequence.
data['hist_item_id'] = data['hist_item_id'] + (',' + data['item_id'].astype(str))

def parse_seqs_padding(seqs, maxlen=50):
    """Left-pad (or left-truncate) a comma-separated id sequence to ``maxlen`` tokens.

    Existing ``'0'`` tokens are treated as padding and dropped, the last
    ``maxlen`` remaining ids are kept, and the result is padded on the left
    with ``'0'``.

    Args:
        seqs: Comma-separated ids, e.g. ``'3,7,9'``.
        maxlen: Exact number of tokens in the returned sequence.

    Returns:
        A comma-separated string with exactly ``maxlen`` tokens (the empty
        string when ``maxlen`` is 0).

    Raises:
        ValueError: If ``maxlen`` is negative.
    """
    if maxlen < 0:
        raise ValueError('maxlen must be non-negative, got {}'.format(maxlen))
    tokens = (tok.strip() for tok in seqs.split(','))
    # Drop existing padding and empty tokens (an empty history or a stray comma
    # yields ''), which would otherwise break a later int() conversion.
    items = [tok for tok in tokens if tok and tok != '0']
    # Slice from an explicit start index: items[-maxlen:] would return the
    # whole list when maxlen == 0.
    items = items[max(len(items) - maxlen, 0):]
    return ','.join(['0'] * (maxlen - len(items)) + items)

# Keep the 20 most recent items of every sequence, left-padded with zeros.
data['hist_item_id'] = data['hist_item_id'].map(lambda seq: parse_seqs_padding(seq, 20))

# Column groups handed to the preprocessing step (dense / sparse / sequence).
dense_column_names = ['hist_len']
sparse_column_names = []
varlen_sparse_column_names = ['hist_item_id']

def data_processing(df, dense_column_names,
                    sparse_column_names,
                    varlen_sparse_column_names):
    """Transform dense and sparse columns and return the model feature columns.

    Dense columns: missing values become 0, then ``log(1 + x)`` is applied;
    values ``<= -1`` (outside the log domain) are mapped to ``-1``.
    Sparse columns: missing values become ``"-1"``, then each column is
    label-encoded with its own ``LabelEncoder``.
    Variable-length (sequence) columns are passed through unchanged.

    Note:
        The dense and sparse columns of ``df`` are overwritten in place, as
        before. The returned frame is an independent copy.

    Args:
        df: Input frame containing every listed column.
        dense_column_names: List of numeric column names.
        sparse_column_names: List of categorical column names.
        varlen_sparse_column_names: List of sequence column names.

    Returns:
        A new DataFrame with the dense, sparse and sequence columns, in that order.
    """
    # Guard against empty lists: there is nothing to assign in that case.
    if dense_column_names:
        df[dense_column_names] = df[dense_column_names].fillna(0.0)
    for f in dense_column_names:
        valid = df[f] > -1
        # Vectorised replacement for a per-row lambda. Out-of-domain entries
        # are masked to 0 before log1p so no invalid-value warning is raised,
        # then overwritten with -1.
        df[f] = np.where(valid, np.log1p(df[f].where(valid, 0)), -1.0)

    if sparse_column_names:
        df[sparse_column_names] = df[sparse_column_names].fillna("-1")
    for f in sparse_column_names:
        df[f] = LabelEncoder().fit_transform(df[f])

    selected = dense_column_names + sparse_column_names + varlen_sparse_column_names
    # Explicit copy: callers add columns to the result (e.g. the label), which
    # on a plain column subset triggers pandas' SettingWithCopyWarning.
    return df[selected].copy()

# Build the model-ready frame and attach the target. `assign` returns a new
# frame, which avoids the SettingWithCopyWarning pandas raises when a column is
# written into the column subset returned by data_processing.
df = data_processing(
    data, dense_column_names, sparse_column_names, varlen_sparse_column_names
).assign(label=data['label'])

# `data` was built with the train rows first, so split by position. The row
# count is captured before `train_df` is rebound to the processed frame.
n_train = train_df.shape[0]
train_df, test_df = df.iloc[:n_train], df.iloc[n_train:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


##### 模型构建

Dense特征和序列特征作为模型输入；序列特征经 Embedding 后送入 LSTM 学习。注意：当前 `LSTMRec` 的实现中，Dense 特征只被声明为模型输入，并未接入输出层（拼接 Dense 特征的代码已被注释掉）。

SpatialDropout1D 随机地将整个特征通道（即某一 embedding 维度在所有时间步上的值）置 0，而普通 Dropout 随机地将单个元素置 0。输入形状为 (samples, timesteps, channels)，即 (samples, sequence_length, embedding_dim)。

LSTM(units, input_shape=(3, 1))：units 指 cell 中隐藏单元的个数，也就是每个时间步输出向量的维度。

In [4]:
# Lightweight feature descriptors: one categorical id, one numeric vector,
# and one fixed-length id sequence respectively.
SparseFeature = namedtuple('SparseFeature', 'name vocabulary_size embedding_size')
DenseFeature = namedtuple('DenseFeature', 'name dimension')
VarLenSparseFeature = namedtuple(
    'VarLenSparseFeature', 'name vocabulary_size embedding_size maxlen'
)

feature_columns = [
    # Optional sparse features, currently disabled:
    # SparseFeature('user_id', data.user_id.max()+1, embedding_size=4),
    # SparseFeature('gender', data.gender.max()+1, embedding_size=4),
    # SparseFeature('age', data.age.max()+1, embedding_size=4),
    # SparseFeature('item_id', data.item_id.max()+1, embedding_size=4),
    # SparseFeature('item_cate_id', data.item_cate_id.max()+1, embedding_size=4),
    DenseFeature(name='hist_len', dimension=1),
    # The history sequence shares the item-id vocabulary.
    VarLenSparseFeature(
        name='hist_item_id',
        vocabulary_size=data['item_id'].max() + 1,
        embedding_size=4,
        maxlen=20,
    ),
]

feature_columns

[DenseFeature(name='hist_len', dimension=1),
 VarLenSparseFeature(name='hist_item_id', vocabulary_size=3884, embedding_size=4, maxlen=20)]

In [5]:
def model_metric(prob, label, thr=0.5):
    """Print accuracy, ROC-AUC, log loss and a classification report.

    Args:
        prob: Predicted scores for the positive class.
        label: Ground-truth labels.
        thr: Scores strictly greater than this value count as positive
            predictions for the accuracy and the classification report.
    """
    hard_pred = prob > thr

    # AUC from the ROC curve (the thresholds themselves are not needed).
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(label, prob)
    auc_value = metrics.auc(false_pos_rate, true_pos_rate)
    accuracy = metrics.accuracy_score(label, hard_pred)
    # LogLoss is computed on the raw scores, not the thresholded predictions.
    logloss_value = log_loss(label, prob)

    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(accuracy, auc_value, logloss_value))
    print(classification_report(label, hard_pred, digits=2))
    print('==========================================================')

class Pooling(Layer):
    """Reduce axis 1 of the input with an element-wise max or mean.

    Args:
        pooling_type: ``'max'`` or ``'mean'``.
        **kwargs: Standard Keras ``Layer`` arguments such as ``name``.
    """

    def __init__(self, pooling_type='max', **kwargs):
        # Forward **kwargs so the layer accepts the usual Layer options and can
        # be rebuilt from its config.
        super(Pooling, self).__init__(**kwargs)
        self.pooling_type = pooling_type

    def call(self, inputs):
        """Return ``inputs`` reduced over axis 1 (that axis is removed)."""
        # NOTE(review): any Keras mask attached to `inputs` is not consulted
        # here, so padded positions take part in the reduction - confirm that
        # this is intended.
        if self.pooling_type == 'max':
            return tf.reduce_max(inputs, 1, keepdims=False)
        if self.pooling_type == 'mean':
            return tf.reduce_mean(inputs, 1, keepdims=False)
        raise Exception("pooling_type error")

    def compute_output_shape(self, input_shape):
        """Return the input shape with axis 1 removed."""
        # The previous hard-coded (None, 1) did not match what call() returns.
        dims = tf.TensorShape(input_shape).as_list()
        return tuple(dims[:1] + dims[2:])

    def get_config(self):
        """Include ``pooling_type`` so the layer can be saved and reloaded."""
        config = super(Pooling, self).get_config()
        config.update({'pooling_type': self.pooling_type})
        return config
    
def build_input_layers(feature_columns):
    """Create one Keras ``Input`` per feature, grouped by feature kind.

    Args:
        feature_columns: Iterable of DenseFeature / SparseFeature /
            VarLenSparseFeature descriptors; other objects are ignored.

    Returns:
        Three dicts mapping feature name to its ``Input``: dense, sparse and
        variable-length sparse, in that order.
    """
    dense_inputs, sparse_inputs, varlen_inputs = {}, {}, {}
    for feat in feature_columns:
        # Pick the input width and the destination dict for this feature kind.
        if isinstance(feat, DenseFeature):
            shape, target = (feat.dimension,), dense_inputs
        elif isinstance(feat, SparseFeature):
            shape, target = (1,), sparse_inputs
        elif isinstance(feat, VarLenSparseFeature):
            shape, target = (feat.maxlen,), varlen_inputs
        else:
            continue
        target[feat.name] = Input(shape=shape, name=feat.name)
    return dense_inputs, sparse_inputs, varlen_inputs
    
def build_embedding_layers(feature_columns, is_linear):
    """Create the ``Embedding`` layer for each sparse / sequence feature.

    Args:
        feature_columns: List of feature descriptors.
        is_linear: If truthy, build 1-dimensional embeddings for sparse
            features only (sequence features take no part in the linear
            model). Otherwise build ``embedding_size``-dimensional embeddings
            for sparse features and zero-masked ones for sequence features.

    Returns:
        Dict mapping feature name to its ``Embedding`` layer.
    """
    sparse_feats = [f for f in feature_columns if isinstance(f, SparseFeature)]
    varlen_feats = [f for f in feature_columns if isinstance(f, VarLenSparseFeature)]

    if is_linear:
        return {
            f.name: Embedding(f.vocabulary_size + 1, 1, name='1d_emb_' + f.name)
            for f in sparse_feats
        }

    embeddings = {
        f.name: Embedding(f.vocabulary_size + 1, f.embedding_size,
                          name='kd_emb_' + f.name)
        for f in sparse_feats
    }
    # Sequence features are created with mask_zero=True.
    embeddings.update({
        f.name: Embedding(f.vocabulary_size + 1, f.embedding_size,
                          name='var_emb_' + f.name, mask_zero=True)
        for f in varlen_feats
    })
    return embeddings

def LSTMRec(feature_columns, behavior_seq_column_names,
            lstm_units=200, dropout_rate=0.2, seq_hidden_units=4, hidden_units=3):
    """Build a binary classifier over LSTM-encoded behaviour sequences.

    Each behaviour sequence is embedded, passed through spatial dropout and an
    LSTM that returns every time step, projected per step with a tanh Dense
    layer, and summarised with max and mean pooling over the time axis. The
    pooled vectors are concatenated and fed through
    Dense -> BatchNorm -> ReLU -> Dropout -> sigmoid.

    Args:
        feature_columns: Feature descriptors (Dense / Sparse / VarLenSparse).
        behavior_seq_column_names: Names of the VarLenSparse features to
            encode with the LSTM branch; must not be empty.
        lstm_units: Number of LSTM units.
        dropout_rate: Rate shared by the spatial dropout, the LSTM input
            dropout and the final Dropout layer.
        seq_hidden_units: Width of the per-time-step Dense projection.
        hidden_units: Width of the Dense layer before the output.

    Returns:
        An uncompiled Keras ``Model`` with a single sigmoid output.

    Raises:
        ValueError: If ``behavior_seq_column_names`` is empty.
    """
    if not behavior_seq_column_names:
        raise ValueError('behavior_seq_column_names must contain at least one feature name')

    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = build_input_layers(feature_columns)
    input_list = (list(dense_input_dict.values())
                  + list(sparse_input_dict.values())
                  + list(varlen_sparse_input_dict.values()))
    kd_embedding_layer_dict = build_embedding_layers(feature_columns, is_linear=False)

    lstm_pooling_list = []
    for seq_name in behavior_seq_column_names:
        # (None, maxlen, embedding_size)
        seq_embed = kd_embedding_layer_dict[seq_name](varlen_sparse_input_dict[seq_name])
        dropout_seq_embed = SpatialDropout1D(rate=dropout_rate)(seq_embed)
        # (None, maxlen, lstm_units)
        _lstm = LSTM(lstm_units, return_sequences=True, dropout=dropout_rate)(dropout_seq_embed)
        # Applies the same Dense layer at every time step, changing only the
        # last dimension: (None, maxlen, seq_hidden_units)
        _time_dist = TimeDistributed(Dense(seq_hidden_units, activation='tanh'))(_lstm)
        print('lstm.shape: ', _lstm.shape)
        print('time_dist.shape: ', _time_dist.shape)

        # Max and mean over the time axis (axis 1): (None, seq_hidden_units) each
        lstm_pooling_list.append(Pooling(pooling_type='max')(_time_dist))
        lstm_pooling_list.append(Pooling(pooling_type='mean')(_time_dist))

    # NOTE(review): dense and sparse inputs are registered as model inputs but
    # do not feed the output. The dense branch used to be built here without
    # being connected (and failed when there were no dense features), so it has
    # been removed. To use dense features, project them and add the result to
    # the list concatenated below.
    concat_lstm_dense = Concatenate(axis=1)(lstm_pooling_list)
    x = Dense(hidden_units)(concat_lstm_dense)
    x = BatchNormalization()(x)
    x = Activation(activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    output_layer = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input_list, outputs=output_layer)
    return model

# Sequence features that go through the LSTM branch.
behavior_seq_column_names = ['hist_item_id']
model = LSTMRec(feature_columns=feature_columns,
                behavior_seq_column_names=behavior_seq_column_names)
model.summary()

lstm.shape:  (None, 20, 200)
time_dist.shape:  (None, 20, 4)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hist_item_id (InputLayer)       [(None, 20)]         0                                            
__________________________________________________________________________________________________
var_emb_hist_item_id (Embedding (None, 20, 4)        15540       hist_item_id[0][0]               
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 20, 4)        0           var_emb_hist_item_id[0][0]       
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 20, 200)      164000      spatial_dropout1d[0][0]          
_________________________________

In [6]:
%%time
train_input = {
#     'user_id': np.array(train['user_id']),
#     'gender': np.array(train['gender']),
#     'age': np.array(train['age']),
#     'item_id': np.array(train['item_id']),
#     'item_cate_id': np.array(train['item_cate_id']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in train_df['hist_item_id']]),
    'hist_len': np.array(train_df['hist_len']),
}
test_input = {
#     'user_id': np.array(test['user_id']),
#     'gender': np.array(test['gender']),
#     'age': np.array(test['age']),
#     'item_id': np.array(test['item_id']),
#     'item_cate_id': np.array(test['item_cate_id']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in test_df['hist_item_id']]),
    'hist_len': np.array(test_df['hist_len']),
}

# 模型训练
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 00017: early stopping
Wall time: 2h 41min 20s


<keras.callbacks.History at 0x2029fc4deb8>

In [7]:
# Predict on the test split and print the evaluation metrics.
result = model.predict(test_input)
# Take column 0 of the prediction matrix to get a 1-D score vector.
model_metric(result[:, 0], test_df['label'].values)

模型准确率:0.7630402384500745, AUC得分:0.8479906371128547, LogLoss:0.5091028924468036
              precision    recall  f1-score   support

           0       0.83      0.66      0.73      6038
           1       0.72      0.87      0.79      6040

    accuracy                           0.76     12078
   macro avg       0.78      0.76      0.76     12078
weighted avg       0.78      0.76      0.76     12078

