## GBDT+LR

In [1]:
import os
import gc
import numpy as np
import pandas as pd
from collections import namedtuple

import lightgbm as lgb 
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping

##### 获取数据

In [3]:
def get_criteo_data():
    """ 读取criteo数据集 """
    train_path = '../data/criteo/train.txt'
    test_path = '../data/criteo/test.txt'

    dense_column_names = ['I' + str(i) for i in range(1, 14)]
    sparse_column_names = ['C' + str(i) for i in range(14, 40)]
    column_names = ['label'] + dense_column_names + sparse_column_names

    train = pd.read_csv(train_path, names=column_names, sep='\t')
    test = pd.read_csv(test_path, names=column_names, sep='\t')
    return train, test, dense_column_names, sparse_column_names

train_df, test_df, dense_column_names, sparse_column_names = get_criteo_data()
train_len = train_df.shape[0]
print('train_df.shape: {}, test_df.shape: {}, train_len:{}'.format(train_df.shape, test_df.shape, train_len))
print('dense_column_names:{}\nsparse_column_names:{} '.format(dense_column_names, sparse_column_names))

train_df.shape: (1000000, 40), test_df.shape: (1000000, 40), train_len:1000000
dense_column_names:['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']
sparse_column_names:['C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29', 'C30', 'C31', 'C32', 'C33', 'C34', 'C35', 'C36', 'C37', 'C38', 'C39'] 


##### 数据预处理

使用sklearn的LR要进行one-hot编码，当类别数量特别多时，可能出现内存不够的情况，这时需要对数据构造成稀疏矩阵格式（scipy中csr_matrix格式），再进行训练。

如果直接使用TF的Embedding方式进行one-hot编码，此时直接查表，无需再进行one-hot编码，也就不存在内存不够的问题。

In [4]:
data = pd.concat([train_df, test_df], axis=0)

def label_encoder(y, frequency_threshold=100):
    """ 字典编码，eg: label_encoder([2,2,3,3,3,5,6], 2) => array([2, 2, 1, 1, 1, 0, 0], dtype=int8)
    :params y 类别特征
    :params frequency_threshold 阈值，低于阈值的类别编码为0
    :return: 编码后的numpy数组
    """
    value_counts = pd.value_counts(y)
    categories = value_counts[value_counts >= frequency_threshold].index.to_numpy()
    #print('categories', categories)
    return pd.Categorical(y, categories=categories).codes+1

def data_processing(df, dense_column_names, sparse_column_names):
    """ 数据预处理 """
    df[dense_column_names] = df[dense_column_names].fillna(0.0)
    for f in dense_column_names:
        df[f] = df[f].apply(lambda x: np.log(x+1) if x > -1 else -1)
    
    df[sparse_column_names] = df[sparse_column_names].fillna("-1")
    for f in sparse_column_names:
        df[f] = label_encoder(df[f], 100)
    return df[dense_column_names + sparse_column_names]

df = data_processing(data, dense_column_names, sparse_column_names)
df['label'] = data['label']

train_df, test_df = df.iloc[0:train_len], df.iloc[train_len:]
print('train_df.shape:{}, test_df.shape:{}'.format(train_df.shape, test_df.shape))

# 编码
for c in sparse_column_names:
    train_df[c] = train_df[c].astype('category')
    test_df[c] = test_df[c].astype('category')
    
x_train, x_val, y_train, y_val = train_test_split(train_df.drop(['label'], axis=1), train_df['label'], test_size = 0.2, random_state = 2018)

train_df.shape:(1000000, 40), test_df.shape:(1000000, 40)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


##### GBDT模型构建

gbdt特征向量维度为num_trees参数决定，某一离散特征下的类别数量由num_leaves决定。

- num_trees GBDT特征的维度数量
- num_leaves GBDT特征下的类别数量

In [8]:
lgb_train = lgb.Dataset(
            x_train,
            y_train)
lgb_eval = lgb.Dataset(
            x_val,
            y_val,
            reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'auc'},
    'learning_rate': 0.03,
    'max_depth': 7,
    'num_leaves': 80,
    'subsample': 0.9,
    'min_data_in_leaf':100,
    'bagging_fraction':0.7,
    'bagging_freq' :1,
    'verbose': -1,
    'num_leaves': 1000,
    'num_trees': 10, # 控制gbdt特征纬度
}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=40000,
                valid_sets=lgb_eval,
                early_stopping_rounds=50,
                verbose_eval=10)







Training until validation scores don't improve for 50 rounds
[10]	valid_0's auc: 0.740876
Did not meet early stopping. Best iteration is:
[10]	valid_0's auc: 0.740876


In [10]:
# 预测得到叶子节点
gbdt_feats_train = gbm.predict(train_df.drop(['label'], axis=1), pred_leaf=True, predict_disable_shape_check=True)
gbdt_feats_test = gbm.predict(test_df.drop(['label'], axis=1), pred_leaf=True, predict_disable_shape_check=True)

# 合并GBDT特征和原始特征
gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns = gbdt_feats_name) 
df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns = gbdt_feats_name)
train_df = pd.concat([train_df, df_train_gbdt_feats], axis=1)
test_df = pd.concat([test_df, df_test_gbdt_feats], axis=1)

del gbdt_feats_train
del gbdt_feats_test 
gc.collect()

df = pd.concat([train_df, test_df], axis=0)

##### LR模型构建

直接使用sklearn的LR模型在onehot编码过程中存在维度爆炸的问题，故这里考虑使用TF的Embeding(embedding_size=1)查表的做法，而无需进行onehot操作。

In [11]:
SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])

feature_columns = [SparseFeature(f, vocabulary_size=df[f].nunique(), embedding_size=4) for f in sparse_column_names + gbdt_feats_name] + \
[DenseFeature(f, 1) for f in dense_column_names]
feature_columns

[SparseFeature(name='C14', vocabulary_size=138, embedding_size=4),
 SparseFeature(name='C15', vocabulary_size=421, embedding_size=4),
 SparseFeature(name='C16', vocabulary_size=1308, embedding_size=4),
 SparseFeature(name='C17', vocabulary_size=1708, embedding_size=4),
 SparseFeature(name='C18', vocabulary_size=52, embedding_size=4),
 SparseFeature(name='C19', vocabulary_size=10, embedding_size=4),
 SparseFeature(name='C20', vocabulary_size=3251, embedding_size=4),
 SparseFeature(name='C21', vocabulary_size=82, embedding_size=4),
 SparseFeature(name='C22', vocabulary_size=3, embedding_size=4),
 SparseFeature(name='C23', vocabulary_size=2355, embedding_size=4),
 SparseFeature(name='C24', vocabulary_size=2308, embedding_size=4),
 SparseFeature(name='C25', vocabulary_size=1347, embedding_size=4),
 SparseFeature(name='C26', vocabulary_size=1979, embedding_size=4),
 SparseFeature(name='C27', vocabulary_size=26, embedding_size=4),
 SparseFeature(name='C28', vocabulary_size=1960, embedding_si

In [12]:
def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict = {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
    return dense_input_dict, sparse_input_dict

def build_embedding_layers(feature_columns, is_linear):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    if is_linear:
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size + 1, 1, name='1d_emb_' + f.name)
    else:
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size + 1, f.embedding_size, name='kd_emb_' + f.name)
    return embedding_layer_dict
    
def get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns):
    """ 线性回归 """
    concat_dense_inputs = Concatenate(axis=1)(list(dense_input_dict.values()))
    dense_logits_output = Dense(1, kernel_regularizer='l1')(concat_dense_inputs)
    
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    embedding_layer_dict = build_embedding_layers(sparse_feature_columns, is_linear=True)
    sparse_1d_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = Flatten()(embedding_layer_dict[f.name](_input))
        sparse_1d_embed_list.append(_embed)
    if len(sparse_1d_embed_list) > 0:
        sparse_logits_output = Add()(sparse_1d_embed_list)
        linear_logits = Add()([dense_logits_output, sparse_logits_output])
    else:
        linear_logits = dense_logits_output
    return linear_logits

def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')
    
def LR(feature_columns, seed=1024, l2_reg=1e-5, task='binary'):
    """ Instantiates LR architecture
    :param feature_columns 
    :param seed
    :param l2_reg L2regularization
    :return: A kears model instance
    """
    dense_input_dict, sparse_input_dict = build_input_layers(feature_columns)
    input_list = list(dense_input_dict.values()) + list(sparse_input_dict.values())
    
    linear_logits = get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns)
    output_layer = Activation("sigmoid")(linear_logits)
    model = Model(input_list, output_layer)
    return model

model = LR(feature_columns)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C14 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C15 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C16 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C17 (InputLayer)                [(None, 1)]          0                                            
______________________________________________________________________________________________

In [14]:
train_input = {f: train_df[f] for f in dense_column_names + sparse_column_names + gbdt_feats_name}
test_input = {f: test_df[f] for f in dense_column_names + sparse_column_names + gbdt_feats_name}
# train_input = {f: train[f] for f in dense_column_names}
# test_input = {f: test[f] for f in dense_column_names}

my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 00016: early stopping


<keras.callbacks.History at 0x2a8bae407b8>

In [15]:
# 模型预测与评估
result = model.predict(test_input)
model_metric(np.array([i[0] for i in result]), test_df['label'].values)

模型准确率:0.783871, AUC得分:0.7713829679237706, LogLoss:0.46848234563941915
              precision    recall  f1-score   support

           0       0.81      0.93      0.87    751819
           1       0.62      0.34      0.44    248181

    accuracy                           0.78   1000000
   macro avg       0.71      0.64      0.65   1000000
weighted avg       0.76      0.78      0.76   1000000



In [16]:
# ====== Sklearn LR ======
# data = pd.concat([train, test])
# # 稀疏特征维度过多，one-hot后维度爆炸，故再进行一个label_encoder编码
# for f in sparse_column_names:
#     data[f] = label_encoder(data[f], 2000)

# # onehot编码
# for col in gbdt_feats_name+sparse_column_names:
#     print('feature: {}, count:{}'.format(col, len(data[col].value_counts())))
#     onehot_features = pd.get_dummies(data[col], prefix=col)
#     data.drop([col], axis=1, inplace=True)
#     data = pd.concat([data, onehot_features], axis=1)

# train = data[: train_len]
# test = data[train_len:]
# del data
# gc.collect()

# # 训练
# x_train, x_val, y_train, y_val = train_test_split(train.drop(['label'], axis=1), train['label'], test_size=0.3, random_state = 1024)
# lr = LogisticRegression()
# lr.fit(x_train, y_train)

# # 训练集和验证集
# model_metric(lr.predict_proba(x_train)[:, 1], y_train)
# model_metric(lr.predict_proba(x_val)[:, 1], y_val)

# # # 模型预测与评估
# preds = lr.predict_proba(test.drop(['label'], axis=1))[:, 1]
# model_metric(preds, test['label'], thr=0.5)