### DCN模型


In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import re

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, Concatenate, Dropout, Input, Layer
from tensorflow.keras.regularizers import l2

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

import os

In [16]:


def sparseFeature(feat, feat_num, embed_dim=4):
    """
    create dictionary for sparse feature
    :param feat: feature name
    :param feat_num: the total number of sparse features that do not repeat
    :param embed_dim: embedding dimension
    :return:
    """
    return {'feat': feat, 'feat_num': feat_num, 'embed_dim': embed_dim}

def denseFeature(feat):
    """
    create dictionary for dense feature
    :param feat: dense feature name
    :return:
    """
    return {'feat': feat}


def create_criteo_dataset(file_train, file_test, dense_features, sparse_features,
                          embed_dim=8, read_part=False, sample_num=100000, test_size=0.2):

    # 训练数据
    df_train = pd.read_csv(file_train)
    # 测试数据
    df_apply_new = pd.read_csv(file_test)
    # 合并训练集，验证集
    data = pd.concat([df_train, df_apply_new], axis=0, ignore_index=True)
    data['label'] = data['label'].fillna(str(-1))

    def clean_data(string):
        # 对数据清洗
        string = re.sub(r"[^0-9()]", "", string)
        return string.strip().lower()

    # ==============Age ===================
    # 处理Age
    # 缺失值填充
    data['age'] = data['age'].fillna(0)
    a = data['age'].copy()
    a = a.apply(lambda x: str(x).lower())
    # 统一字符类型转化成str()
    a = a.apply(lambda x: clean_data(x))
    data['age'] = a

    # ==============Gender ===================
    data['gender'] = data['gender'].fillna(str(2))
    g = data['gender'].copy()
    # 统一字符类型转化成str()
    g = g.apply(lambda x: str(x).lower())
    g = g.apply(lambda x: clean_data(x))
    data['gender'] = g

    # ==============appid_num ===================
    appid_num = data['appid']
    def get_appid_num(string):
        # 对数据清洗
        string = string.split(',')
        return len(string)

    appid_num = appid_num.apply(lambda x: get_appid_num(x))
    data['appid_num'] = appid_num
    dense_features = dense_features + ['appid_num']


    # ==============appid split ===================
    # 删除掉一些字符
    a = lambda s: re.sub('[^A-Za-z0-9 ]+', ' ', s)
    data_str = map(a, data[sparse_features])
    data[sparse_features] = list(data_str)

    df_sparse_features = data[sparse_features].str.split(' ', expand=True)
    data_df = data[dense_features]

    for i in range(1, 30):
        data_df['sparse_' + str(i)] = df_sparse_features.iloc[:, i].str.strip()

    # ==============将sparse_features转化成数字 ===================
    data_df.replace(to_replace='None', value=np.nan).dropna(axis=1, how='all', inplace=True)
    sparse_ = [val for val in data_df.columns if 'sparse_' in val]
    data_df[sparse_] = data_df[sparse_].fillna('-1')

    # 将sparse_features转化成数字，用fit_transform()函数相同的字符转化成相同的数字。
    # labelencoder 转化
    encoder = ['province', 'city', 'model']
    for feat in sparse_ + encoder:
        le = LabelEncoder()
        data_df[feat] = le.fit_transform(data_df[feat])


    # ==============Feature Engineering===================
    # ====================================================
    dense_features = [feat for feat in data_df.columns if feat not in sparse_ + ['label']]


    # 统计dense_features、sparse_features每个特征的个数和
    feature_columns = [[denseFeature(feat) for feat in dense_features]] + \
                      [[sparseFeature(feat, len(data_df[feat].unique()), embed_dim=embed_dim)
                        for feat in sparse_]]

    # 划分训练集和测试集
    """
    ### 本次案例中：将所有的样本作为训练集。
    ### 使用全部的样本作为训练集，通过交叉验证的方法划分为：测试集+验证集
    """
    data_df_1 = data_df[data_df.label != '-1']  # data.click != -1的样本为训练的样本集合
    train, test = train_test_split(data_df_1, test_size=test_size)

    train_X = [train[dense_features].values.astype('int32'), train[sparse_].values.astype('int32')]
    train_y = train['label'].values.astype('int32')
    test_X = [test[dense_features].values.astype('int32'), test[sparse_].values.astype('int32')]
    test_y = test['label'].values.astype('int32')


    # 划分需要预测的样本集
    data_df_2 = data_df[data_df.label == '-1']  # data.click == -1的样本为需要预测的样本集合
    total_test = [data_df_2[dense_features].values.astype('int32'), data_df_2[sparse_].values.astype('int32')]

    return feature_columns, (train_X, train_y), (test_X, test_y), total_test

In [17]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

read_part = True
sample_num = 6000000
test_size = 0.2

embed_dim = 8
dnn_dropout = 0.5
hidden_units = [256, 128, 64]

learning_rate = 0.001
batch_size = 256
epochs = 10

# you can modify your file path
file_train = '../data/train.csv'
file_test = '../data/apply_new.csv'

dense_features = ['label', 'gender', 'age', 'city', 'province', 'model']
sparse_features = "appid"

# ========================== Create dataset =======================
feature_columns, train, test, vail = create_criteo_dataset(file_train=file_train,
                                                           file_test=file_test,
                                                           dense_features=dense_features,
                                                           sparse_features=sparse_features,
                                                           embed_dim=embed_dim,
                                                           read_part=read_part,
                                                           sample_num=sample_num,
                                                           test_size=test_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [21]:
vail[1].shape

(100000, 29)

In [11]:

class CrossNetwork(Layer):
    """
    Cross Network
    """

    def __init__(self, layer_num, reg_w=1e-4, reg_b=1e-4):
        """
        :param layer_num: A scalar. The depth of cross network
        :param reg_w: A scalar. The regularizer of w
        :param reg_b: A scalar. The regularizer of b
        """
        super(CrossNetwork, self).__init__()
        self.layer_num = layer_num  # 定义的交叉层数，每一层的w，b参数都是不同的。
        self.reg_w = reg_w
        self.reg_b = reg_b

    def build(self, input_shape):
        dim = int(input_shape[-1])
        self.cross_weights = [
            self.add_weight(name='w_' + str(i),
                            shape=(dim, 1),
                            initializer='random_uniform',
                            regularizer=l2(self.reg_w),
                            trainable=True
                            )
            for i in range(self.layer_num)]
        self.cross_bias = [
            self.add_weight(name='b_' + str(i),
                            shape=(dim, 1),
                            initializer='random_uniform',
                            regularizer=l2(self.reg_b),
                            trainable=True
                            )
            for i in range(self.layer_num)]

    def call(self, inputs, **kwargs):
        # tf.expand_dims(a,axis=b)    在b轴上插入一维向量
        # tf.tensordot(a,b,axis=[a,b])    将a和b元素在a_轴和b_轴指定的索引上的乘积求和。
        # tf.squeeze(a,axis)    压缩axis的张量
        # input:shape=(none,221)
        x_0 = tf.expand_dims(inputs, axis=2)  # (None, dim, 1)
        x_l = x_0  # (None, dim, 1)
        for i in range(self.layer_num):
            # x_l1 = tf.tensordot(x_l, self.cross_weights[i], axes=[1, 0])  # (None, 1, 1)  # 1:行，0:列
            x_l1 = tf.matmul(tf.transpose(x_l, perm=[0,2,1]), self.cross_weights[i])  # (None, 1, 1)  # 1:行，0:列
            x_l = tf.matmul(x_0, x_l1) + self.cross_bias[i] + x_l  # (None, dim, 1)
        x_l = tf.squeeze(x_l, axis=2)  # (None, dim)
        return x_l


class DNN(Layer):
    """
	Deep Neural Network
	"""

    def __init__(self, hidden_units, activation='relu', dropout=0.):
        """
		:param hidden_units: A list. Neural network hidden units.
		:param activation: A string. Activation function of dnn.
		:param dropout: A scalar. Dropout number.
		"""
        super(DNN, self).__init__()
        self.dnn_network = [Dense(units=unit, activation=activation) for unit in hidden_units]
        self.dropout = Dropout(dropout)

    def call(self, inputs, **kwargs):
        x = inputs
        for dnn in self.dnn_network:
            x = dnn(x)
        x = self.dropout(x)
        return x


class DCN(keras.Model):
    def __init__(self, feature_columns, hidden_units, activation='relu',
                 dnn_dropout=0., embed_reg=1e-4, cross_w_reg=1e-4, cross_b_reg=1e-4):
        """
        Deep&Cross Network
        :param feature_columns: A list. dense_feature_columns + sparse_feature_columns
        :param hidden_units: A list. Neural network hidden units.
        :param activation: A string. Activation function of dnn.
        :param dnn_dropout: A scalar. Dropout of dnn.
        :param embed_reg: A scalar. The regularizer of embedding.
        :param cross_w_reg: A scalar. The regularizer of cross network.
        :param cross_b_reg: A scalar. The regularizer of cross network.
        """

        # Embedding and Stacking Layer
        # 因此需要Embedding操作将高维稀疏特征转化为低维密集型特征
        super(DCN, self).__init__()
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        self.layer_num = len(hidden_units)
        self.embed_layers = {
            'embed_' + str(i): Embedding(input_dim=feat['feat_num'],
                                         input_length=1,
                                         output_dim=feat['embed_dim'],
                                         embeddings_initializer='random_uniform',
                                         embeddings_regularizer=l2(embed_reg))
            for i, feat in enumerate(self.sparse_feature_columns)
        }
        self.cross_network = CrossNetwork(self.layer_num, cross_w_reg, cross_b_reg)
        self.dnn_network = DNN(hidden_units, activation, dnn_dropout)
        self.dense_final = Dense(1)

    def call(self, inputs):
        dense_inputs, sparse_inputs = inputs
        sparse_embed = tf.concat([self.embed_layers['embed_{}'.format(i)](sparse_inputs[:, i])
                                  for i in range(sparse_inputs.shape[1])], axis=-1)
        dense_inputs = tf.cast(dense_inputs, dtype=tf.float32)  # dense_inputs 类型转化成float
        x = tf.concat([sparse_embed, dense_inputs], axis=-1)
        # Cross Network
        cross_x = self.cross_network(x)
        # DNN
        dnn_x = self.dnn_network(x)
        # Concatenate
        total_x = tf.concat([cross_x, dnn_x], axis=-1)
        outputs = tf.nn.sigmoid(self.dense_final(total_x))
        return outputs

    def summary(self):
        dense_inputs = Input(shape=(len(self.dense_feature_columns),), dtype=tf.int32)
        sparse_inputs = Input(shape=(len(self.sparse_feature_columns),), dtype=tf.int32)
        keras.Model(inputs=[dense_inputs, sparse_inputs],
                    outputs=self.call([dense_inputs, sparse_inputs])).summary()

In [12]:

# ========================== Create dataset =======================
train_X, train_y = train
test_X, test_y = test

# ============================Build Model==========================
model = WideDeep(feature_columns, hidden_units=hidden_units, dnn_dropout=dnn_dropout)
model.summary()
# ============================model checkpoint======================
# check_path = '../save/wide_deep_weights.epoch_{epoch:04d}.val_loss_{val_loss:.4f}.ckpt'
# checkpoint = tf.keras.callbacks.ModelCheckpoint(check_path, save_weights_only=True,
#                                                 verbose=1, period=5)
# ============================Compile============================
model.compile(loss=binary_crossentropy, 
              optimizer=Adam(learning_rate=learning_rate),
              metrics=[AUC()])
# ==============================Fit==============================
history = model.fit(
    train_X,
    train_y,
    epochs=epochs,
    callbacks=[EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)],  # checkpoint
    batch_size=batch_size,
    validation_split=0.1
)
# ===========================Test==============================
print('test AUC: %f' % model.evaluate(test_X, test_y)[1])

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 29)]         0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None,)]            0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None,)]            0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_2 (Te [(None,)]            0           input_2[0][0]                    
______________________________________________________________________________________________

Train on 216000 samples, validate on 24000 samples
Epoch 1/30


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


test AUC: 0.704404


In [22]:
prediction = model.predict(vail)


In [40]:
prediction[:, 0][:10]

array([0.53916144, 0.49008954, 0.14620918, 0.26469025, 0.25581652,
       0.45006603, 0.5511661 , 0.23452327, 0.820209  , 0.53196937],
      dtype=float32)

In [46]:
a = np.round(prediction[:, 0])

In [41]:
# 读入文件并写入预测值
label_submission = pd.read_csv('../data/submit_sample.csv')
label_submission.head()

Unnamed: 0,user_id,category_id
0,1400001,0
1,1400002,1
2,1400003,0
3,1400004,0
4,1400005,0


In [44]:
len(label_submission)

100000

In [47]:
len(a)

100000

In [52]:
label_submission['category_id']=a.astype(int)

In [53]:
label_submission

Unnamed: 0,user_id,category_id
0,1400001,1
1,1400002,0
2,1400003,0
3,1400004,0
4,1400005,0
...,...,...
99995,1499996,0
99996,1499997,0
99997,1499998,1
99998,1499999,0


In [54]:
label_submission.to_csv("submission_Wide&deep.csv",index=False)