### PNN_v2 模型


In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import re

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, Concatenate, Dropout, Input, Layer
from tensorflow.keras.regularizers import l2

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

import os

In [18]:
def sparseFeature(feat, feat_num, embed_dim=4):
    """
    Describe one sparse (categorical) feature as a plain dictionary.

    :param feat: name of the feature
    :param feat_num: number of distinct values the feature takes
    :param embed_dim: embedding dimension to use for this feature
    :return: dict with the keys 'feat', 'feat_num' and 'embed_dim'
    """
    return dict(feat=feat, feat_num=feat_num, embed_dim=embed_dim)

def denseFeature(feat):
    """
    Describe one dense feature as a plain dictionary.

    :param feat: name of the dense feature
    :return: dict holding the name under the key 'feat'
    """
    return dict(feat=feat)


def create_criteo_dataset(file_train, file_test, dense_features, sparse_features,
                          embed_dim=8, read_part=False, sample_num=100000, test_size=0.2,
                          random_state=None):
    """
    Load the two CSV files, engineer features and build the model inputs.

    The two files are concatenated so that cleaning and label encoding are
    applied to all rows at once. Rows whose 'label' is missing after the
    concatenation (expected to be the rows of ``file_test``) are tagged with
    the string '-1' and returned separately as the rows to predict.

    :param file_train: path of the labelled CSV file
    :param file_test: path of the CSV file to predict on
    :param dense_features: list of column names to keep; must contain 'label',
        'age' and the columns that are label encoded here ('city', 'province',
        'model', 'gender'). 'appid_num' is derived and appended automatically.
    :param sparse_features: accepted for interface compatibility; not used
    :param embed_dim: embedding dimension recorded for every sparse feature
    :param read_part: accepted for interface compatibility; not used
    :param sample_num: accepted for interface compatibility; not used
    :param test_size: share of the labelled rows held out as the test split
    :param random_state: seed forwarded to ``train_test_split``; the default
        ``None`` keeps the original (non-deterministic) split
    :return: ``feature_columns, (train_X, train_y), (test_X, test_y), total_test``
        where every ``*_X`` / ``total_test`` is ``[dense_array, sparse_array]``
        with int32 values
    """
    # Load both files and stack them (train rows first, then the rows to predict).
    df_train = pd.read_csv(file_train)
    df_apply_new = pd.read_csv(file_test)
    data = pd.concat([df_train, df_apply_new], axis=0, ignore_index=True)
    # Rows without a label get the string sentinel '-1'.
    data['label'] = data['label'].fillna(str(-1))

    def clean_data(string):
        """Drop every character that is not a digit or a parenthesis."""
        string = re.sub(r"[^0-9()]", "", string)
        return string.strip().lower()

    def normalise(value):
        """Stringify, lower-case and clean a single raw cell value."""
        return clean_data(str(value).lower())

    # ============== age ===================
    # Missing ages become 0, then every value is reduced to its digits.
    # NOTE(review): the regex also strips '.', so a float such as 25.0 would
    # become '250' -- confirm the raw 'age' values are integer-like.
    data['age'] = data['age'].fillna(0).apply(normalise)

    # ============== gender ===================
    # Missing genders become the extra category '2'.
    data['gender'] = data['gender'].fillna(str(2)).apply(normalise)

    # ============== appid_num ===================
    def get_appid_num(string):
        """Count the comma-separated entries of an 'appid' value."""
        return len(string.split(','))

    data['appid_num'] = data['appid'].apply(get_appid_num)
    dense_features = dense_features + ['appid_num']
    # Work on an explicit copy: assigning columns to a slice of `data`
    # triggers pandas' SettingWithCopyWarning.
    data_df = data[dense_features].copy()

    # ============== Feature Engineering ===================
    # Integer-encode the categorical columns (in place in data_df, so the
    # dense view of these columns carries the encoded values as well).
    sparse_ = ['city', 'province', 'model', 'appid_num', 'gender']
    for feat in sparse_:
        le = LabelEncoder()
        data_df[feat] = le.fit_transform(data_df[feat])

    # Everything except the label is fed to the model as a dense input.
    dense_features = [feat for feat in data_df.columns if feat != 'label']

    # Feature descriptions: [dense descriptors, sparse descriptors with cardinality].
    feature_columns = [[denseFeature(feat) for feat in dense_features]] + \
                      [[sparseFeature(feat, len(data_df[feat].unique()), embed_dim=embed_dim)
                        for feat in sparse_]]

    def build_inputs(frame):
        """Return [dense_array, sparse_array] (both int32) for a frame."""
        return [frame[dense_features].values.astype('int32'),
                frame[sparse_].values.astype('int32')]

    # Split the labelled rows (label != '-1') into train and test parts.
    # Original author's note: in this case all labelled samples serve as the
    # training pool, which is then split into held-out parts.
    data_df_1 = data_df[data_df.label != '-1']
    train, test = train_test_split(data_df_1, test_size=test_size, random_state=random_state)

    train_X = build_inputs(train)
    train_y = train['label'].values.astype('int32')
    test_X = build_inputs(test)
    test_y = test['label'].values.astype('int32')

    # Rows tagged with the '-1' sentinel are the ones that need a prediction.
    data_df_2 = data_df[data_df.label == '-1']
    total_test = build_inputs(data_df_2)

    return feature_columns, (train_X, train_y), (test_X, test_y), total_test

In [19]:
# Lower the verbosity of TensorFlow's native logging.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Input locations -- adjust to your own directory layout.
file_train = '../data/train.csv'
file_test = '../data/apply_new.csv'

# Columns handed to the dataset builder.
dense_features = ['label', 'gender', 'age', 'city', 'province', 'model']
sparse_features = "appid"

# Dataset hyper-parameters.
embed_dim = 8
read_part = True
sample_num = 6000000
test_size = 0.2

# ========================== Create dataset =======================
# Bundle the arguments once, then unpack them into the builder call.
dataset_kwargs = dict(
    file_train=file_train,
    file_test=file_test,
    dense_features=dense_features,
    sparse_features=sparse_features,
    embed_dim=embed_dim,
    read_part=read_part,
    sample_num=sample_num,
    test_size=test_size,
)
feature_columns, train, test, vail = create_criteo_dataset(**dataset_kwargs)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
feature_columns

[[{'feat': 'gender'},
  {'feat': 'age'},
  {'feat': 'city'},
  {'feat': 'province'},
  {'feat': 'model'},
  {'feat': 'appid_num'}],
 [{'feat': 'city', 'feat_num': 316, 'embed_dim': 8},
  {'feat': 'province', 'feat_num': 34, 'embed_dim': 8},
  {'feat': 'model', 'feat_num': 129, 'embed_dim': 8},
  {'feat': 'appid_num', 'feat_num': 392, 'embed_dim': 8},
  {'feat': 'gender', 'feat_num': 3, 'embed_dim': 8}]]

In [21]:

class DNN(Layer):
    """
    Stack of fully connected layers followed by one dropout layer.
    """

    def __init__(self, hidden_units, activation='relu', dropout=0.):
        """
        :param hidden_units: A list. Number of units of each Dense layer, in order.
        :param activation: A string. Activation applied by every Dense layer.
        :param dropout: A scalar. Rate of the dropout applied after the last Dense layer.
        """
        super().__init__()
        # Build the Dense layers in order, then attach the finished list.
        dense_stack = []
        for n_units in hidden_units:
            dense_stack.append(Dense(units=n_units, activation=activation))
        self.dnn_network = dense_stack
        self.dropout = Dropout(dropout)

    def call(self, inputs, **kwargs):
        """Feed ``inputs`` through every Dense layer, then through dropout."""
        hidden = inputs
        for dense_layer in self.dnn_network:
            hidden = dense_layer(hidden)
        return self.dropout(hidden)


class PNN(keras.Model):
    def __init__(self, feature_columns, hidden_units, mode='in', dnn_dropout=0.,
                 activation='relu', embed_reg=1e-4, w_z_reg=1e-4, w_p_reg=1e-4, l_b_reg=1e-4):
        """
        Product-based Neural Networks
        :param feature_columns: A list. dense_feature_columns + sparse_feature_columns
        :param hidden_units: A list. Neural network hidden units.
        :param mode: A string. 'in' IPNN or 'out'OPNN.
        :param activation: A string. Activation function of dnn.
        :param dnn_dropout: A scalar. Dropout of dnn.
        :param embed_reg: A scalar. The regularizer of embedding.
        :param w_z_reg: A scalar. The regularizer of w_z_ in product layer
        :param w_p_reg: A scalar. The regularizer of w_p in product layer
        :param l_b_reg: A scalar. The regularizer of l_b in product layer
        """
        super(PNN, self).__init__()
        # inner product or outer product
        self.mode = mode
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        # the number of feature fields
        self.field_num = len(self.sparse_feature_columns)
        self.embed_dim = self.sparse_feature_columns[0]['embed_dim']
        # The embedding dimension of each feature field must be the same
        self.embed_layers = {
            'embed_' + str(i): Embedding(input_dim=feat['feat_num'],
                                         input_length=1,
                                         output_dim=feat['embed_dim'],
                                         embeddings_initializer='random_uniform',
                                         embeddings_regularizer=l2(embed_reg))
            for i, feat in enumerate(self.sparse_feature_columns)
        }
        # parameters
        self.w_z = self.add_weight(name='w_z',
                                   shape=(self.field_num, self.embed_dim, hidden_units[0]),
                                   initializer='random_uniform',
                                   regularizer=l2(w_z_reg),
                                   trainable=True
                                   )
        if mode == 'in':
            self.w_p = self.add_weight(name='w_p',
                                       shape=(self.field_num, self.field_num, hidden_units[0]),
                                       initializer='random_uniform',
                                       reguarizer=l2(w_p_reg),
                                       trainable=True)
        # out
        else:
            self.w_p = self.add_weight(name='w_p',
                                       shape=(self.embed_dim, self.embed_dim, hidden_units[0]),
                                       initializer='random_uniform',
                                       regularizer=l2(w_p_reg),
                                       trainable=True)
        self.l_b = self.add_weight(name='l_b', shape=(hidden_units[0], ),
                                   initializer='random_uniform',
                                   regularizer=l2(l_b_reg),
                                   trainable=True)
        # dnn
        self.dnn_network = DNN(hidden_units[1:], activation, dnn_dropout)
        self.dense_final = Dense(1)

    def call(self, inputs):
        dense_inputs, sparse_inputs = inputs
        dense_inputs = tf.cast(dense_inputs, dtype=tf.float32)  # dense_inputs 类型转化成float
        embed = [self.embed_layers['embed_{}'.format(i)](sparse_inputs[:, i])
                 for i in range(sparse_inputs.shape[1])]
        embed = tf.transpose(tf.convert_to_tensor(embed), [1, 0, 2])  # (None, field_num, embed_dim)
        z = embed  # (None, field, embed_dim)
        # product layer
        if self.mode == 'in':
            p = tf.matmul(embed, tf.transpose(embed, [0, 2, 1]))  # (None, field_num, field_num)
        else:  # out
            f_sum = tf.reduce_sum(embed, axis=1, keepdims=True)  # (None, 1 embed_num)
            p = tf.matmul(tf.transpose(f_sum, [0, 2, 1]), f_sum)  # (None, embed_num, embed_num)

        l_z = tf.tensordot(z, self.w_z, axes=2)  # (None, h_unit)
        l_p = tf.tensordot(p, self.w_p, axes=2)  # (None, h_unit)
        l_1 = tf.nn.relu(tf.concat([l_z + l_p + self.l_b, dense_inputs], axis=-1))
        # dnn layer
        dnn_x = self.dnn_network(l_1)
        outputs = tf.nn.sigmoid(self.dense_final(dnn_x))
        return outputs

    def summary(self):
        dense_inputs = Input(shape=(len(self.dense_feature_columns),), dtype=tf.float32)
        sparse_inputs = Input(shape=(len(self.sparse_feature_columns),), dtype=tf.int32)
        keras.Model(inputs=[dense_inputs, sparse_inputs],
                    outputs=self.call([dense_inputs, sparse_inputs])).summary()


In [22]:
read_part = True
sample_num = 6000000
test_size = 0.2

embed_dim = 8
mode = 'in'
dnn_dropout = 0.5
hidden_units = [256, 128, 64]

learning_rate = 0.001
batch_size = 256
epochs = 30

# ========================== Create dataset =======================
train_X, train_y = train
test_X, test_y = test

# ============================Build Model==========================
model = PNN(feature_columns, hidden_units, dnn_dropout)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 5)]          0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice_4 (Te [(None,)]            0           input_4[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_5 (Te [(None,)]            0           input_4[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_6 (Te [(None,)]            0           input_4[0][0]                    
____________________________________________________________________________________________

In [23]:
# ============================Compile============================
auc_metric = AUC()
model.compile(optimizer=Adam(learning_rate=learning_rate),
              loss=binary_crossentropy,
              metrics=[auc_metric])

# ==============================Fit==============================
# Stop as soon as the validation loss fails to improve for one epoch and
# roll back to the best weights seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history = model.fit(train_X, train_y,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.1,
                    callbacks=[early_stopping])

# ===========================Test==============================
# evaluate() returns the loss followed by the compiled metrics; index 1 is the AUC.
eval_results = model.evaluate(test_X, test_y)
print('test AUC: %f' % eval_results[1])

Train on 216000 samples, validate on 24000 samples
Epoch 1/30


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/30
Epoch 3/30


test AUC: 0.652658


In [24]:
# Score the unlabelled rows; `vail` is the fourth value returned by create_criteo_dataset.
prediction = model.predict(vail)

In [25]:
# Peek at the first ten predicted scores (first output column).
prediction[:, 0][:10]

array([0.48763663, 0.4001873 , 0.4553542 , 0.36895916, 0.5099634 ,
       0.37988955, 0.45259756, 0.33756942, 0.48164344, 0.56130886],
      dtype=float32)

In [10]:
# NOTE(review): duplicate of the preview cell above. Its execution count (In [10]) is
# out of sequence, so the recorded output appears to come from an earlier run.
prediction[:, 0][:10]

array([0.5213383 , 0.3881776 , 0.46123424, 0.64364207, 0.5414387 ,
       0.47406936, 0.5654541 , 0.5656936 , 0.701681  , 0.538635  ],
      dtype=float32)

In [26]:
# Round each score to the nearest integer to turn it into a hard label
# (presumably 0 or 1, since the model output goes through a sigmoid).
a = np.round(prediction[:, 0])

In [27]:
# Load the submission template; the predicted labels are written into it below.
label_submission = pd.read_csv('../data/submit_sample.csv')
label_submission.head()

Unnamed: 0,user_id,category_id
0,1400001,0
1,1400002,1
2,1400003,0
3,1400004,0
4,1400005,0


In [28]:
# Number of rows in the submission template.
len(label_submission)

100000

In [29]:
# Number of predicted labels; it has to match the template's row count.
len(a)

100000

In [30]:
# Overwrite the template's category_id column with the predicted integer labels.
label_submission['category_id']=a.astype(int)

In [31]:
# Display the filled-in submission frame.
label_submission

Unnamed: 0,user_id,category_id
0,1400001,0
1,1400002,0
2,1400003,0
3,1400004,0
4,1400005,1
...,...,...
99995,1499996,1
99996,1499997,0
99997,1499998,1
99998,1499999,0


In [33]:
# Write the submission file without the DataFrame index.
# NOTE(review): the file name says "v3" while the notebook title says "PNN_v2" -- confirm which is intended.
label_submission.to_csv("../submission/submission_PNN_v3.csv",index=False)