In [1]:
import numpy as np
import pandas as pd
# from sklearn.preprocessing import OneHotEncoder,StandarScaler
from sklearn.metrics import accuracy_score
import random
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

import tensorflow as tf

from collections import Counter

import math

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [23]:
class PNN(tf.keras.Model):
    """Product-based Neural Network (PNN) for binary click prediction.

    The model embeds every (feature index, feature value) pair, builds a
    linear signal ``lz`` and a quadratic (product) signal ``lp`` of width
    ``product_layer_dim`` each, concatenates them and feeds the result through
    a stack of Dense -> BatchNorm -> ReLU -> Dropout blocks followed by a
    single sigmoid unit.

    Args:
        num_feat: size of the embedding vocabulary (number of distinct
            feature indices, including any padding index).
        num_field: number of fields per sample (N); fixes the shape of the
            linear weights and of ``theta``.
        dropout_deep: sequence of dropout rates. Entry 0 is applied to the
            concatenated product-layer output. If the sequence has more
            entries than ``deep_layer_sizes``, entry ``i + 1`` is used for
            hidden layer ``i``; otherwise entry ``i`` is used (the historical
            behaviour, kept so shorter lists keep working).
        deep_layer_sizes: units of each hidden Dense layer.
        product_layer_dim: width D1 of the linear and quadratic signals.
        reg_l1, reg_l2: stored on the instance only; no regulariser is
            attached to any weight in this class.
        embedding_size: embedding dimension M.
        product_type: ``'inner'`` for the inner-product variant; any other
            value selects the outer-product variant.

    The forward pass returns a ``(batch, 1)`` tensor of probabilities, which
    is what ``BinaryCrossentropy()`` (``from_logits=False``) and
    ``BinaryAccuracy`` with its default 0.5 threshold expect.
    """

    def __init__(self, num_feat, num_field, dropout_deep, deep_layer_sizes,
                 product_layer_dim=10, reg_l1=0.01, reg_l2=1e-5,
                 embedding_size=10, product_type='outer'):
        super().__init__()
        self.reg_l1 = reg_l1
        self.reg_l2 = reg_l2
        self.num_feat = num_feat  # F: number of distinct feature indices
        self.num_field = num_field  # N: number of fields per sample
        self.product_layer_dim = product_layer_dim  # D1: product layer width
        self.dropout_deep = dropout_deep

        # Embedding table of shape F * M (F feature indices, M dimensions).
        self.feat_embeddings = tf.keras.layers.Embedding(
            num_feat, embedding_size, embeddings_initializer='uniform')

        # Shared random initialiser for the hand-made product-layer weights.
        initializer = tf.initializers.GlorotUniform()

        # Linear part: one N * M weight map per product unit -> D1 * N * M.
        self.linear_weights = tf.Variable(
            initializer(shape=(product_layer_dim, num_field, embedding_size)))

        # Quadratic part.
        self.product_type = product_type
        if product_type == 'inner':
            # One scalar per (product unit, field) -> D1 * N.
            self.theta = tf.Variable(
                initializer(shape=(product_layer_dim, num_field)))
        else:
            # One M * M weight map per product unit -> D1 * M * M.
            self.quadratic_weights = tf.Variable(
                initializer(shape=(product_layer_dim, embedding_size, embedding_size)))

        # Dropout on the concatenated [lz, lp] vector. Created once here
        # instead of instantiating a new layer on every forward pass.
        self.dropout_input = tf.keras.layers.Dropout(dropout_deep[0])

        # Fully connected tower. Entry 0 of dropout_deep belongs to the input
        # dropout above, so hidden layers start at entry 1 when the caller
        # supplied one extra rate; shorter lists fall back to entry i.
        self.deep_layer_sizes = deep_layer_sizes
        rate_offset = 1 if len(dropout_deep) > len(deep_layer_sizes) else 0
        for i, units in enumerate(deep_layer_sizes):
            setattr(self, 'dense_' + str(i), tf.keras.layers.Dense(units))
            setattr(self, 'batchNorm_' + str(i), tf.keras.layers.BatchNormalization())
            setattr(self, 'activation_' + str(i), tf.keras.layers.Activation('relu'))
            setattr(self, 'dropout_' + str(i),
                    tf.keras.layers.Dropout(dropout_deep[i + rate_offset]))

        # Output unit. Sigmoid so the model emits probabilities rather than
        # raw logits, matching a probability-based loss and accuracy metric.
        self.fc = tf.keras.layers.Dense(1, activation='sigmoid', use_bias=True)

    def call(self, feat_index, feat_value, training=None):
        """Run the forward pass.

        Args:
            feat_index: integer tensor of feature indices, Batch * N.
            feat_value: float tensor of feature values, Batch * N.
            training: forwarded to the BatchNormalization and Dropout layers;
                ``None`` lets Keras resolve the mode from the call context.

        Returns:
            Tensor of shape Batch * 1 with predicted probabilities.
        """
        # Look up embeddings and scale each one by its feature value.
        feat_embedding_0 = self.feat_embeddings(feat_index)  # Batch * N * M
        feat_embedding = tf.einsum('bnm,bn->bnm', feat_embedding_0, feat_value)

        # Linear signal.
        lz = tf.einsum('bnm,dnm->bd', feat_embedding, self.linear_weights)  # Batch * D1

        # Quadratic signal.
        if self.product_type == 'inner':
            # Weighted sum over fields first, then the squared norm of that
            # sum: lp_d = || sum_n theta[d, n] * f_n ||^2. Squaring before
            # summing would drop every cross-field interaction term.
            delta = tf.einsum('bnm,dn->bdm', feat_embedding, self.theta)  # Batch * D1 * M
            lp = tf.einsum('bdm,bdm->bd', delta, delta)  # Batch * D1
        else:
            # Sum of all pairwise outer products equals the outer product of
            # the field-summed embedding with itself.
            embed_sum = tf.reduce_sum(feat_embedding, axis=1)  # Batch * M
            p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum)  # Batch * M * M
            lp = tf.einsum('bmn,dmn->bd', p, self.quadratic_weights)  # Batch * D1

        y_deep = tf.concat((lz, lp), axis=1)
        y_deep = self.dropout_input(y_deep, training=training)

        for i in range(len(self.deep_layer_sizes)):
            y_deep = getattr(self, 'dense_' + str(i))(y_deep)
            y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep, training=training)
            y_deep = getattr(self, 'activation_' + str(i))(y_deep)
            y_deep = getattr(self, 'dropout_' + str(i))(y_deep, training=training)

        output = self.fc(y_deep)

        return output

In [7]:
# Load the tiny Criteo sample and replace every missing entry with 0.
train = pd.read_csv(
    r'F:\baidudownload\kaggle-2014-criteo-master\kaggle-2014-criteo-master\train.tiny.csv'
).fillna(0)

# Export a tab-separated, header-less copy without the 'Id' column; the
# line-based preprocessing cells read this file back.
traindrop = train.drop(columns=['Id'])
traindrop.to_csv(
    r'F:\baidudownload\kaggle-2014-criteo-master\kaggle-2014-criteo-master\train.txt',
    sep='\t',
    index=False,
    header=None,
)

In [11]:
# Minimum number of occurrences for a categorical value to be kept.
freq_ = 10
# Column positions inside each tab-separated line (column 0 is the label).
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)

# Count how often every non-empty categorical value occurs in the file.
feat_cnt = Counter()
with open(r'F:\baidudownload\kaggle-2014-criteo-master\kaggle-2014-criteo-master\train.txt', 'r') as fin:
    for line in fin:
        features = line.rstrip('\n').split('\t')
        feat_cnt.update(
            features[idx] for idx in categorical_range_ if features[idx] != ''
        )

In [13]:
# Keep only the categorical values seen at least `freq_` times.
dis_feat_set = {feat for feat, ot in feat_cnt.items() if ot >= freq_}

In [14]:
# Build the feature -> embedding index dictionary.
# Continuous features: each column position gets its own index, starting at 1.
feat_dict = {col: slot for slot, col in enumerate(continuous_range_, start=1)}
tc = len(feat_dict) + 1  # next free index

# Categorical features: every retained (high-frequency) value gets the next
# free index the first time it is encountered in the file.
cnt_feat_set = set()
with open(r'F:\baidudownload\kaggle-2014-criteo-master\kaggle-2014-criteo-master\train.txt', 'r') as fin:
    for line in fin:
        features = line.rstrip('\n').split('\t')
        for idx in categorical_range_:
            token = features[idx]
            # Skip empty strings, low-frequency values and values already indexed.
            if token != '' and token in dis_feat_set and token not in cnt_feat_set:
                cnt_feat_set.add(token)
                feat_dict[token] = tc
                tc += 1

In [16]:
# Directory holding the Criteo sample files. The value ends with a path
# separator so a file name can be appended by plain string concatenation.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
file_path = "F:\\baidudownload\\kaggle-2014-criteo-master\\kaggle-2014-criteo-master\\"

In [18]:
# Column names of the 13 continuous features: I1 .. I13.
cont_features = ['I{}'.format(n) for n in range(1, 14)]
# Column names of the 26 categorical features: C1 .. C26.
dist_features = ['C{}'.format(n) for n in range(1, 27)]

In [21]:
# Accumulators for the train / test partitions (label, values, indices).
train_label, train_value, train_idx = [], [], []
test_label, test_value, test_idx = [], [], []

# Column positions inside each tab-separated line (column 0 is the label).
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)

# Per-column maximum, minimum and range of the continuous features, in the
# order of `cont_features`; used for min-max normalisation.
cont_max_ = [max(train[cf]) for cf in cont_features]
cont_min_ = [min(train[cf]) for cf in cont_features]
cont_diff_ = [hi - lo for hi, lo in zip(cont_max_, cont_min_)]

def process_line_(line):
    """Convert one tab-separated sample line into model inputs.

    Reads the module-level ``continuous_range_``, ``categorical_range_``,
    ``feat_dict``, ``cont_min_`` and ``cont_diff_``.

    Args:
        line: one text line; column 0 is the label, the columns in
            ``continuous_range_`` are numeric and the columns in
            ``categorical_range_`` are categorical tokens.

    Returns:
        A tuple ``(feat_idx, feat_value, label)``:
        ``feat_idx`` is the list of embedding indices (0 for an empty or
        unknown entry), ``feat_value`` the matching list of values (min-max
        scaled number rounded to 6 decimals for continuous columns, 1.0 for
        known categorical tokens, 0.0 otherwise) and ``label`` a one-element
        list holding the integer label.
    """
    features = line.rstrip('\n').split('\t')
    feat_idx, feat_value = [], []

    # Continuous columns: min-max normalisation.
    for idx in continuous_range_:
        raw = features[idx]
        if raw == '':
            feat_idx.append(0)
            feat_value.append(0.0)
            continue

        feat_idx.append(feat_dict[idx])
        span = cont_diff_[idx - 1]
        if span == 0:
            # Constant column (max == min): dividing would raise for plain
            # floats or silently yield NaN/inf for numpy scalars, so the
            # feature carries no information and is emitted as 0.0.
            feat_value.append(0.0)
        else:
            feat_value.append(round((float(raw) - cont_min_[idx - 1]) / span, 6))

    # Categorical columns: known tokens become an index with value 1.0.
    for idx in categorical_range_:
        token = features[idx]
        if token == '' or token not in feat_dict:
            feat_idx.append(0)
            feat_value.append(0.0)
        else:
            feat_idx.append(feat_dict[token])
            feat_value.append(1.0)

    return feat_idx, feat_value, [int(features[0])]
# Fraction of samples routed to the training partition.
split_ratio = 0.9
# A dedicated, seeded generator makes the train/test partition identical on
# every run without touching numpy's global random state.
split_seed_ = 42
split_rng_ = np.random.RandomState(split_seed_)

with open(file_path + 'train.txt', 'r') as fin:
    for line in fin:
        feat_idx, feat_value, label = process_line_(line)
        # Draw once per sample: values up to split_ratio go to train.
        if split_rng_.random_sample() <= split_ratio:
            train_label.append(label)
            train_idx.append(feat_idx)
            train_value.append(feat_value)
        else:
            test_label.append(label)
            test_idx.append(feat_idx)
            test_value.append(feat_value)

In [24]:
# Hyper-parameters of the PNN instance, one per line for easy tuning.
pnn_params = dict(
    num_feat=len(feat_dict) + 1,  # +1 for index 0 (empty / unknown feature)
    num_field=39,
    dropout_deep=[0.5, 0.5, 0.5],
    deep_layer_sizes=[400, 400],
    product_layer_dim=10,
    reg_l1=0.01,
    reg_l2=1e-5,
    embedding_size=10,
    product_type='outer',
)
pnn = PNN(**pnn_params)

In [25]:
# Slice the three parallel lists into (label, idx, value) samples, then
# shuffle with a 10000-element buffer and group into batches of 32.
train_ds = tf.data.Dataset.from_tensor_slices((train_label, train_idx, train_value))
train_ds = train_ds.shuffle(10000).batch(32)

In [26]:
@tf.function
def train_one_step(model, optimizer, idx, value, label):
    """Run one optimisation step on a single batch.

    Uses the module-level ``loss_object`` to compute the loss and updates the
    module-level ``train_loss`` and ``train_accuracy`` metrics.

    Args:
        model: callable Keras model taking ``(idx, value)``.
        optimizer: Keras optimizer used to apply the gradients.
        idx: batch of feature indices.
        value: batch of feature values.
        label: batch of target labels.
    """
    with tf.GradientTape() as tape:
        # training=True puts Dropout and BatchNormalization into training
        # mode; without it a direct model call runs them in inference mode.
        output = model(idx, value, training=True)
        loss = loss_object(y_true=label, y_pred=output)

    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    # Clip each gradient individually to an L2 norm of at most 100.
    grads = [tf.clip_by_norm(g, 100) for g in grads]
    optimizer.apply_gradients(grads_and_vars=zip(grads, variables))

    train_loss(loss)
    train_accuracy(label, output)

In [27]:
# Optimiser and objective used by the training step.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
loss_object = tf.keras.losses.BinaryCrossentropy()

# Running metrics updated by the training step.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_acc')

In [35]:
EPOCHS = 50
for epoch in range(EPOCHS):
    # Clear the running metrics so the numbers printed below describe this
    # epoch only instead of an average over every epoch so far.
    # `reset_state` is the current Keras name, `reset_states` the older one.
    for metric in (train_loss, train_accuracy):
        reset = getattr(metric, 'reset_state', None) or metric.reset_states
        reset()

    for label, idx, value in train_ds:
        train_one_step(pnn, optimizer, idx, value, label)

    template = 'Epoch {}, Loss: {}, Accuracy: {}'
    print(template.format(epoch + 1,
                          train_loss.result(), train_accuracy.result()))

    def call(self,feat_index,feat_value):
        # call函数接收输入变量
        # embedding part  feat_index = inputs为输入 feat_embeddings为一个layer。
        feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M 
#         print(feat_value.get_shape())
        feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)
        # linear part 
        lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1
        
        # quadratic part
        if self.product_type == 'inner':
            theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M 
            lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1
        else:
            embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M
            p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)
            lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1
        
        y_deep = tf.concat((lz,lp),axis=1)
        y_deep = tf.keras.layers.Dropout

    def call(self,feat_index,feat_value):
        # call函数接收输入变量
        # embedding part  feat_index = inputs为输入 feat_embeddings为一个layer。
        feat_embedding_0 = self.feat_embeddings(feat_index) # Batch * N * M 
#         print(feat_value.get_shape())
        feat_embedding = tf.einsum('bnm,bn->bnm',feat_embedding_0,feat_value)
        # linear part 
        lz = tf.einsum('bnm,dnm->bd',feat_embedding,self.linear_weights) # Batch * D1
        
        # quadratic part
        if self.product_type == 'inner':
            theta = tf.einsum('bnm,dn->bdnm',feat_embedding,self.theta) # Batch * D1 * N * M 
            lp = tf.einsum('bdnm,bdnm->bd',theta,theta) # Batch * D1
        else:
            embed_sum = tf.reduce_sum(feat_embedding,axis=1) # Batch * M
            p = tf.einsum('bm,bn->bmn',embed_sum,embed_sum)
            lp = tf.einsum('bmn,dmn->bd',p,self.quadratic_weights) # Batch * D1
        
        y_deep = tf.concat((lz,lp),axis=1)
        y_deep = tf.keras.layers.Dropout