In [1]:
import numpy as np
import pandas as pd

In [61]:
from keras import layers
from keras.layers import Input,Dense,Embedding,Reshape,Add,Flatten,merge,Lambda,concatenate
from keras.optimizers import Adam
from keras.models import Model
from keras.utils import plot_model, np_utils
# from sklearn.preprocessing import OneHotEncoder,StandarScaler
from sklearn.metrics import accuracy_score
import random

In [36]:
from keras import backend as K

In [79]:
from keras.utils import to_categorical

In [194]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# config.py
# Locations of the raw train / test CSV files.
TRAIN_FILE = r"F:\Data\recsys-data\dcn\train.csv"
TEST_FILE = r"F:\Data\recsys-data\dcn\test.csv"

# Output directory name.
SUB_DIR = "output"

NUM_SPLITS = 3
RANDOM_SEED = 2017

# Column groups of the dataset dataframe. The names follow a regular
# "ps_<group>_<NN>[_suffix]" pattern, so the lists are generated from the
# numeric ids instead of being spelled out one by one.

# Categorical columns: ps_ind_{02,04,05}_cat followed by ps_car_{01..11}_cat.
CATEGORICAL_COLS = (
    ["ps_ind_%02d_cat" % idx for idx in (2, 4, 5)]
    + ["ps_car_%02d_cat" % idx for idx in range(1, 12)]
)

# Numeric columns. The binary columns (ps_ind_06_bin .. ps_ind_13_bin,
# ps_ind_16_bin .. ps_ind_18_bin, ps_calc_15_bin .. ps_calc_20_bin) are
# deliberately left out, as in the original commented-out entries.
NUMERIC_COLS = (
    # numeric
    ["ps_reg_%02d" % idx for idx in range(1, 4)]
    + ["ps_car_%02d" % idx for idx in range(12, 16)]
    # feature engineering
    + ["missing_feat", "ps_car_13_x_ps_reg_03"]
)

# Columns that are never fed to the model: identifiers, the label and all
# ps_calc_* columns (01..14 plain, 15..20 binary).
IGNORE_COLS = (
    ["id", "target"]
    + ["ps_calc_%02d" % idx for idx in range(1, 15)]
    + ["ps_calc_%02d_bin" % idx for idx in range(15, 21)]
)

In [224]:
class FeatureDictionary(object):
    """Holds the column groups of a dataframe and the cardinality of each
    categorical column.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Frame whose categorical columns are inspected. May be ``None`` only
        when ``cate_cols`` is empty.
    numeric_cols, ignore_cols, cate_cols : list of str or None
        Column-name groups. ``None`` (the default) means an empty list.

    Attributes
    ----------
    feat_cate_len : dict
        Maps every name in ``cate_cols`` to the number of distinct values
        found in ``df[name]``.
    """
    def __init__(self,df = None,numeric_cols = None,ignore_cols = None,cate_cols = None):
        self.df = df
        # Use None sentinels instead of ``[]`` defaults: a literal list default
        # is created once and would be shared by every instance.
        self.cate_cols = [] if cate_cols is None else cate_cols
        self.numeric_cols = [] if numeric_cols is None else numeric_cols
        self.ignore_cols = [] if ignore_cols is None else ignore_cols
        self.gen_feat_dict()

    def gen_feat_dict(self):
        """Populate ``self.feat_cate_len`` with the distinct-value count of
        every categorical column."""
        self.feat_cate_len = {}
        for col in self.cate_cols:
            # len(unique()) counts every distinct value of the column.
            self.feat_cate_len[col] = len(self.df[col].unique())

In [238]:
def load_data(df,numeric_cols = NUMERIC_COLS,cate_cols=CATEGORICAL_COLS):
    """Engineer features and split a combined train+test frame.

    Rows whose ``target`` equals -1 are treated as test rows; all other rows
    are training rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined frame. Must contain ``target``, ``ps_car_13``, ``ps_reg_03``
        and every column named in ``cate_cols``. It is NOT modified: the
        preprocessing works on a copy, so calling this function repeatedly on
        the same frame gives the same result.
    numeric_cols, cate_cols : list of str
        Feature columns to return, in the order ``cate_cols + numeric_cols``.

    Returns
    -------
    X_train : pandas.DataFrame
        Training features (kept as a DataFrame).
    y_train : array
        ``to_categorical`` encoding of the training targets.
    X_test : numpy.ndarray
        Test features as a plain array (``.values``).
    """
    def preprocess(raw):
        # Copy first: adding columns / label-encoding in place would change
        # the caller's frame, and a second call would then count missing
        # values on already-encoded data.
        out = raw.copy()
        feature_cols = [c for c in out.columns if c not in ['id','target']]
        # Number of -1 entries per row (computed before label encoding).
        out['missing_feat'] = np.sum((out[feature_cols]==-1).values,axis=1)
        out['ps_car_13_x_ps_reg_03'] = out['ps_car_13'] * out['ps_reg_03']
        # Re-index every categorical column with a fresh LabelEncoder,
        # fitted on train and test rows together.
        for cl in cate_cols:
            out[cl] = LabelEncoder().fit_transform(out[cl])
        return out

    Df = preprocess(df)
    cols = cate_cols + numeric_cols
    is_test = Df['target'] == -1
    dfTrain = Df[~is_test]
    dfTest = Df[is_test]

    X_train = dfTrain[cols]
    y_train = to_categorical(dfTrain['target'].values)

    X_test = dfTest[cols].values
    return X_train,y_train,X_test

In [235]:
# Read the raw train and test files and stack them into one frame.
dfTrain = pd.read_csv(TRAIN_FILE)
dfTest = pd.read_csv(TEST_FILE)
# Test rows are tagged with target == -1; load_data uses that value to split
# the combined frame back into train and test.
dfTest['target'] = -1
# sort=False: the two frames do not have identically ordered columns, and
# leaving `sort` unset raises a pandas FutureWarning and makes the column
# order depend on the pandas version. Columns are selected by name
# downstream, so their order does not matter.
df = pd.concat([dfTrain,dfTest],sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  after removing the cwd from sys.path.


In [242]:
# Split the combined frame into model inputs, then record the cardinality of
# every categorical column for sizing the embeddings.
X_train, y_train, X_test = load_data(
    df,
    numeric_cols=NUMERIC_COLS,
    cate_cols=CATEGORICAL_COLS,
)
fd = FeatureDictionary(
    df,
    numeric_cols=NUMERIC_COLS,
    ignore_cols=IGNORE_COLS,
    cate_cols=CATEGORICAL_COLS,
)

In [176]:
def embedding_layers(fd):
    """Create the input tensors and their per-feature branches.

    For every categorical column in ``fd.feat_cate_len`` an integer input
    named ``<col>_inp`` is created together with an embedding of size
    ``min(cardinality, int(6 * cardinality ** (1/4)))``. For every column in
    ``fd.numeric_cols`` a float input named ``<col>_in`` is created together
    with its reshaped tensor.

    Returns
    -------
    (inputs, branches) : two lists of equal length, categorical features
        first (in ``fd.feat_cate_len`` iteration order) and numeric features
        after them. NOTE(review): the data fed to the model presumably has to
        follow this same column order - confirm against the caller.
    """
    cardinalities = fd.feat_cate_len
    numeric_names = fd.numeric_cols

    inputs = []
    branches = []

    # Categorical features: embedding width follows the
    # 6 * cardinality ** (1/4) rule, capped at the cardinality itself.
    for col, cardinality in cardinalities.items():
        width = min(cardinality, int(6 * np.power(cardinality, 1/4)))
        cat_in, cat_embed = embedding_input(col + '_inp', cardinality, width)
        inputs.append(cat_in)
        branches.append(cat_embed)

    # Numeric features are appended after all categorical ones.
    for col in numeric_names:
        num_in, num_branch = continus_input(col + '_in')
        inputs.append(num_in)
        branches.append(num_branch)

    return inputs, branches

In [177]:
def embedding_input(name,input_dim,output_dim):
    """Build a one-element int64 input called ``name`` and embed it.

    ``input_dim`` and ``output_dim`` are forwarded to ``Embedding`` (with
    ``input_length`` fixed to 1). Returns ``(input_tensor, embedded_tensor)``.
    """
    id_input = Input(shape = (1,),dtype = 'int64',name = name)
    embed = Embedding(input_dim,output_dim,input_length =1)
    return id_input, embed(id_input)
def continus_input(name):
    """Build a one-element float32 input called ``name`` and reshape it.

    The input is reshaped with ``Reshape((1,1))``. Returns
    ``(input_tensor, reshaped_tensor)``.
    """
    value_input = Input(shape=(1,),dtype = 'float32',name = name)
    reshaped = Reshape((1,1))(value_input)
    return value_input, reshaped

In [261]:
class CrossLayer(layers.Layer):
    """Stack of ``num_layer`` cross steps applied to a rank-3 input.

    With ``x0`` the layer input and ``x`` the running value (starting at
    ``x0``), every step ``i`` computes::

        x = x0 * (x . w_i) + b_i + x

    where ``w_i`` and ``b_i`` are trainable weights of shape
    ``[1, input_dim]``. The final value is flattened before being returned.

    Parameters
    ----------
    output_dim :
        Size reported as the second axis by ``compute_output_shape``.
    num_layer : int
        Number of cross steps. With 0 the flattened input is returned.
    """
    def __init__(self,output_dim,num_layer,**kwargs):
        self.output_dim = output_dim
        self.num_layer = num_layer
        super(CrossLayer,self).__init__(**kwargs)

    def build(self,input_shape):
        # The feature width is read from axis 2, so the input must be rank 3.
        self.input_dim = input_shape[2]
        self.W = []
        self.bias = []
        for i in range(self.num_layer):
            self.W.append(self.add_weight(shape=[1,self.input_dim],initializer = 'glorot_uniform',name='w_{}'.format(i),trainable=True))
            self.bias.append(self.add_weight(shape=[1,self.input_dim],initializer = 'zeros',name='b_{}'.format(i),trainable=True))
        super(CrossLayer,self).build(input_shape)

    def call(self,input):
        # Backend ops are applied directly instead of wrapping every step in a
        # throw-away Lambda layer that closes over this layer's weights.
        cross = input
        for weight, bias in zip(self.W,self.bias):
            # Project the running value onto w first (one value per sample),
            # then scale x0 by it; this avoids building the
            # input_dim x input_dim product of x0 and x.
            projection = K.dot(cross,K.transpose(weight))
            cross = K.batch_dot(projection,input) + bias + cross
        return Flatten()(cross)

    def compute_output_shape(self,input_shape):
        return (None,self.output_dim)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        when a model containing it is saved and loaded."""
        config = super(CrossLayer,self).get_config()
        config.update({'output_dim': self.output_dim,'num_layer': self.num_layer})
        return config

In [179]:
# def fit(inp_layer,inp_embed,X,y):
#     inp = concatenate(inp_embed,axis=-1)
#     #deep layer
#     for i in range(6):
#         if i ==0:
#             deep = Dense(272,activation='relu')(Flatten()(inp))
#         else:
#             deep = Dense(272,activation='relu')(deep)
#     cross = CrossLayer(output_dim = inp.shape[2],num_layer=8,name = "cross_layer")(inp)
    
#     # concat both layers 
#     output = concatenate([deep,cross],axis=-1)
#     output = Dense(y.shape[1],activation='sigmoid')(output)
#     model = Model(inp_layer,output)
#     print(model.summary())
#     model.compile(Adam(0.01),loss = 'binary_crossentropy',metrics = ['accuracy'])
#     model.fit([X[c] for c in X.columns],y,batch_size=256,epochs=10)
#     return model
# def evaluate(X,y,model):
#     y_pred = model.predict(X)
#     acc = np.sum(np.argmax(y_pred,1)==np.argmax(y,1))/y.shape[0]
#     print("Accuracy:",acc)

In [None]:
def DCN(inp_layer,inp_embed,deep_layers=6,deep_units=272,cross_layers=8,num_classes=2):
    """Assemble the Deep & Cross model.

    Parameters
    ----------
    inp_layer : list
        Model input tensors.
    inp_embed : list
        Per-feature tensors, concatenated on the last axis to form the shared
        input of the deep and cross branches.
    deep_layers : int
        Number of ReLU ``Dense`` layers in the deep branch (default 6).
    deep_units : int
        Width of every deep layer (default 272).
    cross_layers : int
        ``num_layer`` passed to ``CrossLayer`` (default 8).
    num_classes : int
        Number of units of the sigmoid output layer (default 2).

    Returns
    -------
    The uncompiled ``Model`` mapping ``inp_layer`` to the output layer.
    """
    inp = concatenate(inp_embed,axis=-1)

    # Deep branch: flatten once, then stack the fully connected layers.
    deep = Flatten()(inp)
    for _ in range(deep_layers):
        deep = Dense(deep_units,activation='relu')(deep)

    # Cross branch works on the un-flattened concatenation.
    cross = CrossLayer(output_dim = inp.shape[2],num_layer=cross_layers,name = "cross_layer")(inp)

    # Concatenate both branches and project to the output units.
    output = concatenate([deep,cross],axis=-1)
    # NOTE(review): independent sigmoid units are used even though the labels
    # look one-hot encoded - confirm this is intended rather than softmax.
    output = Dense(num_classes,activation='sigmoid')(output)
    return Model(inp_layer,output)
def fit(model,X,y,lr=0.01,batch_size=256,epochs=10):
    """Compile and train ``model`` and return it.

    The model passed in is the one that gets trained; it is no longer replaced
    by a freshly built network taken from notebook globals.

    Parameters
    ----------
    model :
        Model exposing ``summary``, ``compile`` and ``fit``.
    X : pandas.DataFrame
        Features; every column is fed as a separate model input, in column
        order.
    y :
        Training targets, passed through to ``model.fit``.
    lr : float
        Learning rate given to ``Adam`` (default 0.01).
    batch_size, epochs : int
        Forwarded to ``model.fit`` (defaults 256 and 10).

    Returns
    -------
    The same ``model`` object, after training.
    """
    # summary() prints by itself; wrapping it in print() only adds "None".
    model.summary()
    model.compile(Adam(lr),loss = 'binary_crossentropy',metrics = ['accuracy'])
    model.fit([X[c] for c in X.columns],y,batch_size=batch_size,epochs=epochs)
    return model
In [256]:
# Build the input / embedding tensors from the feature dictionary, assemble
# the DCN model on top of them, and train on the training split.
inp_layer,inp_embed = embedding_layers(fd)
model = DCN(inp_layer,inp_embed)
fit(model,X_train,y_train)

Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ps_ind_02_cat_inp (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
ps_ind_04_cat_inp (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
ps_ind_05_cat_inp (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
ps_car_01_cat_inp (InputLayer)  (None, 1)            0                                            
___________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [260]:
# Repeat of the build-and-train steps: fresh input tensors and a fresh DCN
# model are created before training again on the training split.
inp_layer,inp_embed = embedding_layers(fd)
model = DCN(inp_layer,inp_embed)
fit(model,X_train,y_train)

Model: "model_28"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ps_ind_02_cat_inp (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
ps_ind_04_cat_inp (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
ps_ind_05_cat_inp (InputLayer)  (None, 1)            0                                            
__________________________________________________________________________________________________
ps_car_01_cat_inp (InputLayer)  (None, 1)            0                                            
___________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
