In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [3]:
# Names of the feature columns that are selected from the training frame and
# fed to the model (the song id and play sequence are handled separately).
input_fields = [
    "city",
    "gender",
    "source_type",
    "source_screen_name",
    "source_system_tab",
    "song_length_norm",
    "song_count_quant",
]

In [4]:
# Per-song play counts, later handed to the negative sampler as unigram weights.
song_count = pd.read_csv("feature/song_play_count.csv")
# A leading 0 is prepended so list position i+1 holds the count from CSV row i.
# NOTE(review): presumably position 0 is the padding id and the CSV rows are
# ordered by song index -- confirm against the feature-generation step.
unigrams = [0, *song_count['song_play_count'].values.tolist()]

In [5]:
class EmbeddingLayer:
    """Embedding table plus lookup, created under the variable scope `var_scope`.

    When `padding_size` is non-zero, the first `padding_size` rows of the table
    are a constant all-zero block (not trainable) and only the remaining
    `vocab_size - padding_size` rows are a trainable variable.
    """

    def __init__(self, vocab_size, embedding_dim, var_scope,padding_size = 0,flatten=True):
        """Create the embedding table.

        Args:
            vocab_size: total number of rows in the table (padding rows included).
            embedding_dim: width of each embedding vector.
            var_scope: name of the variable scope holding the variables/ops.
            padding_size: number of leading rows fixed to zero (0 disables this).
            flatten: if True, `__call__` squeezes axis 1 out of the lookup result.
        """
        self.var_scope = var_scope
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.flatten = flatten
        with tf.variable_scope(self.var_scope):
            if padding_size == 0:
                # Whole table is trainable.
                self.embedding_matrix = self._trainable_rows(vocab_size, "embedding_matrix")
            else:
                # Constant zero rows first, trainable rows after them.
                self.padding_mat = tf.zeros(shape=[padding_size, embedding_dim],
                                            dtype=tf.float32, name="padding_mat")
                self.embedding_matrix_ = self._trainable_rows(vocab_size - padding_size,
                                                              "embedding_matrix_")
                self.embedding_matrix = tf.concat([self.padding_mat, self.embedding_matrix_],
                                                  axis=0, name="embedding_matrix")

    def _trainable_rows(self, num_rows, var_name):
        """Return a Glorot-uniform initialised float32 variable of shape [num_rows, embedding_dim]."""
        return tf.get_variable(shape=[num_rows, self.embedding_dim], dtype=tf.float32,
                               name=var_name, trainable=True,
                               initializer=tf.glorot_uniform_initializer())

    def __call__(self, input_index):
        """Look up embeddings for `input_index` (cast to int32 first).

        Returns the raw lookup result, or that result with axis 1 squeezed out
        when the layer was built with `flatten=True`.
        """
        with tf.variable_scope(self.var_scope):
            lookup_index = tf.cast(input_index, tf.int32, name="lookup_index")
            looked_up = tf.nn.embedding_lookup(self.embedding_matrix, lookup_index,
                                               name="lookup_result")
            if not self.flatten:
                return looked_up
            return tf.squeeze(looked_up, axis=[1])

In [8]:
def process_data(data_path,user_play_song_seq_data_path,max_seq_length,input_fields):
    """Load the positive training rows and attach each row's play-sequence feature.

    Args:
        data_path: CSV with at least `target`, `msno`, `global_index`, `song_id`
            and the columns named in `input_fields`.
        user_play_song_seq_data_path: CSV with `msno`, `global_index` and `song_seq`.
        max_seq_length: fixed length of the generated `user_play_seq` arrays.
        input_fields: list of feature column names to keep.

    Returns:
        DataFrame with the `input_fields` columns followed by `song_id` and
        `user_play_seq`.
    """
    interactions = pd.read_csv(data_path)
    # Only rows with target == 1 are kept.
    positives = interactions[interactions['target'] == 1]
    play_sequences = pd.read_csv(user_play_song_seq_data_path)
    # Left join: rows without a matching sequence get NaN in `song_seq`.
    merged = positives.merge(play_sequences, on=['msno', 'global_index'], how="left")
    merged['user_play_seq'] = merged['song_seq'].apply(
        process_user_play_song_seq_feature, args=(max_seq_length,))
    selected_columns = input_fields + ['song_id', 'user_play_seq']
    return merged.loc[:, selected_columns]

In [9]:
def process_user_play_song_seq_feature(feature,max_seq_length):
    """Turn a comma-separated play sequence into a fixed-length float array.

    Args:
        feature: the raw sequence value, e.g. "12,7,431". `None`, NaN, an empty
            string or a whitespace-only string are all treated as "no history".
        max_seq_length: length of the returned array.

    Returns:
        A 1-D float numpy array of length `max_seq_length`. Sequences longer
        than `max_seq_length` keep their first `max_seq_length` items; shorter
        ones are left-padded with zeros.

    Raises:
        ValueError: if a comma-separated token cannot be parsed as a float.
    """
    # The None test has to look at the raw value: str(None) is "None", which
    # would otherwise slip past the emptiness checks and blow up in float().
    if feature is None:
        return np.zeros(shape=(max_seq_length),dtype=float)

    feature_str = str(feature).strip()
    # str(float('nan')) == "nan": this is what missing values look like here.
    if feature_str == "" or feature_str == "nan":
        return np.zeros(shape=(max_seq_length),dtype=float)

    values = np.asarray([float(token) for token in feature_str.split(",")], dtype=float)
    values = values[:max_seq_length]
    # Left-pad with zeros; the pad is empty when the sequence is already full length.
    padding = np.zeros(max_seq_length - len(values))
    return np.hstack([padding, values])

In [None]:
class YtbDNN:
    """Candidate-generation model: an MLP user tower trained with sampled softmax.

    The graph is built once in the constructor (TF1 static graph). A single
    `tf.Session` is kept on the instance so that `predict_top_k` sees the
    weights produced by `train`.
    """

    def __init__(self,input_fields,unigrams):
        """Store hyper-parameters and build the graph.

        Args:
            input_fields: names of the DataFrame columns fed as scalar features.
            unigrams: per-class weights passed to the fixed-unigram negative
                sampler (one entry per song id, padding id included).
        """
        self.negative_sample_size = 4000
        self.batch_size = 100
        self.input_fields = input_fields
        self.user_play_seq_max_length=10
        self.lr = 0.001
        self.epochs = 2
        self.inputs = {}
        self.topk = 50
        self.unigrams = unigrams
        self.mlp_size_config = [256,128,16]
        self.embedding_config = {
            "gender": {"vocab_size": 3, "embedding_dim": 2, "padding_size": 0},
            "city": {"vocab_size": 21, "embedding_dim": 4, "padding_size": 0},
            "song_id": {"vocab_size": 359967, "embedding_dim": 16, "padding_size": 1},
            "inference_song_id": {"vocab_size": 359967, "embedding_dim": 16, "padding_size": 1},
            "source_type": {"vocab_size": 13, "embedding_dim": 4, "padding_size": 0},
            "source_screen_name": {"vocab_size": 21, "embedding_dim": 4, "padding_size": 0},
            "source_system_tab": {"vocab_size": 9, "embedding_dim": 4, "padding_size": 0}
        }
        # Per-batch losses of the most recent `train` call.
        self.loss_history = []
        # Created lazily by `_get_session`, shared by `train` and `predict_top_k`.
        self._sess = None
        self.build()

    def build_mlp_layer(self,input_features):
        """Stack ReLU dense layers sized by `mlp_size_config` on top of `input_features`."""
        net = tf.identity(input_features, "input_op")
        for units in self.mlp_size_config:
            net = tf.layers.dense(net, units=units, activation=tf.nn.relu,
                                  kernel_regularizer=tf.keras.regularizers.l2(0.0005),
                                  kernel_initializer=tf.glorot_normal_initializer())
        return net


    def avg_pooling(self,seq_embedding,seq_length):
        """Average the embeddings of a sequence over its time axis (axis 1).

        Args:
            seq_embedding: embeddings indexed as [batch, position, dim].
            seq_length: number of real (non-padding) items; it must broadcast
                against the [batch, dim] sum, e.g. shape [batch, 1].

        Returns:
            `sum(seq_embedding, axis=1) / max(seq_length, 1)`.
        """
        embedding_sum = tf.reduce_sum(seq_embedding,axis=1)
        # Clamp to 1 so an all-padding history yields a zero vector, not NaN (0/0).
        length = tf.maximum(tf.cast(seq_length,dtype=tf.float32), 1.0)
        avg_emb = tf.div(embedding_sum,length,name="avg_pooling_embedding")
        return avg_emb


    def build_embedding_layer(self,feature_name,flatten=True):
        """Create the `EmbeddingLayer` described by `embedding_config[feature_name]`."""
        f_vocab_size = self.embedding_config[feature_name]['vocab_size']
        f_embedding_size = self.embedding_config[feature_name]['embedding_dim']
        f_padding_size = self.embedding_config[feature_name]['padding_size']
        return EmbeddingLayer(f_vocab_size,f_embedding_size,feature_name,f_padding_size,flatten)

    # Define the network structure and its inputs.
    def build(self):
        """Create placeholders, the user tower, the loss, the train op and the predict ops."""
        ## Define the input placeholders; the suffix "ph" is short for placeholder.
        self.source_type_ph = tf.placeholder(dtype=tf.int64,shape=[None,1],name="source_type")
        self.inputs["source_type"] = self.source_type_ph
        self.source_screen_name_ph = tf.placeholder(dtype=tf.int64,shape=[None,1],name="source_screen_name")
        self.inputs["source_screen_name"] = self.source_screen_name_ph
        self.source_system_tab_ph = tf.placeholder(dtype=tf.int64,shape=[None,1],name="source_system_tab")
        self.inputs["source_system_tab"] = self.source_system_tab_ph
        self.gender_ph = tf.placeholder(dtype=tf.int64,shape=[None,1],name="gender")
        self.inputs["gender"] = self.gender_ph
        self.city_ph = tf.placeholder(dtype=tf.int64,shape=[None,1],name="city")
        self.inputs["city"] = self.city_ph
        self.label_ph = tf.placeholder(dtype=tf.int64,shape=[None,1],name="label")
        self.inputs["label"] = self.label_ph
        self.user_play_seq_ph = tf.placeholder(dtype=tf.float64,shape=[None,self.user_play_seq_max_length],name="user_play_seq")
        self.inputs["user_play_seq"] = self.user_play_seq_ph
        self.song_length_norm_ph = tf.placeholder(dtype=tf.float64,shape=[None,1],name="song_length_norm")
        self.inputs["song_length_norm"] = self.song_length_norm_ph
        self.song_count_quant_ph = tf.placeholder(dtype=tf.float64,shape=[None,1],name="song_count_quant")
        self.inputs["song_count_quant"] = self.song_count_quant_ph

        ## Define the embedding layers for the categorical features.
        gender_emb_layer = self.build_embedding_layer("gender")
        city_emb_layer = self.build_embedding_layer("city")
        source_type_emb_layer = self.build_embedding_layer("source_type")
        source_screen_name_emb_layer = self.build_embedding_layer("source_screen_name")
        source_system_tab_emb_layer = self.build_embedding_layer("source_system_tab")
        song_id_emb_layer = self.build_embedding_layer("song_id",flatten=False)

        ## dense features
        dense_feature_input = tf.cast(tf.concat([self.song_length_norm_ph,self.song_count_quant_ph],axis=-1),dtype=tf.float32,name="dense_feature")

        ## demographic feature embedding
        city_embedding = city_emb_layer(self.city_ph)
        gender_embedding = gender_emb_layer(self.gender_ph)
        demographic_embedding_feature = tf.concat([city_embedding,gender_embedding],axis=-1)

        ## context feature embedding
        source_type_embedding = source_type_emb_layer(self.source_type_ph)
        source_system_tab_embedding = source_system_tab_emb_layer(self.source_system_tab_ph)
        source_screen_name_embedding = source_screen_name_emb_layer(self.source_screen_name_ph)
        context_embedding_feature = tf.concat([source_type_embedding,source_system_tab_embedding,
                                               source_screen_name_embedding],axis=-1)

        ## play action seq embedding feature
        song_play_seq_embeddings = song_id_emb_layer(self.user_play_seq_ph)
        # Count non-padding items per row (axis=1). Without an axis the count
        # would be a single scalar over the whole batch, so every user's sum
        # would be divided by the batch-wide total instead of their own length.
        song_play_seq_length = tf.expand_dims(
            tf.count_nonzero(self.user_play_seq_ph, axis=1), axis=-1)
        seq_avg_pooling_embedding = self.avg_pooling(song_play_seq_embeddings,song_play_seq_length)

        ## concat all features
        input_features = tf.concat([dense_feature_input,demographic_embedding_feature,
                                    context_embedding_feature,seq_avg_pooling_embedding],axis=-1)

        ## build mlp and pass the features,get the user embedding
        self.user_embedding = self.build_mlp_layer(input_features)

        ## get inference song embedding
        inference_song_id_emb_layer = self.build_embedding_layer("inference_song_id")
        self.inference_song_weights = inference_song_id_emb_layer.embedding_matrix
        self.inference_song_bias = tf.zeros([self.embedding_config['song_id']['vocab_size']])

        self.sampled_values = tf.nn.fixed_unigram_candidate_sampler(
            true_classes=self.label_ph,
            num_true=1,
            num_sampled=self.negative_sample_size,
            unique=True,
            range_max=self.embedding_config['song_id']['vocab_size'],
            unigrams=self.unigrams
        )

        self.sampled_loss = tf.nn.sampled_softmax_loss(
            weights=self.inference_song_weights,
            biases=self.inference_song_bias,
            labels=self.label_ph,
            inputs=self.user_embedding,
            num_sampled=self.negative_sample_size,
            num_classes=self.embedding_config['song_id']['vocab_size'],
            num_true=1,
            sampled_values=self.sampled_values,
            partition_strategy='div'
        )
        # tf.layers only *collects* the kernel_regularizer penalties; they have
        # to be added to the objective explicitly or the L2 term has no effect.
        self.loss = tf.reduce_mean(self.sampled_loss) + tf.losses.get_regularization_loss()
        self.global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)

        ## define predict op
        self.score = tf.matmul(self.user_embedding, tf.transpose(self.inference_song_weights))  # + self.biases
        self.predictions = tf.argmax(self.score, 1)
        self.topk_value, self.topk_idx = tf.nn.top_k(self.score, k=self.topk)

        # Created last so it also covers the optimizer's slot variables.
        self.init_op = tf.global_variables_initializer()


    def _get_session(self, reinitialize=False):
        """Return the model's session, creating and initialising it on first use.

        Args:
            reinitialize: if True, run the variable initializer even when the
                session already exists (this resets all weights).
        """
        if self._sess is None:
            # Bind to the graph the model was built in, not whatever graph
            # happens to be the default at call time.
            self._sess = tf.Session(graph=self.init_op.graph)
            reinitialize = True
        if reinitialize:
            self._sess.run(self.init_op)
        return self._sess

    def close(self):
        """Release the session held by the model, if any."""
        if self._sess is not None:
            self._sess.close()
            self._sess = None

    def feed_feature_and_label(self, df):
        """Build a feed dict with the features of `df` plus `song_id` as the label."""
        feeds = self.feed_feature(df)
        feeds[self.inputs["label"]] = df['song_id'].values.reshape([-1, 1])
        return feeds

    def feed_feature(self, df):
        """Build a feed dict mapping feature placeholders to column values of `df`.

        Every column of `df` listed in `input_fields` is fed as a [rows, 1]
        array; the `user_play_seq` column (one array per row) is stacked into a
        [rows, user_play_seq_max_length] array.
        """
        feeds = {}
        for field in df.columns:
            if field in self.input_fields:
                feeds[self.inputs[field]] = df[field].values.reshape([-1, 1])
        seq_feature = np.concatenate(df['user_play_seq'].values).reshape([-1,self.user_play_seq_max_length])
        feeds[self.inputs['user_play_seq']] = seq_feature
        return feeds

    def train(self,train_data):
        """Train from freshly initialised weights for `self.epochs` passes over `train_data`.

        Rows are consumed in order, in consecutive batches of `batch_size`.
        The per-batch loss values are stored in `self.loss_history`.
        """
        train_df = train_data
        loss_list = []
        # Each call starts from re-initialised variables, but the session stays
        # open afterwards so predict_top_k can use the trained weights.
        sess = self._get_session(reinitialize=True)
        batch_ids = np.arange(train_df.shape[0]) // self.batch_size
        for _ in range(self.epochs):
            for i, df in train_df.groupby(batch_ids):
                _, loss = sess.run([self.train_op,self.loss],feed_dict=self.feed_feature_and_label(df))
                loss_list.append(loss)
        self.loss_history = loss_list


    def predict_top_k(self,predict_data):
        """Return `(topk_scores, topk_idx)` for every row of `predict_data`.

        Uses the session shared with `train`; if the model has not been trained
        yet, the variables are initialised first.
        """
        # logits and prediction
        sess = self._get_session()
        topk_scores, topk_idx = sess.run([self.topk_value,self.topk_idx],feed_dict=self.feed_feature(predict_data))
        return topk_scores,topk_idx

In [11]:
# Build the training frame; sequences are padded/truncated to 10 items.
# NOTE(review): 10 has to match YtbDNN.user_play_seq_max_length -- confirm if either changes.
train_data = process_data(
    data_path="preprocess/train.csv",
    user_play_song_seq_data_path="feature/user_action_seq_feature.csv",
    max_seq_length=10,
    input_fields=input_fields,
)

In [30]:
# Clear the default TF graph before the model is (re)built in the next cell.
# NOTE(review): presumably here so re-running the cells does not collide with
# variables left over from an earlier build -- confirm.
tf.reset_default_graph()

In [31]:
# Instantiate the model; the constructor builds the whole graph in the current default graph.
model = YtbDNN(input_fields=input_fields,unigrams=unigrams)



In [32]:
# Run training over the prepared frame (batched in row order inside train()).
model.train(train_data)