In [None]:
import pandas as pd
import numpy as np

# Load the preprocessed training and validation splits from disk.
train_df, val_df = [
    pd.read_csv(csv_path)
    for csv_path in ('preprocess/train.csv', 'preprocess/val.csv')
]

In [None]:
# Label-encode the msno and song_id columns
from sklearn.preprocessing import LabelEncoder
def encode_label(train_df, val_df, field):
    """Replace a column's raw values with integer codes shared by both frames.

    A single ``LabelEncoder`` is fitted on the distinct values of ``field``
    taken from both frames, so a given raw value receives the same code in
    the training and the validation data.

    Args:
        train_df: Training DataFrame; its ``field`` column is overwritten in place.
        val_df: Validation DataFrame; its ``field`` column is overwritten in place.
        field: Name of the column to encode.

    Returns:
        The same ``(train_df, val_df)`` objects, now holding the encoded column.
    """
    encoder = LabelEncoder()
    vocabulary = list(train_df[field].unique())
    vocabulary.extend(val_df[field].unique())
    encoder.fit(vocabulary)
    for frame in (train_df, val_df):
        frame[field] = encoder.transform(frame[field])
    return train_df, val_df

# Encode the user and song identifier columns, one column at a time.
for id_field in ('msno', 'song_id'):
    train_df, val_df = encode_label(train_df, val_df, id_field)

In [None]:
import tensorflow as tf

class DNN(object):
    """Feed-forward binary classifier over single-column float input features.

    Constructing an instance adds to the default TensorFlow graph: one float
    placeholder per input field, a stack of ReLU dense layers, a sigmoid
    output, a log-loss and an Adam training op.

    Config keys read: ``input_fields``, ``dnn_units``, ``learning_rate``,
    ``batch_size``.
    """

    def __init__(self, config):
        """Build the graph described by ``config``."""
        self.config = config
        self.input_fields = config['input_fields']
        self.dnn_units = config['dnn_units']
        self.learning_rate = config['learning_rate']
        self.batch_size = config['batch_size']

        with tf.variable_scope('inputs', reuse=tf.AUTO_REUSE):
            # One [batch, 1] float placeholder per field. Spaces are replaced
            # in the op name only; the dict key keeps the original field name.
            self.inputs = {
                name: tf.placeholder(dtype=np.float32, shape=[None, 1], name=name.replace(' ', '_'))
                for name in self.input_fields
            }
            self.labels = tf.placeholder(dtype=tf.float32, name='labels')

        # deep: concatenate every input column, then apply the hidden layers
        self.feature_concat = tf.concat(list(self.inputs.values()), axis=1)

        hidden = self.feature_concat
        for width in self.dnn_units:
            hidden = tf.layers.dense(hidden, units=width, activation=tf.nn.relu)

        self.logit = tf.layers.dense(hidden, units=1, activation=None)
        self.outputs = tf.nn.sigmoid(self.logit)

        # loss
        self.loss = tf.losses.log_loss(self.labels, self.outputs)
        adam = tf.train.AdamOptimizer(self.learning_rate)
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = adam.minimize(self.loss, global_step=self.global_step)

    def feed_feature_and_label(self, df):
        """Build the ``feed_dict`` for one batch.

        Args:
            df: DataFrame with a ``target`` column. Every column whose name is
                in ``self.input_fields`` is fed; other columns are ignored.

        Returns:
            Dict mapping each placeholder to a float32 array of shape [rows, 1].
        """
        def as_column(series):
            # Cast to float32 and reshape to a single-column 2-D array.
            return series.astype(np.float32).values.reshape([-1, 1])

        feeds = {
            self.inputs[name]: as_column(df[name])
            for name in df.columns
            if name in self.input_fields
        }
        feeds[self.labels] = as_column(df['target'])
        return feeds

In [None]:
import time
import os
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


def format_log(step, eval_info):
    """Render one log line: local timestamp, iteration number and metrics.

    Args:
        step: Iteration number to report; converted with ``str``.
        eval_info: Mapping of metric name (str) to value; values are
            converted with ``str``.

    Returns:
        A string of the form
        ``"<timestamp> iteration <step> <name>: <value>, <name>: <value>"``.
    """
    timestamp = time.strftime('%Y-%m-%d %X', time.localtime())
    metrics = []
    for name, value in eval_info.items():
        metrics.append(name + ": " + str(value))
    return " ".join([timestamp, "iteration", str(step), ', '.join(metrics)])


def train(config):
    """Train ``config['model']`` and score it on the validation frame each epoch.

    Config keys read: ``ckpt_dir``, ``model_name``, ``model``, ``train_df``,
    ``val_df``, ``epoch``, ``batch_size``.

    The model object must expose ``train_op``, ``loss``, ``global_step``,
    ``outputs``, ``labels`` and ``feed_feature_and_label(df)``.

    Training runs inside a ``tf.train.MonitoredTrainingSession`` whose
    checkpoint directory is ``<ckpt_dir>/<model_name>``. Every 100th training
    batch the training loss is printed and recorded. After each epoch the
    validation AUC and the row-weighted mean validation loss are printed.

    Args:
        config: Dict holding the keys listed above.

    Returns:
        List of ``(global_step, loss)`` tuples, one per logged training batch.
    """
    checkpoint_dir = os.path.join(config['ckpt_dir'], config['model_name'])
    model = config['model']
    train_df = config['train_df']
    val_df = config['val_df']
    batch_size = config['batch_size']

    train_loss = []
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=checkpoint_dir,
        config=gpu_config
    ) as session:
        for epoch_index in range(config['epoch']):
            # train: shuffle the rows, then cut them into consecutive batches
            for i, df in train_df.sample(frac=1).groupby(np.arange(train_df.shape[0]) // batch_size):
                feeds = model.feed_feature_and_label(df)
                session.run(model.train_op, feed_dict=feeds)
                if i % 100 == 0:
                    loss, step = session.run([model.loss, model.global_step], feed_dict=feeds)
                    train_loss.append((step, loss))
                    eval_info = {'loss/train': loss}
                    log_info = format_log(step, eval_info)
                    print(log_info)

            # validation
            preds_all, label_all = [], []
            loss_all, rows_all = [], []
            for i, df in val_df.groupby(np.arange(val_df.shape[0]) // batch_size):
                feeds = model.feed_feature_and_label(df)
                # Fetch the global step here too, so the validation log shows
                # the current step rather than the last logged training step.
                pred, label, loss, step = session.run(
                    [model.outputs, model.labels, model.loss, model.global_step],
                    feed_dict=feeds)
                preds_all.append(pred)
                label_all.append(label)
                loss_all.append(loss)
                rows_all.append(len(df))

            if not preds_all:
                # Empty validation frame: there is nothing to score this epoch.
                continue

            auc = roc_auc_score(np.concatenate(label_all), np.concatenate(preds_all))
            # Average over ALL validation batches, weighted by row count so a
            # short final batch does not skew the figure.
            val_loss = np.average(loss_all, weights=rows_all)
            eval_info = {'loss/val': val_loss, 'auc': auc}
            log_info = format_log(step, eval_info)
            print(log_info)

    return train_loss

In [None]:
# Baseline: plain DNN on the dense (non-categorical) columns of train_df.
tf.reset_default_graph()
config = {
    # Distinct name so this baseline does not share a checkpoint directory
    # with the SVD-feature model, which has a different input width and is
    # trained under 'dnn_pretrain'.
    'model_name': 'dnn',
    'ckpt_dir': 'checkpoint',
    'epoch': 1,
    'embedding_size': 4,
    'dnn_units': [128, 64, 32],
    'learning_rate': 0.001,
    'batch_size': 1024
}
# Normalise the column names of BOTH frames; the model looks columns up by
# name when it builds the feed dict.
train_df.columns = [c.replace(' ', '_') for c in train_df.columns]
val_df.columns = [c.replace(' ', '_') for c in val_df.columns]

# Use the frame that is actually trained on. The SVD-augmented frame does not
# exist yet at this point of the notebook.
input_fields = [field for field in train_df.columns if field not in [
    'target', 'msno', 'song_id', 'source_system_tab','source_screen_name', \
        'source_type','language','city','gender','registered_via']]

config['input_fields'] = input_fields

config['model'] = DNN(config=config)
config['train_df'] = train_df
config['val_df'] = val_df



train_loss = train(config)
t = list(zip(*train_loss))
plt.plot(t[0], t[1])


In [None]:
# Truncated SVD of the song-user matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
# Factorise the sparse song x user matrix of training targets and attach the
# resulting per-song vectors to both frames as dense features.
label, user_idx, song_idx = train_df['target'].values, train_df['msno'].values, train_df['song_id'].values

# Size the song axis from BOTH splits. Otherwise a song id that only occurs in
# validation and exceeds the largest training id has no row in the matrix and
# the left merge below fills its SVD columns with NaN.
n_songs = int(max(train_df['song_id'].max(), val_df['song_id'].max())) + 1
n_users = int(user_idx.max()) + 1
song_user_matrix = csr_matrix((label, (song_idx, user_idx)), shape=(n_songs, n_users))

dimension = 16
# Fixed seed so the randomized solver yields the same vectors on every run.
svd = TruncatedSVD(n_components=dimension, random_state=0)
song_vec = svd.fit_transform(song_user_matrix)

# Row i of song_vec belongs to encoded song id i; keep the key as an integer
# so it merges against the integer song_id column without a float round-trip.
song_vec_pretrain = pd.DataFrame(song_vec, columns=['song_svd_' + str(c) for c in range(dimension)])
song_vec_pretrain['song_id'] = np.arange(len(song_vec))
train_df_svd = train_df.merge(song_vec_pretrain, on='song_id', how='left')
val_df_svd = val_df.merge(song_vec_pretrain, on='song_id', how='left')

In [None]:

# DNN on the dense columns plus the pre-trained per-song SVD vectors.
tf.reset_default_graph()

config = {
    'model_name': 'dnn_pretrain',
    'ckpt_dir': 'checkpoint',
    'epoch': 1,
    'embedding_size': 4,
    'dnn_units': [128, 64, 32],
    'learning_rate': 0.001,
    'batch_size': 1024
}
# Normalise the column names of BOTH frames; the model looks columns up by
# name when it builds the feed dict.
train_df_svd.columns = [c.replace(' ', '_') for c in train_df_svd.columns]
val_df_svd.columns = [c.replace(' ', '_') for c in val_df_svd.columns]

input_fields = [field for field in train_df_svd.columns if field not in [
    'target', 'msno', 'song_id', 'source_system_tab','source_screen_name', \
        'source_type','language','city','gender','registered_via']]

config['input_fields'] = input_fields

# The SVD columns are ordinary float features, so the plain DNN consumes them
# directly. No class named DNNPretrain is defined in this notebook.
config['model'] = DNN(config=config)
config['train_df'] = train_df_svd
config['val_df'] = val_df_svd



train_loss = train(config)
t = list(zip(*train_loss))
plt.plot(t[0], t[1])


In [None]:
import tensorflow as tf

class DNNEnd2End(object):
    """DNN whose categorical inputs go through embeddings learned with the net.

    Float fields are fed as single-column placeholders. Each embedding field
    is fed as an int64 id column, looked up in its own trainable matrix and
    concatenated with the float inputs before the dense layers.

    Config keys read: ``embedding_fields``, ``input_fields``,
    ``embedding_vocab_size``, ``embedding_size``, ``dnn_units``,
    ``learning_rate``, ``batch_size``.
    """

    def __init__(self, config):
        """Build the graph described by ``config``."""
        self.config = config
        self.embedding_fields = config['embedding_fields']
        self.input_fields = config['input_fields']
        self.embedding_vocab_size = config['embedding_vocab_size']
        self.embedding_size = config['embedding_size']
        self.dnn_units = config['dnn_units']
        self.learning_rate = config['learning_rate']
        self.batch_size = config['batch_size']

        with tf.variable_scope('inputs', reuse=tf.AUTO_REUSE):
            self.inputs = {}
            # Float placeholders first, then int64 id placeholders. A field
            # listed in both groups ends up with the int64 placeholder.
            placeholder_groups = [
                (self.input_fields, np.float32),
                (self.embedding_fields, np.int64),
            ]
            for names, dtype in placeholder_groups:
                for name in names:
                    self.inputs[name] = tf.placeholder(dtype=dtype, shape=[None, 1], name=name.replace(' ', '_'))
            self.labels = tf.placeholder(dtype=tf.float32, name='labels')

        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            self.embedding_matrix_dict = {}
            self.embedding_dict = {}
            for name in self.embedding_fields:
                table = tf.get_variable(name=name, shape=[self.embedding_vocab_size[name], self.embedding_size])
                self.embedding_matrix_dict[name] = table
                # Ids have shape [batch, 1], so the lookup has shape [batch, 1, embedding_size].
                self.embedding_dict[name] = tf.nn.embedding_lookup(table, self.inputs[name])

        # deep: float inputs followed by the embeddings with axis 1 squeezed out
        dense_inputs = [tensor for name, tensor in self.inputs.items() if name not in self.embedding_dict]
        embedded_inputs = [tf.squeeze(vectors, axis=1) for vectors in self.embedding_dict.values()]
        self.feature_concat = tf.concat(dense_inputs + embedded_inputs, axis=1)

        hidden = self.feature_concat
        for width in self.dnn_units:
            hidden = tf.layers.dense(hidden, units=width, activation=tf.nn.relu)

        self.logit = tf.layers.dense(hidden, units=1, activation=None)
        self.outputs = tf.nn.sigmoid(self.logit)

        # loss
        self.loss = tf.losses.log_loss(self.labels, self.outputs)
        adam = tf.train.AdamOptimizer(self.learning_rate)
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = adam.minimize(self.loss, global_step=self.global_step)

    def feed_feature_and_label(self, df):
        """Build the ``feed_dict`` for one batch.

        Args:
            df: DataFrame with a ``target`` column. Columns named in
                ``self.embedding_fields`` are fed as int64, columns named in
                ``self.input_fields`` as float32; other columns are ignored.

        Returns:
            Dict mapping each placeholder to an array of shape [rows, 1].
        """
        feeds = {}
        for name in df.columns:
            # Embedding fields take precedence over float input fields.
            if name in self.embedding_fields:
                dtype = np.int64
            elif name in self.input_fields:
                dtype = np.float32
            else:
                continue
            feeds[self.inputs[name]] = df[name].astype(dtype).values.reshape([-1, 1])
        feeds[self.labels] = df['target'].astype(np.float32).values.reshape([-1, 1])
        return feeds

In [None]:

tf.reset_default_graph()

# Normalise column names first; the model looks columns up by name.
train_df.columns = [c.replace(' ', '_') for c in train_df.columns]
val_df.columns = [c.replace(' ', '_') for c in val_df.columns]

# Columns that are never fed as plain float inputs.
excluded_fields = [
    'target', 'msno', 'song_id', 'source_system_tab','source_screen_name',
    'source_type','language','city','gender','registered_via']
input_fields = [field for field in train_df.columns if field not in excluded_fields]
embedding_fields = ['song_id', 'source_system_tab','source_screen_name', 'source_type']

# Vocabulary size per embedding field = number of distinct values seen across
# the training and validation frames.
embedding_vocab_size = {}
for field in embedding_fields:
    embedding_vocab_size[field] = len(set(list(train_df[field].unique()) + list(val_df[field].unique())))

config = {
    'model_name': 'dnn_end2end',
    'ckpt_dir': 'checkpoint',
    'epoch': 1,
    'embedding_size': 16,
    'dnn_units': [128, 64, 32],
    'learning_rate': 0.001,
    'batch_size': 1024,
    'train_df': train_df,
    'val_df': val_df,
    'input_fields': input_fields,
    'embedding_fields': embedding_fields,
    'embedding_vocab_size': embedding_vocab_size,
}
config['model'] = DNNEnd2End(config=config)

# Train, then plot the recorded training loss against the global step.
train_loss = train(config)
loss_curve = list(zip(*train_loss))
plt.plot(loss_curve[0], loss_curve[1])


In [None]:
import tensorflow as tf

class DeepFM(object):
    """DNN plus a factorisation-machine term over the embedded fields.

    The deep part matches an embedding DNN: float inputs and squeezed
    embeddings are concatenated and passed through ReLU dense layers. The FM
    part adds a dense projection of the concatenated embeddings and a
    pairwise-interaction term. The three logits are summed before the sigmoid.

    Config keys read: ``embedding_fields``, ``input_fields``,
    ``embedding_vocab_size``, ``embedding_size``, ``dnn_units``,
    ``learning_rate``, ``batch_size``.
    """

    def __init__(self, config):
        """Build the graph described by ``config``."""
        self.config = config
        self.embedding_fields = config['embedding_fields']
        self.input_fields = config['input_fields']
        self.embedding_vocab_size = config['embedding_vocab_size']
        self.embedding_size = config['embedding_size']
        self.dnn_units = config['dnn_units']
        self.learning_rate = config['learning_rate']
        self.batch_size = config['batch_size']

        with tf.variable_scope('inputs', reuse=tf.AUTO_REUSE):
            self.inputs = {}
            # Float placeholders first, then int64 id placeholders. A field
            # listed in both groups ends up with the int64 placeholder.
            placeholder_groups = [
                (self.input_fields, np.float32),
                (self.embedding_fields, np.int64),
            ]
            for names, dtype in placeholder_groups:
                for name in names:
                    self.inputs[name] = tf.placeholder(dtype=dtype, shape=[None, 1], name=name.replace(' ', '_'))
            self.labels = tf.placeholder(dtype=tf.float32, name='labels')

        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            self.embedding_matrix_dict = {}
            self.embedding_dict = {}
            for name in self.embedding_fields:
                table = tf.get_variable(name=name, shape=[self.embedding_vocab_size[name], self.embedding_size])
                self.embedding_matrix_dict[name] = table
                # Ids have shape [batch, 1], so the lookup has shape [batch, 1, embedding_size].
                self.embedding_dict[name] = tf.nn.embedding_lookup(table, self.inputs[name])

        # deep: float inputs followed by the embeddings with axis 1 squeezed out
        dense_inputs = [tensor for name, tensor in self.inputs.items() if name not in self.embedding_dict]
        embedded_inputs = [tf.squeeze(vectors, axis=1) for vectors in self.embedding_dict.values()]
        self.feature_concat = tf.concat(dense_inputs + embedded_inputs, axis=1)

        hidden = self.feature_concat
        for width in self.dnn_units:
            hidden = tf.layers.dense(hidden, units=width, activation=tf.nn.relu)

        self.logit = tf.layers.dense(hidden, units=1, activation=None)

        # fm, linear part: one dense unit over all embeddings laid side by side
        field_embeddings = list(self.embedding_dict.values())
        fm_linear_input = tf.squeeze(tf.concat(field_embeddings, axis=2), axis=1)
        self.logit_fm_linear = tf.layers.dense(fm_linear_input, units=1, activation=None)

        # fm, cross part: stack the fields on axis 1, then
        # 0.5 * sum_over_embedding_dim((sum_f v_f)^2 - sum_f v_f^2)
        fm_cross_input = tf.concat(field_embeddings, axis=1)
        field_sum = tf.reduce_sum(fm_cross_input, axis=1, keep_dims=True)
        square_of_sum = tf.square(field_sum)
        sum_of_square = tf.reduce_sum(fm_cross_input * fm_cross_input, axis=1, keep_dims=True)
        self.logit_fm_corss = 0.5 * tf.reduce_sum(square_of_sum - sum_of_square, axis=2, keep_dims=False)

        self.outputs = tf.nn.sigmoid(self.logit + self.logit_fm_linear + self.logit_fm_corss)

        # loss
        self.loss = tf.losses.log_loss(self.labels, self.outputs)
        adam = tf.train.AdamOptimizer(self.learning_rate)
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = adam.minimize(self.loss, global_step=self.global_step)

    def feed_feature_and_label(self, df):
        """Build the ``feed_dict`` for one batch.

        Args:
            df: DataFrame with a ``target`` column. Columns named in
                ``self.embedding_fields`` are fed as int64, columns named in
                ``self.input_fields`` as float32; other columns are ignored.

        Returns:
            Dict mapping each placeholder to an array of shape [rows, 1].
        """
        feeds = {}
        for name in df.columns:
            # Embedding fields take precedence over float input fields.
            if name in self.embedding_fields:
                dtype = np.int64
            elif name in self.input_fields:
                dtype = np.float32
            else:
                continue
            feeds[self.inputs[name]] = df[name].astype(dtype).values.reshape([-1, 1])
        feeds[self.labels] = df['target'].astype(np.float32).values.reshape([-1, 1])
        return feeds

In [None]:

tf.reset_default_graph()

# Normalise column names first; the model looks columns up by name.
train_df.columns = [c.replace(' ', '_') for c in train_df.columns]
val_df.columns = [c.replace(' ', '_') for c in val_df.columns]

# Columns that are never fed as plain float inputs.
excluded_fields = [
    'target', 'msno', 'song_id', 'source_system_tab','source_screen_name',
    'source_type','language','city','gender','registered_via']
input_fields = [field for field in train_df.columns if field not in excluded_fields]
embedding_fields = ['song_id', 'source_system_tab','source_screen_name', 'source_type']

# Vocabulary size per embedding field = number of distinct values seen across
# the training and validation frames.
embedding_vocab_size = {}
for field in embedding_fields:
    embedding_vocab_size[field] = len(set(list(train_df[field].unique()) + list(val_df[field].unique())))

config = {
    'model_name': 'deepfm',
    'ckpt_dir': 'checkpoint',
    'epoch': 1,
    'embedding_size': 16,
    'dnn_units': [128, 64, 32],
    'learning_rate': 0.001,
    'batch_size': 1024,
    'train_df': train_df,
    'val_df': val_df,
    'input_fields': input_fields,
    'embedding_fields': embedding_fields,
    'embedding_vocab_size': embedding_vocab_size,
}
config['model'] = DeepFM(config=config)

# Train, then plot the recorded training loss against the global step.
train_loss = train(config)
loss_curve = list(zip(*train_loss))
plt.plot(loss_curve[0], loss_curve[1])