In [None]:
import os
import sys
import click
import random
import collections

import numpy as np
import lightgbm as lgb
import json
import pandas as pd
from sklearn.metrics import mean_squared_error


In [2]:
import pickle

def save_params(params):
    """
    Save parameters to file
    """
    pickle.dump(params, open('params.p', 'wb'))


def load_params():
    """
    Load parameters from file
    """
    return pickle.load(open('params.p', mode='rb'))


def save_params_with_name(params, name):
    """
    Save parameters to file
    """
    pickle.dump(params, open('{}.p'.format(name), 'wb'))


def load_params_with_name(name):
    """
    Load parameters from file
    """
    return pickle.load(open('{}.p'.format(name), mode='rb'))


In [3]:
b = max(2,3)

In [4]:
# There are 13 integer features and 26 categorical features
#创建列表
continous_features = range(1, 14)
categorial_features = range(14, 40)

# Clip integer features. The clip point for each integer feature
# is derived from the 95% quantile of the total values in each feature
#使列数据平滑，去掉异常大的值
continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]

#连续特征生成器
class ContinuousFeatureGenerator:
    """
    Normalize the integer features to [0, 1] by min-max normalization
    将数值型特征归一化到[0,1]
    """
    #num_feature为数值型特征的种类数
    def __init__(self, num_feature):
        self.num_feature = num_feature
        #生成长度为num_feature的列表
        self.min1 = [sys.maxsize] * num_feature
        self.max1 = [-sys.maxsize] * num_feature

    def build(self, datafile, continous_features):
#         data = pd.read_csv(datafile)
#         for line 
        with open(datafile, 'r') as f:
            for line in f:
                #print(line)
                #生成每行特征值的列表，长度为40
                features = line.split(',')
                #print(features)
                #对数值型特征求最大值和最小值，分行求，存储在min1和max1中
                for i in range(0, self.num_feature):
                    #print(len(features))
                    val = features[continous_features[i]]
                    #print(val) 
                    if val != '':
                        val = int(val)
                        #如果数据过大，则进行截断
                        if val > continous_clip[i]:
                            val = continous_clip[i]
                        #print(val)
                        #a=self.min[i]
                        #b=self.max[i]
                        #对每列特征的最小值进行对比替换
                        
                        self.min1[i] = min(self.min1[i], val)
                        self.max1[i] = max(self.max1[i], val)

    #对数值型特征值进行[0,1]归一化
    def gen(self, idx, val):
        if val == '':
            return 0.0
        val = float(val)
        return (val - self.min1[idx]) / (self.max1[idx] - self.min1[idx])

class CategoryDictGenerator:
    """
    Generate dictionary for each of the categorical features
    出现次数大于cutoff的类别留下
    """
    #类别数目num_feature为26
    def __init__(self, num_feature):
        self.dicts = []
        self.num_feature = num_feature
        #产生元素为字典的列表，其长度为26
        for i in range(0, num_feature):
            self.dicts.append(collections.defaultdict(int))

    def build(self, datafile, categorial_features, cutoff=0):
        with open(datafile, 'r') as f:
            for line in f:
                features = line.split(',')
                #计算每一列不同特征出现的次数
                for i in range(0, self.num_feature):
                    if features[categorial_features[i]] != '':
                        self.dicts[i][features[categorial_features[i]]] += 1
            #print(self.dicts)
        #过滤掉出现次数小于cutoff的特征,将其key值设为unk，并进行排序
        for i in range(0, self.num_feature):
            self.dicts[i] = filter(lambda x: x[1] >= cutoff,
                                   self.dicts[i].items())

            self.dicts[i] = sorted(self.dicts[i], key=lambda x: (-x[1], x[0]))
            vocabs, _ = list(zip(*self.dicts[i]))
            self.dicts[i] = dict(zip(vocabs, range(1, len(vocabs) + 1)))
            self.dicts[i]['<unk>'] = 0

    def gen(self, idx, key):
        if key not in self.dicts[idx]:
            res = self.dicts[idx]['<unk>']
        else:
            res = self.dicts[idx][key]
        return res
    #计算每一列过滤后的特征长度，生成长度列表
    def dicts_sizes(self):
        return list(map(len, self.dicts))

In [5]:
a=collections.defaultdict(int)
print(a)

defaultdict(<class 'int'>, {})


In [6]:
def preprocess(datadir, outdir):
    """
    All the 13 integer features are normalzied to continous values and these
    continous features are combined into one vecotr with dimension 13.

    Each of the 26 categorical features are one-hot encoded and all the one-hot
    vectors are combined into one sparse binary vector.
    """
    #计算生成每一列数值型特征的最大值和最小值
    dists = ContinuousFeatureGenerator(len(continous_features))
    dists.build(os.path.join(datadir, 'Train_400000.csv'), continous_features)
    
    #生成每一列类别型特征过滤后的特征字典
    dicts = CategoryDictGenerator(len(categorial_features))
    dicts.build(
        os.path.join(datadir, 'Train_400000.csv'), categorial_features, cutoff=200)#200 50
    
    #返回每一列类别特征长度列表
    dict_sizes = dicts.dicts_sizes()
    categorial_feature_offset = [0]
    for i in range(1, len(categorial_features)):
        offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
        categorial_feature_offset.append(offset)

    random.seed(0)

    # 90% of the data are used for training, and 10% of the data are used
    # for validation.
    train_ffm = open(os.path.join(outdir, 'train_ffm.txt'), 'w')
    valid_ffm = open(os.path.join(outdir, 'valid_ffm.txt'), 'w')

    train_lgb = open(os.path.join(outdir, 'train_lgb.txt'), 'w')
    valid_lgb = open(os.path.join(outdir, 'valid_lgb.txt'), 'w')

    with open(os.path.join(outdir, 'train.txt'), 'w') as out_train:
        with open(os.path.join(outdir, 'valid.txt'), 'w') as out_valid:
            with open(os.path.join(datadir, 'Train_400000.csv'), 'r') as f:
                for line in f:
                    features = line.split(',')
                    continous_feats = []
                    continous_vals = []
                    for i in range(0, len(continous_features)):
                        val = dists.gen(i, features[continous_features[i]])
                        continous_vals.append(
                            "{0:.6f}".format(val).rstrip('0').rstrip('.'))
                        continous_feats.append(
                            "{0:.6f}".format(val).rstrip('0').rstrip('.'))#('{0}'.format(val))

                    categorial_vals = []
                    categorial_lgb_vals = []
                    for i in range(0, len(categorial_features)):
                        val = dicts.gen(i, features[categorial_features[i]]) + categorial_feature_offset[i]
                        categorial_vals.append(str(val))
                        val_lgb = dicts.gen(i, features[categorial_features[i]])
                        categorial_lgb_vals.append(str(val_lgb))

                    continous_vals = ','.join(continous_vals)
                    categorial_vals = ','.join(categorial_vals)
                    label = features[0]
                    if random.randint(0, 9999) % 10 != 0:
                        out_train.write(','.join(
                            [continous_vals, categorial_vals, label]) + '\n')
                        train_ffm.write('\t'.join(label) + '\t')
                        train_ffm.write('\t'.join(
                            ['{}:{}:{}'.format(ii, ii, val) for ii,val in enumerate(continous_vals.split(','))]) + '\t')
                        train_ffm.write('\t'.join(
                            ['{}:{}:1'.format(ii + 13, str(np.int32(val) + 13)) for ii, val in enumerate(categorial_vals.split(','))]) + '\n')
                        
                        train_lgb.write('\t'.join(label) + '\t')
                        train_lgb.write('\t'.join(continous_feats) + '\t')
                        train_lgb.write('\t'.join(categorial_lgb_vals) + '\n')

                    else:
                        out_valid.write(','.join(
                            [continous_vals, categorial_vals, label]) + '\n')
                        valid_ffm.write('\t'.join(label) + '\t')
                        valid_ffm.write('\t'.join(
                            ['{}:{}:{}'.format(ii, ii, val) for ii,val in enumerate(continous_vals.split(','))]) + '\t')
                        valid_ffm.write('\t'.join(
                            ['{}:{}:1'.format(ii + 13, str(np.int32(val) + 13)) for ii, val in enumerate(categorial_vals.split(','))]) + '\n')
                                                
                        valid_lgb.write('\t'.join(label) + '\t')
                        valid_lgb.write('\t'.join(continous_feats) + '\t')
                        valid_lgb.write('\t'.join(categorial_lgb_vals) + '\n')
                        
    train_ffm.close()
    valid_ffm.close()

    train_lgb.close()
    valid_lgb.close()

    test_ffm = open(os.path.join(outdir, 'test_ffm.txt'), 'w')
    test_lgb = open(os.path.join(outdir, 'test_lgb.txt'), 'w')

    with open(os.path.join(outdir, 'test.txt'), 'w') as out:
        with open(os.path.join(datadir, 'Test_200000data.csv'), 'r') as f:
            for line in f:
                features = line.split(',')

                continous_feats = []
                continous_vals = []
                for i in range(0, len(continous_features)):
                    val = dists.gen(i, features[continous_features[i] - 1])
                    continous_vals.append(
                        "{0:.6f}".format(val).rstrip('0').rstrip('.'))
                    continous_feats.append(
                            "{0:.6f}".format(val).rstrip('0').rstrip('.'))#('{0}'.format(val))

                categorial_vals = []
                categorial_lgb_vals = []
                for i in range(0, len(categorial_features)):
                    val = dicts.gen(i,
                                    features[categorial_features[i] -
                                             1]) + categorial_feature_offset[i]
                    categorial_vals.append(str(val))

                    val_lgb = dicts.gen(i, features[categorial_features[i] - 1])
                    categorial_lgb_vals.append(str(val_lgb))

                continous_vals = ','.join(continous_vals)
                categorial_vals = ','.join(categorial_vals)

                out.write(','.join([continous_vals, categorial_vals]) + '\n')
                
                test_ffm.write('\t'.join(['{}:{}:{}'.format(ii, ii, val) for ii,val in enumerate(continous_vals.split(','))]) + '\t')
                test_ffm.write('\t'.join(
                    ['{}:{}:1'.format(ii + 13, str(np.int32(val) + 13)) for ii, val in enumerate(categorial_vals.split(','))]) + '\n')
                                                                
                test_lgb.write('\t'.join(continous_feats) + '\t')
                test_lgb.write('\t'.join(categorial_lgb_vals) + '\n')

    test_ffm.close()
    test_lgb.close()
    return dict_sizes

In [45]:
dict_sizes = preprocess('./data','./data')

In [46]:
save_params_with_name((dict_sizes), 'dict_sizes') #pickle.dump((dict_sizes), open('dict_sizes.p', 'wb'))

In [47]:
dict_sizes = load_params_with_name('dict_sizes') #pickle.load(open('dict_sizes.p', mode='rb'))

In [48]:
sum(dict_sizes)

3805

# 训练FFM
数据准备好了，开始调用LibFFM，训练FFM模型。

learning rate是0.1，迭代32次，训练好后保存的模型文件是model_ffm。

In [37]:
import subprocess, sys, os, time

NR_THREAD = 1

In [38]:
cmd = './libffm/libffm/ffm-train --auto-stop -r 0.1 -t 32 -s {nr_thread} -p ./data/valid_ffm.txt ./data/train_ffm.txt model_ffm'.format(nr_thread=NR_THREAD) 
os.popen(cmd).readlines()

[]

FFM模型训练好了，我们把训练、验证和测试数据输入给FFM，得到FFM层的输出，输出的文件名为*.out.logit

In [76]:
cmd = './libffm/libffm/ffm-predict ./data/train_ffm.txt model_ffm tr_ffm.out'.format(nr_thread=NR_THREAD) 
os.popen(cmd).readlines()

[]

In [77]:
cmd = './libffm/libffm/ffm-predict ./data/valid_ffm.txt model_ffm va_ffm.out'.format(nr_thread=NR_THREAD) 
os.popen(cmd).readlines()

[]

In [78]:
cmd = './libffm/libffm/ffm-predict ./data/test_ffm.txt model_ffm te_ffm.out true'.format(nr_thread=NR_THREAD) 
os.popen(cmd).readlines()

[]

# 训练GBDT¶
现在调用LightGBM训练GBDT模型，因为决策树较容易过拟合，我们设置树的个数为32，叶子节点数设为30，深度就不设置了，学习率设为0.05。

In [79]:
def lgb_pred(tr_path, va_path, _sep = '\t', iter_num = 32):
    # load or create your dataset
    print('Load data...')
    df_train = pd.read_csv(tr_path, header=None, sep=_sep)
    df_test = pd.read_csv(va_path, header=None, sep=_sep)
    
    y_train = df_train[0].values
    y_test = df_test[0].values
    X_train = df_train.drop(0, axis=1).values
    X_test = df_test.drop(0, axis=1).values
    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    # specify your configurations as a dict
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'l2', 'auc', 'logloss'},
        'num_leaves': 30,
#         'max_depth': 7,
        'num_trees': 32,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    print('Start training...')
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=iter_num,
                    valid_sets=lgb_eval,
                    feature_name=["I1","I2","I3","I4","I5","I6","I7","I8","I9","I10","I11","I12","I13","C1","C2","C3","C4","C5","C6","C7","C8","C9","C10","C11","C12","C13","C14","C15","C16","C17","C18","C19","C20","C21","C22","C23","C24","C25","C26"],
                    categorical_feature=["C1","C2","C3","C4","C5","C6","C7","C8","C9","C10","C11","C12","C13","C14","C15","C16","C17","C18","C19","C20","C21","C22","C23","C24","C25","C26"],
                    early_stopping_rounds=5)
    
    print('Save model...')
    # save model to file
    gbm.save_model('lgb_model.txt')
    
    print('Start predicting...')
    # predict
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # eval
    print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

    return gbm,y_pred,X_train,y_train

In [18]:
gbm,y_pred,X_train ,y_train = lgb_pred('./data/train_lgb.txt', './data/valid_lgb.txt', '\t', 256)

Load data...
Start training...


New categorical_feature is ['C1', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C2', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's l2: 0.185497	valid_0's auc: 0.701065
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.183755	valid_0's auc: 0.712439
[3]	valid_0's l2: 0.18212	valid_0's auc: 0.716175
[4]	valid_0's l2: 0.180617	valid_0's auc: 0.719581
[5]	valid_0's l2: 0.179246	valid_0's auc: 0.723186
[6]	valid_0's l2: 0.177917	valid_0's auc: 0.724594
[7]	valid_0's l2: 0.176705	valid_0's auc: 0.726352
[8]	valid_0's l2: 0.175644	valid_0's auc: 0.727929
[9]	valid_0's l2: 0.174641	valid_0's auc: 0.728366
[10]	valid_0's l2: 0.173646	valid_0's auc: 0.729264
[11]	valid_0's l2: 0.172776	valid_0's auc: 0.730176
[12]	valid_0's l2: 0.17199	valid_0's auc: 0.731295
[13]	valid_0's l2: 0.17125	valid_0's auc: 0.732242
[14]	valid_0's l2: 0.170516	valid_0's auc: 0.733075
[15]	valid_0's l2: 0.169859	valid_0's auc: 0.733591
[16]	valid_0's l2: 0.169215	valid_0's auc: 0.7345
[17]	valid_0's l2: 0.168632	valid_0's auc: 0.735134
[18]	valid_0's l2: 0.168061	valid_0's auc: 0.735961
[19]	valid_0's l2

In [None]:
查看每个特征的重要程度¶

In [80]:
gbm.feature_importance()

array([ 19,   0,  28,   6,  15,  62,  19,   8,  11,   0,  33,   0,  27,
         0,  34,   0,  68,   0,   3,  22,   0,   3,  12,  36,   5, 162,
        17, 149,   9,  13,  83,   0,   2,   8,   0,  20,  35,   0,  19])

In [81]:
gbm.feature_importance("gain")

array([17588.60017395,     0.        , 13697.72706604,  1527.25202942,
        5273.76097107, 61430.39205933, 58175.77937317,  4058.06893921,
        5047.81399536,     0.        , 60615.7379303 ,     0.        ,
       15147.96902466,     0.        , 10214.86604309,     0.        ,
       20004.60100555,     0.        ,   489.92799377,  4250.53800964,
           0.        ,  1140.58499146,  1641.02298737,  5589.14899445,
        1097.49501801, 46073.73500824,  6028.73600769, 40460.85097504,
        2438.44700623,  7649.37095642, 23015.25582886,     0.        ,
         379.0269928 ,  3998.13604736,     0.        ,  5586.1529541 ,
       11735.71200562,     0.        ,  4622.90000153])

In [82]:
def ret_feat_impt(gbm):
    gain = gbm.feature_importance("gain").reshape(-1, 1) / sum(gbm.feature_importance("gain"))
    col = np.array(gbm.feature_name()).reshape(-1, 1)
    return sorted(np.column_stack((col, gain)),key=lambda x: x[1],reverse=True)

In [83]:
ret_feat_impt(gbm)

[array(['I6', '0.13993905549460622'], dtype='<U32'),
 array(['I11', '0.13808326513499225'], dtype='<U32'),
 array(['I7', '0.13252501482135512'], dtype='<U32'),
 array(['C13', '0.10495643514589945'], dtype='<U32'),
 array(['C15', '0.0921702284511962'], dtype='<U32'),
 array(['C18', '0.05242898595774266'], dtype='<U32'),
 array(['C4', '0.045570683767737805'], dtype='<U32'),
 array(['I1', '0.040067009395574664'], dtype='<U32'),
 array(['I13', '0.034507226910175645'], dtype='<U32'),
 array(['I3', '0.031203561035282972'], dtype='<U32'),
 array(['C24', '0.02673407084943356'], dtype='<U32'),
 array(['C2', '0.023269568338316327'], dtype='<U32'),
 array(['C17', '0.017425344538508466'], dtype='<U32'),
 array(['C14', '0.01373352170579938'], dtype='<U32'),
 array(['C11', '0.012732138035941956'], dtype='<U32'),
 array(['C23', '0.012725313025683329'], dtype='<U32'),
 array(['I5', '0.012013680923329197'], dtype='<U32'),
 array(['I9', '0.011498971423479259'], dtype='<U32'),
 array(['C26', '0.010531013

保存GBDT参数

In [84]:
dump = gbm.dump_model()

In [85]:
save_params_with_name((gbm, dump), 'gbm_dump') 

In [86]:
gbm, dump = load_params_with_name('gbm_dump') 

In [None]:
通过eli5分析参数

In [87]:
import eli5 

from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [88]:
import csv
import numpy as np

with open('./data/train_eli5.csv', 'rt') as f:
    data = list(csv.DictReader(f))
data[:1]

FileNotFoundError: [Errno 2] No such file or directory: './data/train_eli5.csv'

In [None]:
_all_xs = [{k: v for k, v in row.items() if k != 'clicked'} for row in data]
_all_ys = np.array([int(row['clicked']) for row in data])

all_xs, all_ys = shuffle(_all_xs, _all_ys, random_state=0)
train_xs, valid_xs, train_ys, valid_ys = train_test_split(
    all_xs, all_ys, test_size=0.25, random_state=0)
print('{} items total, {:.1%} true'.format(len(all_xs), np.mean(all_ys)))

# 用LightGBM的输出生成FM数据
GBDT已经训练好了，我们需要GBDT输出的叶子节点作为输入数据X传给FM，一共30个叶子节点，那么输入给FM的数据格式就是X中不是0的数据的index:value。

一段真实数据如下：0 0:31 1:61 2:93 3:108 4:149 5:182 6:212 7:242 8:277 9:310 10:334 11:365 12:401 13:434 14:465 15:491 16:527 17:552 18:589 19:619 20:648 21:678 22:697 23:744 24:770 25:806 26:826 27:862 28:899 29:928 30:955 31:988

In [None]:
def generat_lgb2fm_data(outdir, gbm, dump, tr_path, va_path, te_path, _sep = '\t'):
    with open(os.path.join(outdir, 'train_lgb2fm.txt'), 'w') as out_train:
        with open(os.path.join(outdir, 'valid_lgb2fm.txt'), 'w') as out_valid:
            with open(os.path.join(outdir, 'test_lgb2fm.txt'), 'w') as out_test:
                df_train_ = pd.read_csv(tr_path, header=None, sep=_sep)
                df_valid_ = pd.read_csv(va_path, header=None, sep=_sep)
                df_test_= pd.read_csv(te_path, header=None, sep=_sep)

                y_train_ = df_train_[0].values
                y_valid_ = df_valid_[0].values                

                X_train_ = df_train_.drop(0, axis=1).values
                X_valid_ = df_valid_.drop(0, axis=1).values
                X_test_= df_test_.values
   
                train_leaves= gbm.predict(X_train_, num_iteration=gbm.best_iteration, pred_leaf=True)
                valid_leaves= gbm.predict(X_valid_, num_iteration=gbm.best_iteration, pred_leaf=True)
                test_leaves= gbm.predict(X_test_, num_iteration=gbm.best_iteration, pred_leaf=True)

                tree_info = dump['tree_info']
                tree_counts = len(tree_info)
                for i in range(tree_counts):
                    train_leaves[:, i] = train_leaves[:, i] + tree_info[i]['num_leaves'] * i + 1
                    valid_leaves[:, i] = valid_leaves[:, i] + tree_info[i]['num_leaves'] * i + 1
                    test_leaves[:, i] = test_leaves[:, i] + tree_info[i]['num_leaves'] * i + 1
#                     print(train_leaves[:, i])
#                     print(tree_info[i]['num_leaves'])

                for idx in range(len(y_train_)):            
                    out_train.write((str(y_train_[idx]) + '\t'))
                    out_train.write('\t'.join(
                        ['{}:{}'.format(ii, val) for ii,val in enumerate(train_leaves[idx]) if float(val) != 0 ]) + '\n')
                    
                for idx in range(len(y_valid_)):                   
                    out_valid.write((str(y_valid_[idx]) + '\t'))
                    out_valid.write('\t'.join(
                        ['{}:{}'.format(ii, val) for ii,val in enumerate(valid_leaves[idx]) if float(val) != 0 ]) + '\n')
                    
                for idx in range(len(X_test_)):                   
                    out_test.write('\t'.join(
                        ['{}:{}'.format(ii, val) for ii,val in enumerate(test_leaves[idx]) if float(val) != 0 ]) + '\n')

In [None]:
generat_lgb2fm_data('./data', gbm, dump, './data/train_lgb.txt', './data/valid_lgb.txt', './data/test_lgb.txt', '\t')

# 训练FM
为训练FM的数据已经准备好了，我们调用LibFM进行训练。

迭代64次，使用sgd训练，学习率是0.00000001，训练好的模型保存为文件fm_model。

训练输出的log，Train和Test的数值不是loss，是accuracy。

In [29]:
cmd = './libfm/libfm/bin/libFM -task c -train ./data/train_lgb2fm.txt -test ./data/valid_lgb2fm.txt -dim ’1,1,8’ -iter 64 -method sgd -learn_rate 0.00000001 -regular ’0,0,0.01’ -init_stdev 0.1 -save_model fm_model'
os.popen(cmd).readlines()

[]

In [None]:
FM模型训练好了，我们把训练、验证和测试数据输入给FM，得到FM层的输出，输出的文件名为*.fm.logits

In [30]:
cmd = './libfm/libfm/bin/libFM -task c -train ./data/train_lgb2fm.txt -test ./data/valid_lgb2fm.txt -dim ’1,1,8’ -iter 32 -method sgd -learn_rate 0.00000001 -regular ’0,0,0.01’ -init_stdev 0.1 -load_model fm_model -train_off true -prefix tr'
os.popen(cmd).readlines()

[]

In [31]:
cmd = './libfm/libfm/bin/libFM -task c -train ./data/valid_lgb2fm.txt -test ./data/valid_lgb2fm.txt -dim ’1,1,8’ -iter 32 -method sgd -learn_rate 0.00000001 -regular ’0,0,0.01’ -init_stdev 0.1 -load_model fm_model -train_off true -prefix va'
os.popen(cmd).readlines()

[]

In [32]:
cmd = './libfm/libfm/bin/libFM -task c -train ./data/test_lgb2fm.txt -test ./data/valid_lgb2fm.txt -dim ’1,1,8’ -iter 32 -method sgd -learn_rate 0.00000001 -regular ’0,0,0.01’ -init_stdev 0.1 -load_model fm_model -train_off true -prefix te -test2predict true'
os.popen(cmd).readlines()

[]

In [33]:
embed_dim = 32
sparse_max = 30000 # sparse_feature_dim = 117568
sparse_dim = 26
dense_dim = 13
out_dim = 400

In [34]:
import tensorflow as tf
def get_inputs():
    dense_input = tf.placeholder(tf.float32, [None, dense_dim], name="dense_input")
    sparse_input = tf.placeholder(tf.int32, [None, sparse_dim], name="sparse_input")
    FFM_input = tf.placeholder(tf.float32, [None, 1], name="FFM_input")
    FM_input = tf.placeholder(tf.float32, [None, 1], name="FM_input")
    
    targets = tf.placeholder(tf.float32, [None, 1], name="targets")
    LearningRate = tf.placeholder(tf.float32, name = "LearningRate")
    return dense_input, sparse_input, FFM_input, FM_input, targets, LearningRate

In [35]:
def get_sparse_embedding(sparse_input):
    with tf.name_scope("sparse_embedding"):
        sparse_embed_matrix = tf.Variable(tf.random_uniform([sparse_max, embed_dim], -1, 1), name = "sparse_embed_matrix")
        sparse_embed_layer = tf.nn.embedding_lookup(sparse_embed_matrix, sparse_input, name = "sparse_embed_layer")
        sparse_embed_layer = tf.reshape(sparse_embed_layer, [-1, sparse_dim * embed_dim])
    return sparse_embed_layer

In [36]:
def get_dnn_layer(dense_input, sparse_embed_layer):
    with tf.name_scope("dnn_layer"):
        input_combine_layer = tf.concat([dense_input, sparse_embed_layer], 1)  #(?, 845 = 832 + 13)
        fc1_layer = tf.layers.dense(input_combine_layer, out_dim, name = "fc1_layer", activation=tf.nn.relu)
        fc2_layer = tf.layers.dense(fc1_layer, out_dim, name = "fc2_layer", activation=tf.nn.relu)
        fc3_layer = tf.layers.dense(fc2_layer, out_dim, name = "fc3_layer", activation=tf.nn.relu)
    return fc3_layer

In [37]:
tf.reset_default_graph()
train_graph = tf.Graph()
with train_graph.as_default():
    dense_input, sparse_input, FFM_input, FM_input, targets, lr = get_inputs()
    sparse_embed_layer = get_sparse_embedding(sparse_input)
    fc3_layer = get_dnn_layer(dense_input, sparse_embed_layer)

    ffm_fc_layer = tf.layers.dense(FFM_input, 1, name = "ffm_fc_layer")
    fm_fc_layer = tf.layers.dense(FM_input, 1, name = "fm_fc_layer")
    feature_combine_layer = tf.concat([ffm_fc_layer, fm_fc_layer, fc3_layer], 1)  #(?, 402)

    with tf.name_scope("inference"):
        logits = tf.layers.dense(feature_combine_layer, 1, name = "logits_layer")
        pred = tf.nn.sigmoid(logits, name = "prediction")
    
    with tf.name_scope("loss"):
        # LogLoss损失，Logistic回归到点击率
#         cost = tf.losses.sigmoid_cross_entropy(targets, logits )
        sigmoid_cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits, name = "sigmoid_cost")
        logloss_cost = tf.losses.log_loss(labels=targets, predictions=pred)
        cost = logloss_cost # + sigmoid_cost
        loss = tf.reduce_mean(cost)
    # 优化损失 
#     train_op = tf.train.AdamOptimizer(lr).minimize(loss)  #cost
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.FtrlOptimizer(lr)  #tf.train.FtrlOptimizer(lr)  AdamOptimizer
    gradients = optimizer.compute_gradients(loss)  #cost
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)
    
    # Accuracy
    with tf.name_scope("score"):
        correct_prediction = tf.equal(tf.to_float(pred > 0.5), targets)
        accuracy = tf.reduce_mean(tf.to_float(correct_prediction), name="accuracy")
        
#     auc, uop = tf.contrib.metrics.streaming_auc(pred, targets)

In [42]:
# Number of Epochs
num_epochs = 1
# Batch Size
batch_size = 32

# Learning Rate
learning_rate = 0.01
# Show stats for every n number of batches
show_every_n_batches = 25

save_dir = './save'

ffm_tr_out_path = './tr_ffm.out.logit'
ffm_va_out_path = './va_ffm.out.logit'
fm_tr_out_path = './tr.fm.logits'
fm_va_out_path = './va.fm.logits'
train_path = './data/train.txt'
valid_path = './data/valid.txt'

In [43]:
def get_batches_downsample(Xs, ys, batch_size):
    ind_0 = ys==0
    ind_1 = ys==1
    Xs_0 = Xs[ind_0]
    ys_0 = ys[ind_0]
    Xs_1 = Xs[ind_1]
    ys_1 = ys[ind_1]
    sampling_ind = np.random.permutation(Xs_0.shape[0])[:Xs_1.shape[0]]
    Xs_0_sampling = Xs_0[sampling_ind]
    ys_0_sampling = ys_0[sampling_ind]
    Xs_downsampled = np.concatenate((Xs_0_sampling, Xs_1))
    ys_downsampled = np.concatenate((ys_0_sampling, ys_1))
    downsampled_ind = np.random.permutation(Xs_downsampled.shape[0])
    Xs_downsampled = Xs_downsampled[downsampled_ind]
    ys_downsampled = ys_downsampled[downsampled_ind]
    for start in range(0, len(Xs_downsampled), batch_size):
        end = min(start + batch_size, len(Xs_downsampled))
        yield Xs_downsampled[start:end], ys_downsampled[start:end]

In [44]:
def get_batches(Xs, ys, batch_size):
    for start in range(0, len(Xs), batch_size):
        end = min(start + batch_size, len(Xs))
        yield Xs[start:end], ys[start:end]

In [41]:
ffm_train = pd.read_csv(ffm_tr_out_path, header=None)    
ffm_train = ffm_train[0].values

ffm_valid = pd.read_csv(ffm_va_out_path, header=None)    
ffm_valid = ffm_valid[0].values

FileNotFoundError: File b'./tr_ffm.out.logit' does not exist