TEST

In [1]:
#!/usr/bin/python
# -*- coding=utf-8 -*-

import xgboost as xgb
import random
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve, auc
import numpy as np
import argparse
import os
import pickle


def mkdir(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    Leading/trailing whitespace is stripped from ``path`` before use.

    Args:
        path: Directory path to create.

    Returns:
        True if the directory was created by this call, False if something
        already exists at ``path``.

    Raises:
        OSError: If creation fails and nothing exists at ``path`` afterwards
            (e.g. empty path, permission denied).
    """
    path = path.strip()
    try:
        # EAFP: attempt the creation directly instead of checking first, so a
        # directory created concurrently between a check and the creation
        # cannot crash the call.
        os.makedirs(path)
    except OSError:
        # Some platforms report an existing path with an error other than
        # FileExistsError, so decide by whether the path exists now.
        if not os.path.exists(path):
            raise
        return False
    return True


def _auc_and_logloss(y_true, y_score):
    """Return ``(roc_auc, logloss)`` for labels ``y_true`` and scores ``y_score``.

    The ROC curve is computed with label 1 as the positive class.
    """
    fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=1)
    return auc(fpr, tpr), log_loss(y_true, y_score)


if __name__ == '__main__':
    # Script flow: parse CLI options, train a booster on the training data,
    # score it on the validation data as a whole and per task, then append
    # the parameters and all scores to <folder>/result.csv.

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--folder', type=str, default='./')  # folder that stores the log
    parser.add_argument('-s', '--seed', type=int, default=27)

    args = parser.parse_args()

    random.seed(args.seed)

    # NOTE(review): the keys from 'debug' downwards are not standard XGBoost
    # parameters (stock XGBoost warns that they "might not be used");
    # presumably they target a multi-task fork -- confirm before removing.
    param = {
        "early_stopping_rounds": 100,
        "reg_alpha": 0.0005,
        "colsample_bytree": 1.0,
        "colsample_bylevel": 0.8,
        "scale_pos_weight": 1,
        "learning_rate": 0.3,
        "nthread": 8,
        "min_child_weight": 1,
        "n_estimators": 1000,
        "subsample": 1,
        "reg_lambda": 12,
        "seed": args.seed,
        "objective": 'binary:logistic',
        "max_depth": 9,
        "gamma": 0.45,
        'eval_metric': 'auc',
        'silent': 1,
        'tree_method': 'exact',
        'debug': 0,
        'use_task_gain_self': 0,
        'when_task_split': 1,
        'how_task_split': 0,
        'min_task_gain': 0.0,
        'task_gain_margin': 0.0,
        'max_neg_sample_ratio': 0.5,
        'which_task_value': 2,
        'baseline_alpha': 1.0,
        'baseline_lambda': 1.0,
        'tasks_list_': (1, 2, 3, 4),
        'task_num_for_init_vec': 5,
        'task_num_for_OLF': 4,
    }

    folder = args.folder

    mkdir(folder)
    # File-name prefix (not a directory): data files are './amazon.<name>.data'.
    data_folder = './amazon.'

    # load data
    dtrain = xgb.DMatrix(data_folder + 'train.data')
    # NOTE(review): dtest and deval both read 'val.data', so the "all" score
    # below is measured on the same data that drives early stopping -- confirm
    # whether a separate test file was intended.
    dtest = xgb.DMatrix(data_folder + 'val.data')
    deval = xgb.DMatrix(data_folder + 'val.data')

    # One validation matrix per task, keyed by task id (a dict imposes no
    # upper limit on the task ids).
    vals = {
        task: xgb.DMatrix(data_folder + 'val_' + str(task) + '.data')
        for task in param['tasks_list_']
    }

    # train
    evallist = [(dtrain, 'train'), (deval, 'eval')]
    bst = xgb.train(param, dtrain, param['n_estimators'], early_stopping_rounds=param['early_stopping_rounds'], evals=evallist)

    # save model
    # with open('mt-gbdt.model', 'wb') as model:
    #     pickle.dump(bst, model)
    # load model
    # with open('mt-gbdt.model', 'rb') as model:
    #     bst = pickle.load(model)

    # overall ROC-AUC / log-loss using the early-stopped model size
    best_ntree_limit = bst.best_ntree_limit
    all_roc_auc, all_logloss = _auc_and_logloss(
        dtest.get_label(),
        bst.predict(dtest, ntree_limit=best_ntree_limit),
    )

    # output
    # mkdir() strips whitespace from the folder name, so strip here too in
    # order to write into the directory that was actually created.
    result_path = os.path.join(folder.strip(), 'result.csv')
    # Opened only once results exist, and via `with`, so the handle is closed
    # even if evaluation raises part-way through.
    with open(result_path, 'a') as fout:
        fout.write('\n')
        for key, value in param.items():
            fout.write('{},{},'.format(key, value))
        fout.write('\n')
        fout.write('task,auc,logloss\n')
        for task in param['tasks_list_']:
            best_auc = 0.5
            best_logloss = 0
            tree_num = 0
            task_matrix = vals[task]
            task_labels = task_matrix.get_label()
            # Search over model sizes for the best per-task AUC. The upper
            # bound is inclusive so the size used for the overall score is a
            # candidate as well. NOTE(review): the search starts at 2 as in
            # the original script; reason not visible here.
            for tree in range(2, best_ntree_limit + 1):
                task_scores = task_matrix.predict(task_matrix, ntree_limit=tree) if False else bst.predict(task_matrix, ntree_limit=tree)
                roc_auc, logloss = _auc_and_logloss(task_labels, task_scores)
                if roc_auc > best_auc:
                    best_auc = roc_auc
                    best_logloss = logloss
                    tree_num = tree

            print("task {} 's AUC={} logloss={} at {} tree".format(task, best_auc, best_logloss, tree_num))
            fout.write("{},{},{}\n".format(task, best_auc, best_logloss))
        fout.write("all,{},{},\n".format(all_roc_auc, all_logloss))


Parameters: { "baseline_alpha", "baseline_lambda", "debug", "early_stopping_rounds", "how_task_split", "max_neg_sample_ratio", "min_task_gain", "n_estimators", "silent", "task_gain_margin", "task_num_for_OLF", "task_num_for_init_vec", "tasks_list_", "use_task_gain_self", "when_task_split", "which_task_value" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.86876	eval-auc:0.68617
[1]	train-auc:0.87649	eval-auc:0.66565
[2]	train-auc:0.93405	eval-auc:0.70747
[3]	train-auc:0.95449	eval-auc:0.71609
[4]	train-auc:0.95997	eval-auc:0.71703
[5]	train-auc:0.97312	eval-auc:0.73849
[6]	train-auc:0.97984	eval-auc:0.74186
[7]	train-auc:0.98239	eval-auc:0.74937
[8]	train-auc:0.98601	eval-auc:0.75368
[9]	train-auc:0.98614	eval-auc:0.76300
[10]	train

[186]	train-auc:0.99927	eval-auc:0.82409
[187]	train-auc:0.99927	eval-auc:0.82409
[188]	train-auc:0.99927	eval-auc:0.82409
[189]	train-auc:0.99927	eval-auc:0.82409
[190]	train-auc:0.99927	eval-auc:0.82409
[191]	train-auc:0.99927	eval-auc:0.82409
[192]	train-auc:0.99935	eval-auc:0.82347
[193]	train-auc:0.99935	eval-auc:0.82347
[194]	train-auc:0.99935	eval-auc:0.82347
[195]	train-auc:0.99935	eval-auc:0.82347
[196]	train-auc:0.99935	eval-auc:0.82347
[197]	train-auc:0.99935	eval-auc:0.82347
[198]	train-auc:0.99935	eval-auc:0.82347
[199]	train-auc:0.99935	eval-auc:0.82347
[200]	train-auc:0.99935	eval-auc:0.82347
[201]	train-auc:0.99935	eval-auc:0.82347
[202]	train-auc:0.99935	eval-auc:0.82347
[203]	train-auc:0.99935	eval-auc:0.82347
[204]	train-auc:0.99933	eval-auc:0.82566
[205]	train-auc:0.99933	eval-auc:0.82566
[206]	train-auc:0.99933	eval-auc:0.82566
[207]	train-auc:0.99933	eval-auc:0.82566
[208]	train-auc:0.99933	eval-auc:0.82566
[209]	train-auc:0.99933	eval-auc:0.82566
[210]	train-auc:











task 1 's AUC=0.7877237851662404 logloss=0.5914350965060293 at 205 tree












task 2 's AUC=0.7416879795396419 logloss=0.6273818855639547 at 77 tree












task 3 's AUC=0.9267676767676768 logloss=0.3684943293919787 at 205 tree










task 4 's AUC=0.86 logloss=0.5108452992513776 at 213 tree


