In [1]:
import argparse
import ast
import time

import numpy as np
import xgboost as xgb

RNG = np.random.RandomState(1994)


def run_benchmark(args):
    """Runs the benchmark."""
    try:
        dtest = xgb.DMatrix('dtest.dm')
        dtrain = xgb.DMatrix('dtrain.dm')

        if not (dtest.num_col() == args.columns
                and dtrain.num_col() == args.columns):
            raise ValueError("Wrong cols")
        if not (dtest.num_row() == args.rows * args.test_size
                and dtrain.num_row() == args.rows * (1 - args.test_size)):
            raise ValueError("Wrong rows")
    except:
        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
        tmp = time.time()
        X = RNG.rand(args.rows, args.columns)
        y = RNG.randint(0, 2, args.rows)
        if 0.0 < args.sparsity < 1.0:
            X = np.array([[np.nan if RNG.uniform(0, 1) < args.sparsity else x for x in x_row]
                          for x_row in X])

        train_rows = int(args.rows * (1.0 - args.test_size))
        test_rows = int(args.rows * args.test_size)
        X_train = X[:train_rows, :]
        X_test = X[-test_rows:, :]
        y_train = y[:train_rows]
        y_test = y[-test_rows:]
        print("Generate Time: %s seconds" % (str(time.time() - tmp)))
        del X, y

        tmp = time.time()
        print("DMatrix Start")
        dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
        del X_train, y_train, X_test, y_test

        dtest.save_binary('dtest.dm')
        dtrain.save_binary('dtrain.dm')

    param = {'objective': 'binary:logistic'}
    if args.params != '':
        param.update(ast.literal_eval(args.params))

    param['tree_method'] = 'gpu_hist'
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print("Train Time: %s seconds" % (str(time.time() - tmp)))


def main():
    """The main function.

    Defines and parses command line arguments and calls the benchmark.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--tree_method', default='gpu_hist')
    parser.add_argument('--sparsity', type=float, default=0.0)
    parser.add_argument('--rows', type=int, default=1000000)
    parser.add_argument('--columns', type=int, default=50)
    parser.add_argument('--iterations', type=int, default=500)
    parser.add_argument('--test_size', type=float, default=0.25)
    parser.add_argument('--params', default='',
                        help='Provide additional parameters as a Python dict string, e.g. --params '
                             '\"{\'max_depth\':2}\"')
    args = parser.parse_args()

    run_benchmark(args)


if __name__ == '__main__':
    main()


usage: ipykernel_launcher.py [-h] [--tree_method TREE_METHOD]
                             [--sparsity SPARSITY] [--rows ROWS]
                             [--columns COLUMNS] [--iterations ITERATIONS]
                             [--test_size TEST_SIZE] [--params PARAMS]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\chengben\AppData\Roaming\jupyter\runtime\kernel-102a6bd4-53bc-4e3b-bb7b-2ffd250b310f.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 10 16:42:50 2019

@author: chengben
"""
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import datetime
import warnings
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int': #前三个字符为int那就为整型
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:#先看八位整型能不能满足条件
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:#不是整型就是浮点型
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

def logloss(y_true, y_pred,deta = 3, eps=1e-15):
    # Prepare numpy array data
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    assert (len(y_true) and len(y_true) == len(y_pred))
    # Clip y_pred between eps and 1-eps
    p = np.clip(y_pred, eps, 1-eps)
    loss = np.sum(- y_true * np.log(p) * deta - (1 - y_true) * np.log(1-p))
    return loss / len(y_true)

train = reduce_mem_usage(pd.read_csv('round1_diac2019_train.csv'))

#drop_duplicates() 去除该列的重复项
all_customer = pd.DataFrame(train[['customer_id']]).drop_duplicates(['customer_id']).dropna()
#print(all_customer.shape)
#
#train['order_pay_time'] = pd.to_datetime(train['order_pay_time'])
#train['order_pay_date'] = train['order_pay_time'].dt.date
#
#train_data = train[train['order_pay_date'].astype(str)<'2013-07-03']
#online_train_data = train
#train_labels = train[train['order_pay_date'].astype(str)>='2013-07-03'] 
#
##简单的特征生成代码，改进空间很大
#
#def make_feature_and_label(date1,date2,isSubmit):
#    date1['count'] = 1   
#    #每个id的购物数量   agg({}) 字典里面的key如果在dataframe里面那么['count']可以省略
#    #被分组后的数据也是dataframe类型
#    #如customer_id  列索引第一个是分组依据：customer_id 第二个是 count
#    customer_id = date1.groupby(['customer_id'], as_index=False)['count'].agg({'count':'count'}) #对count进行count聚合
#    #统计每个用户买的商品的价格
#    #行索引由于是as_index=False 所以还是数字， 但是列索引分别为： customers_id good_price_max,good_price_mean,good_price_min
#    good_price = date1.groupby(['customer_id'], as_index=False)['goods_price'].agg({'goods_price_max':'max',
#                                                                                     'goode_price_mean':'mean',
#                                                                                     'goode_price_max':'min'})
#    
#    last_time = date1.groupby(['customer_id'], as_index=False)['order_pay_date'].agg({'order_pay_date_last':'max','order_pay_date_first':'min'})
#    
#    #将以上特征合并到一个表格，但是由于他们都是dataframe 所以用merge 合并键为customer_id 合并方式以左侧为准
#    data = pd.merge(customer_id,good_price,on=['customer_id'],how='left',copy=False)
#    data = pd.merge(data, last_time, on=['customer_id'],how='left',copy=False)
#    data['long_time'] = pd.to_datetime(data['order_pay_date_last']) - pd.to_datetime(data['order_pay_date_first'])
#    data['long_time'] = data['long_time'].dt.days + 1
#    del data['order_pay_date_first']
#    if isSubmit==False: #表示为训练集不仅要提取特征，还要进行标注，利用date2进行标注
#        data['order_pay_date_last'] = pd.to_datetime(date2['order_pay_date'].min()) - pd.to_datetime(data['order_pay_date_last'])
#        data['order_pay_date_last'] = data['order_pay_date_last'].dt.days + 1
#        data['labels'] = 0
#        data.loc[data['customer_id'].isin(list(date2['customer_id'].unique())),'labels'] = 1
#        print(data['labels'].mean())
#    else:#表示为线上数据集，只需要进行特征提取就行
#        data['order_pay_date_last'] = pd.to_datetime('2013-12-31')- pd.to_datetime(data['order_pay_date_last'])
#        data['order_pay_date_last'] = data['order_pay_date_last'].dt.days + 1
#    print(data.shape)
#    return data
#train = make_feature_and_label(train_data,train_labels,False)
#train.to_csv('data/train_data.csv',index=False)
#submit = make_feature_and_label(online_train_data,None,True)
## ============================获取用户行为特征=========================
##def user_item(data_date):
##    """
##    返回用户在data_date时间段内购买的商品数，返回形式
##     DataFrame = {index = ‘customer_id’
##                 columns = '(hours,behavours)'
##                 values = 'user_id in hour behavour counts'
##                 }
##    """
##    #透视表用来统计频率，数据变为index和columns
##    # pd.crosstab([df.columns_1,df.columns_2],df.columns_3,dropna=False) 数据交叉表用于统计在索引为df.columns_1 和df.columns_2的条件下，统计df.columns_3的数值
##    # 返回一个crosstab对象，使用函数可以返回一个DataFrame对象
##    # 使用df.unstack(fill_value = 0)可以将第二级行索引转换为由上至下的最下面的列索引
##    #交叉表在于一个交叉二字，以两个参数为例，df.columns_1,df.coulmns_3 df.columns_1 为表格的index，df.columns_2为表格的columns ,二者之间对应的表格
##    #表示二者发生关系的次数。比如用户1和8的交集为6，表明用户1，在8这个时间段这几天内有过6次操作，如果再细分一点是什么操作，还可以如本程序所示，多加一个参数
##    user_act_count = pd.crosstab([beforesomeday.user_id,beforesomeday.behavior_type],beforesomeday.hours,dropna=False)
##    user_act_count = user_act_count.unstack(fill_value = 0)
##    return user_act_count
#
#param = {
#    'num_leaves':128,
#    'objective':'binary',
#    'max_depth':-1,
#    'learning_rate':0.1,
#    'metric':'binary_logloss'}
# #划分数据集   
#y = train.pop('labels')  
#
#feature = [x for x in train.columns if x not in ['customer_id']]
#X = train[feature]  
#X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42,stratify=y)    

#训练模型
#trn_data = lgb.Dataset(X_train, label=y_train)
#val_data = lgb.Dataset(X_valid, label=y_valid)
#lgbm = lgb.train(param,trn_data,valid_sets=[trn_data,val_data],num_boost_round = 10000 ,early_stopping_rounds=25,verbose_eval=50) 
from lightgbm.sklearn import LGBMRegressor #d导入的是sklearn版本，原生版本略有不同   
#model = LGBMRegressor(n_jobs=-1,learning_rate=0.05,
#                    max_depth=16,
#                     n_estimators=10,
#                     num_leaves=30,
#                     reg_alpha=1.5,
#                     reg_lambda=0.7,
#                     min_child_samples=27,
#                     min_split_gain=0.1,
#                     colsample_bytree=0.2)
model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.833, gamma=0.11, learning_rate=0.005, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=-1, nthread=50, objective='reg:linear', random_state=0,
       reg_alpha=1.7, reg_lambda=0.6, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9)
train = pd.read_csv('train_x_add1.csv')
label = LabelEncoder()
train['order_pay_date_last'] = pd.to_datetime('2013-12-31') - pd.to_datetime(train['order_pay_date_last'])
train['order_pay_date_last'] = train['order_pay_date_last'].dt.days + 1
train['order_pay_date_first'] =  pd.to_datetime(train['order_pay_date_last']) - pd.to_datetime('2013-7-4') 
train['order_pay_date_first'] = train['order_pay_date_first'].dt.days + 1
train['customer_province_code'] = label.fit_transform(train['customer_province'])
train['customer_city_code'] = label.fit_transform(train['customer_city'])
Y = train.pop('labels')
X = train.drop(['customer_id','goods_id','customer_province','customer_city'],axis=1).fillna(0)
model.fit(X,Y)


#进行预测  
test = pd.read_csv('test_x_add1.csv')
test['order_pay_date_last'] = pd.to_datetime('2013-12-31') - pd.to_datetime(test['order_pay_date_last'])
test['order_pay_date_last'] = test['order_pay_date_last'].dt.days + 1
test['order_pay_date_first'] =  pd.to_datetime(test['order_pay_date_last']) - pd.to_datetime('2013-7-4') 
test['order_pay_date_first'] = test['order_pay_date_first'].dt.days + 1
test['customer_province_code'] = label.fit_transform(test['customer_province'])
test['customer_city_code'] = label.fit_transform(test['customer_city'])

X_submit = test.drop(['customer_id','goods_id','customer_province','customer_city'],axis=1).fillna(0)
y_submit = model.predict(X_submit)

#提交的结果
submit_df = test[['customer_id']]
submit_df['result'] = y_submit    
all_customer = pd.merge(all_customer,submit_df,on=['customer_id'],how='left',copy=False)
all_customer = all_customer.sort_values(['customer_id'])#默认升序 ascending=True False降序
all_customer['customer_id'] = all_customer['customer_id'].astype('int64') 
all_customer['result'] = all_customer['result'].fillna(0)
all_customer['result'] = all_customer['result'].apply(lambda x: 1 if x > 1.0 else x)
all_customer['result'] = all_customer['result'].apply(lambda x: 0 if x <0 else x)
all_customer.to_csv('mpdf_baseline111.csv',index=False)

    
    
    
    
    
    
    
    
    
    
    
    
    
    
    

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 16 12:31:09 2019

@author: chengben
"""
#LGBMRegressor
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
def logloss(y_true, y_pred,deta = 3.5, eps=1e-15):
    # Prepare numpy array data
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    assert (len(y_true) and len(y_true) == len(y_pred))
    # Clip y_pred between eps and 1-eps
    p = np.clip(y_pred, eps, 1-eps)
    loss = np.sum(- y_true * np.log(p) * deta - (1 - y_true) * np.log(1-p))
    return loss / len(y_true)

train = pd.read_csv('train_x_add.csv')
#test = pd.read_csv('test_x_ziji.csv')
trian = train.sample(frac=1)
label = LabelEncoder()
train['order_pay_date_last'] = pd.to_datetime('2013-12-31') - pd.to_datetime(train['order_pay_date_last'])
train['order_pay_date_last'] = train['order_pay_date_last'].dt.days + 1
train['order_pay_date_first'] =  pd.to_datetime(train['order_pay_date_last']) - pd.to_datetime('2013-7-4') 
train['order_pay_date_first'] = train['order_pay_date_first'].dt.days + 1
train['customer_province'] = label.fit_transform(train['customer_province'])
train['customer_city_code'] = label.fit_transform(train['customer_city'])
Y = train.pop('labels')
X = train.drop(['customer_id','goods_id','customer_province','customer_city','order_pay_date_last','order_pay_date_first'],axis=1).fillna(0)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from lightgbm.sklearn import LGBMRegressor #d导入的是sklearn版本，原生版本略有不同
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split

kfold = KFold(n_splits=5, shuffle = False, random_state=12)

model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.1,
       colsample_bytree=0.733, gamma=0.11, learning_rate=0.05, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=None, n_estimators=20,
       n_jobs=-1, objective='binary:logistic', random_state=0,
       reg_alpha=1.7, reg_lambda=0.6, scale_pos_weight=7, seed=None,
       silent=True, subsample=1,gpu_id = 0, max_bin = 16, tree_method = 'gpu_hist')

# def myloss(real,pre):
#     grad = -(real/pre + (1-real)/(1-pre))
#     hess = real/pre**2 +(1-real)/(1-pre)**2
#     return grad,hess
# mse1 = []
# mse2 = []
for train,val in kfold.split(X): #train_test_split留出法
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_val = X.iloc[val]
    y_val = Y.iloc[val]
    model.fit(X_train,y_train)
    
    y_pred1 = model.predict(X_val)
    error1 = logloss(y_val,y_pred1)
    mse1.append(error1)
    y_pred2 = model.predict(X_train)
    error2 = logloss(y_train,y_pred2)
    mse2.append(error2)
print('验证集准确率',np.mean(mse1),mse1)
print("训练集准确率",np.mean(mse2),mse2)

#param_test = {  'gamma':np.linspace(0.11,0.12,2)
#                'max_depth':range(8,16,2),
#            'min_child_weight':np.linspace(0.1,0.8,8)
#              'num_leaves':range(30,100,5),
#         'min_split_gain':np.linspace(0.1,1,10),
#     'min_child_samples':range(10,30,1),
#     'reg_alpha':np.linspace(1.5,2,6),
#            'reg_lambda':np.linspace(0.5,1.2,8) 
#            }
##缩进的是需要调优的，行间隔表示调优分组
#model = XGBRegressor(n_estimators=21,
#                     
#                     max_depth=25,
#                     min_child_weight=1,
#                     
#                     gamma=0.11, #在节点分裂时，只有分裂后损失函数的值下降了，才会分裂这个节点。Gamma指定了节点分裂所需的最小损失函数下降值。
#                     
#                     subsample=1.0,
#                     colsample_bytree=1,#和GBM里面的max_features参数类似。用来控制每棵随机采样的列数的占比(每一列是一个特征)。
#                                         
#                     colsample_bylevel=1,#用来控制树的每一级的每一次分裂，对列数的采样的占比
#                     
#                     reg_alpha=1.2,  #L1正则化
#                     reg_lambda=0.6, #L2正则化
#                     
#                     learning_rate=0.05, 
#                     
#                       base_score=0.5,
#                       booster='gbtree', 
#                       max_delta_step=0,
#                       missing=None, 
#                       n_jobs=-1, 
#                       nthread=50, 
#                       objective=myloss, 
#                       random_state=0,
#
#                       scale_pos_weight=1, 
#                       seed=None,
#                       silent=True, 
#                       )
#gsearch2 = RandomizedSearchCV(model, param_test,iid=False, cv=5,verbose=1,n_jobs=-1,scoring='neg_mean_squared_error',n_iter=20)
#gsearch2.fit(X, Y)
#print( gsearch2.best_params_, gsearch2.best_score_)


#param_test = {'n_estimators':range(20,800,20),
#             'min_samples_split':range(5,200,20),
#             'min_samples_leaf':range(5,100,10),
#             'max_depth':range(2,12,1),
#             'max_features':range(2,12,1)
#     
#}
#model = RandomForestRegressor(n_estimators=800,
#                                 min_samples_split=40,
#                                 min_samples_leaf=20,
#                                 max_depth=8,
#                                 max_features='sqrt' ,
#                                 random_state=10)
#gsearch2 = RandomizedSearchCV(model, param_test,iid=False, cv=5,verbose=10,n_jobs=-1,scoring='neg_mean_squared_error',n_iter=400)
#gsearch2.fit(X, Y)
#print( gsearch2.best_params_, gsearch2.best_score_)

#}
#max_depth ：设置树深度，深度越大可能过拟合
#num_leaves：因为 LightGBM 使用的是 leaf-wise 的算法，因此在调节树的复杂程度时，使用的是 num_leaves 而不是 max_depth。大致换算关系：num_leaves = 2^(max_depth)，但是它的值的设置应该小于 2^(max_depth)，否则可能会导致过拟合。
#
#作者：慕斯王
#链接：https://www.imooc.com/article/43784?block_id=tuijian_wz
#来源：慕课网
#lgb = LGBMRegressor(n_jobs=-1,learning_rate=0.05,
#                    max_depth=16,
#                     n_estimators=800,
#                     num_leaves=30,
#                     reg_alpha=1.5,
#                     reg_lambda=0.7,
#                     min_child_samples=27,
#                     min_split_gain=0.1,
#                     colsample_bytree=0.2)




In [None]:
from sklearn.externals import joblib
joblib.dump(model,'ou.pkl')

In [None]:
model.save_model('001.model')

In [None]:
%matplotlib inline
model = joblib.load('ou.pkl')
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
# sns.set_style('darkgrid')
 
features_list = X_train.columns.values
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)[-50:]
 
plt.figure(figsize=(5,7))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), features_list[sorted_idx])
plt.xlabel('Importance')
plt.title('Feature importances')
plt.draw()
plt.show()

In [3]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import time

In [5]:
# Fetch dataset using sklearn
cov = fetch_covtype()
X = cov.data
y = cov.target

Downloading https://ndownloader.figshare.com/files/5976039


KeyboardInterrupt: 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, random_state=42)
num_round = 25 #3000

# Leave most parameters as default
param = {'objective': 'multi:softmax', # Specify multiclass classification
         'num_class': 8, # Number of possible output classes
         'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
         }
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
gpu_res = {} # Store accuracy result
tmp = time.time()
# Train model
param['tree_method'] = 'gpu_hist'
model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.1,
       colsample_bytree=0.733, gamma=0.11, learning_rate=0.05, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=None, n_estimators=25,
       n_jobs=-1, objective='binary:logistic', random_state=0,
       reg_alpha=1.7, reg_lambda=0.6, scale_pos_weight=1, seed=None,
       silent=True, subsample=1,gpu_id =1, max_bin = 16, tree_method = 'gpu_hist')
model.fit(X_train,y_train)
# xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res) 
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

[0]	test-merror:0.15670
[1]	test-merror:0.15654
[2]	test-merror:0.15652
[3]	test-merror:0.15637
[4]	test-merror:0.15632
[5]	test-merror:0.15618
[6]	test-merror:0.15621
[7]	test-merror:0.15616
[8]	test-merror:0.15609
[9]	test-merror:0.15591
[10]	test-merror:0.15585
[11]	test-merror:0.15584
[12]	test-merror:0.15584
[13]	test-merror:0.15595
[14]	test-merror:0.15588
[15]	test-merror:0.15598
[16]	test-merror:0.15599
[17]	test-merror:0.15593
[18]	test-merror:0.15591
[19]	test-merror:0.15595
[20]	test-merror:0.15597
[21]	test-merror:0.15609
[22]	test-merror:0.15631
[23]	test-merror:0.15629
[24]	test-merror:0.15628
GPU Training Time: 15.797765016555786 seconds


In [5]:
gpu_res = {} # Store accuracy result
tmp = time.time()
# Train model
param['tree_method'] = 'gpu_hist'
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

[0]	test-merror:0.15670
[1]	test-merror:0.15654
[2]	test-merror:0.15652
[3]	test-merror:0.15637
[4]	test-merror:0.15632
[5]	test-merror:0.15618
[6]	test-merror:0.15621
[7]	test-merror:0.15616
[8]	test-merror:0.15609
[9]	test-merror:0.15591
[10]	test-merror:0.15585
[11]	test-merror:0.15584
[12]	test-merror:0.15584
[13]	test-merror:0.15595
[14]	test-merror:0.15588
[15]	test-merror:0.15598
[16]	test-merror:0.15599
[17]	test-merror:0.15593
[18]	test-merror:0.15591
[19]	test-merror:0.15595
[20]	test-merror:0.15597
[21]	test-merror:0.15609
[22]	test-merror:0.15631
[23]	test-merror:0.15629
[24]	test-merror:0.15628
GPU Training Time: 14.73460841178894 seconds


In [6]:
# Repeat for CPU algorithm
tmp = time.time()
param['tree_method'] = 'hist'
cpu_res = {}
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res)
print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))

[0]	test-merror:0.15668
[1]	test-merror:0.15652
[2]	test-merror:0.15651
[3]	test-merror:0.15636
[4]	test-merror:0.15631
[5]	test-merror:0.15615
[6]	test-merror:0.15609
[7]	test-merror:0.15611
[8]	test-merror:0.15606
[9]	test-merror:0.15615
[10]	test-merror:0.15596
[11]	test-merror:0.15591
[12]	test-merror:0.15574
[13]	test-merror:0.15590
[14]	test-merror:0.15587
[15]	test-merror:0.15584
[16]	test-merror:0.15587
[17]	test-merror:0.15587
[18]	test-merror:0.15594
[19]	test-merror:0.15591
[20]	test-merror:0.15590
[21]	test-merror:0.15600
[22]	test-merror:0.15594
[23]	test-merror:0.15593
[24]	test-merror:0.15598
CPU Training Time: 166.61956787109375 seconds
