In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy.stats import mode
import csv
import matplotlib.dates
from datetime import *
from sklearn.preprocessing import *
from sklearn import ensemble
import xgboost as xgb
from sklearn import metrics
from xgboost.sklearn import XGBClassifier  
from sklearn.model_selection import GridSearchCV,cross_val_score  
import matplotlib.pylab as plt  

from sklearn.preprocessing import *
import xgboost as xgb
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.externals import joblib


In [2]:
class XgbModel:
    def __init__(self, feaNames=None, params={}):
        self.feaNames = feaNames
        self.params = {
            'objective': 'binary:logistic',
            'eval_metric':'logloss',
            'silent': True,
            'eta': 0.02,
            'max_depth': 5,
            'gamma': 1,
            'subsample': 0.9,
            'colsample_bytree': 0.95,
            'min_child_weight': 2,
            'max_delta_step': 1,
            'lambda': 30,
#             'nthread': 20,
        }
#         self.params = {
#             'objective': 'binary:logistic',
#             'eta': 0.01,
#             'colsample_bytree': 0.886,
#             'min_child_weight': 1.2,
#             'max_depth': 4,
#             'subsample': 0.886,
#             #'alpha': 4,
#             'gamma': 0.1,
#             'lambda': 5,
#             'verbose_eval': True,
#             'eval_metric': 'logloss',
#             'seed': 201803,
#             'nthread':24
#             #'missing':-1
#         }
        for k,v in params.items():
            self.params[k] = v
        self.clf = None

    def train(self, X, y, train_size=1, test_size=0.1, verbose=True, num_boost_round=1000, early_stopping_rounds=3):
        X = X.astype(float)
        if train_size==1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
            X_train, y_train = X, y
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)
        dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=self.feaNames)
        dval = xgb.DMatrix(X_test, label=y_test, feature_names=self.feaNames)
        watchlist = [(dtrain,'train'),(dval,'val')]
        clf = xgb.train(
            self.params, dtrain, 
            num_boost_round = num_boost_round, 
            evals = watchlist, 
            early_stopping_rounds = early_stopping_rounds,
            verbose_eval=verbose
        )
        self.clf = clf

    def trainCV(self, X, y, nFold=3, verbose=True, num_boost_round=8000, early_stopping_rounds=10):
        X = X.astype(float)
        dtrain = xgb.DMatrix(X, label=y, feature_names=self.feaNames)
        cvResult = xgb.cv(
            self.params, dtrain, 
            num_boost_round = num_boost_round, 
            nfold = nFold,
            early_stopping_rounds = early_stopping_rounds,
            verbose_eval=verbose
        )
        clf = xgb.train(
            self.params, dtrain, 
            num_boost_round = cvResult.shape[0], 
        )
        self.clf = clf

    def gridSearch(self, X, y, nFold=3, verbose=1, num_boost_round=130):
        paramsGrids = {
            # 'n_estimators': [50+5*i for i in range(0,30)],
            'gamma': [0,0.01,0.05,0.1,0.5,1,5,10,50,100],
            # 'max_depth': list(range(3,10)),
            'min_child_weight': list(range(0,10)),
            'subsample': [1-0.05*i for i in range(0,8)],
            'colsample_bytree': [1-0.05*i for i in range(0,10)],
            # 'reg_alpha': [0+2*i for i in range(0,10)],
            'reg_lambda': [0+50*i for i in range(0,10)],            
            'max_delta_step': [0+1*i for i in range(0,8)],
        }
        for k,v in paramsGrids.items():
            gsearch = GridSearchCV(
                estimator = xgb.XGBClassifier(
                    max_depth = self.params['max_depth'], 
                    gamma = self.params['gamma'],
                    learning_rate = self.params['eta'],
                    max_delta_step = self.params['max_delta_step'],
                    min_child_weight = self.params['min_child_weight'],
                    subsample = self.params['subsample'],
                    colsample_bytree = self.params['colsample_bytree'],
                    silent = self.params['silent'],
                    reg_lambda = self.params['lambda'],
                    n_estimators = num_boost_round
                ),
                # param_grid = paramsGrids,
                param_grid = {k:v},
                scoring = 'neg_log_loss',
                cv = nFold,
                verbose = verbose,
                n_jobs = 4
            )
            gsearch.fit(X, y)
            print(pd.DataFrame(gsearch.cv_results_))
            print(gsearch.best_params_)
        exit()

    def predict(self, X):
        X = X.astype(float)
        return self.clf.predict(xgb.DMatrix(X, feature_names=self.feaNames))

    def getFeaScore(self, show=False):
        fscore = self.clf.get_score()
        feaNames = fscore.keys()
        scoreDf = pd.DataFrame(index=feaNames, columns=['importance'])
        for k,v in fscore.items():
            scoreDf.loc[k, 'importance'] = v
        if show:
            print(scoreDf.sort_index(by=['importance'], ascending=False))
        return scoreDf

# 划分训练集和测试集
def trainTestSplit(df, splitDate=pd.to_datetime('2018-09-23'), trainPeriod=3, testPeriod=1):
    trainDf = df[(df.context_timestamp<splitDate)&(df.context_timestamp>=splitDate-timedelta(days=trainPeriod))]
    testDf = df[(df.context_timestamp>=splitDate)&(df.context_timestamp<splitDate+timedelta(days=testPeriod))]
    return (trainDf, testDf)


# 统计预测误差
def countDeltaY(predictSeries, labelSeries, show=True, title='', subplot=None):
    deltaSeries = predictSeries - labelSeries
    if subplot!=None:
        plt.subplot(subplot[0], subplot[1], subplot[2])
    deltaSeries.plot(style='b-')
    plt.title(title)
    if show:
        plt.show()
    return deltaSeries

# 获取stacking下一层数据集
def getOof(clf, trainX, trainY, testX, nFold=5, stratify=False):
    oofTrain = np.zeros(trainX.shape[0])
    oofTest = np.zeros(testX.shape[0])
    oofTestSkf = np.zeros((testX.shape[0], nFold))
    if stratify:
        kf = StratifiedKFold(n_splits=nFold, shuffle=True)
    else:
        kf = KFold(n_splits=nFold, shuffle=True)
    for i, (trainIdx, testIdx) in enumerate(kf.split(trainX, trainY)):
        kfTrainX = trainX[trainIdx]
        kfTrainY = trainY[trainIdx]
        kfTestX = trainX[testIdx]
        clf.trainCV(kfTrainX, kfTrainY, verbose=False)
        oofTrain[testIdx] = clf.predict(kfTestX)
        oofTestSkf[:,i] = clf.predict(testX)
    oofTest[:] = oofTestSkf.mean(axis=1)
    return oofTrain, oofTest


In [3]:
#首先导入滑窗训练集数据进行线下数据集划分
train_df = pd.read_csv('~/kengkeng/alimama/data/fusai_b_train_df_weilai.csv')
test_df = pd.read_csv('~/kengkeng/alimama/data/fusai_b_test_df_weilai.csv')

print(train_df.info())
print(test_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1629767 entries, 0 to 1629766
Columns: 103 entries, instance_id to real_last_category_buy_number
dtypes: float64(53), int64(48), object(2)
memory usage: 1.3+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1729656 entries, 0 to 1729655
Columns: 102 entries, instance_id to real_last_category_buy_number
dtypes: float64(53), int64(47), object(2)
memory usage: 1.3+ GB
None


In [4]:
test_df_b = pd.read_csv('~/yuna/alimama/data/round2_ijcai_18_test_b_20180510.txt', sep=' ')
print(len(test_df_b))
print(len(test_df_b.columns.values))


1209768
26


In [5]:
print(len(test_df.columns.values))
test_df = pd.merge(test_df_b[['instance_id']], test_df, on=['instance_id'], how='left')
print(len(test_df))
print(len(test_df.columns.values))
print(test_df.columns.values)


102
1209768
102
['instance_id' 'item_id' 'item_brand_id' 'item_city_id' 'item_price_level'
 'item_sales_level' 'item_collected_level' 'item_pv_level' 'user_id'
 'user_gender_id' 'user_age_level' 'user_occupation_id' 'user_star_level'
 'context_id' 'context_timestamp' 'context_page_id'
 'predict_category_property' 'shop_id' 'shop_review_num_level'
 'shop_review_positive_rate' 'shop_star_level' 'shop_score_service'
 'shop_score_delivery' 'shop_score_description' 'date' 'weekday' 'day'
 'hour' 'prop_jaccard' 'prop_predict_ratio' 'prop_item_ratio'
 'match_category_proportion' 'match_property_proportion'
 'predict_category_number' 'predict_property_number' 'isFirstCategoryIn'
 'isLastCategoryIn' 'category_number' 'property_number'
 'real_first_category' 'real_last_category'
 'all_item_brand_id_click_number' 'all_item_brand_id_buy_number'
 'history_item_brand_id_rate' 'history_item_brand_id_smooth_rate'
 'all_shop_id_click_number' 'all_shop_id_buy_number' 'history_shop_id_rate'
 'history_sho

In [6]:
fea = [

         'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
         'item_collected_level', 'item_pv_level', 'user_gender_id',
         'user_age_level', 'user_occupation_id', 'user_star_level',
         'context_page_id',
         'shop_review_num_level', 'shop_review_positive_rate',
         'shop_star_level', 'shop_score_service', 'shop_score_delivery',
         'shop_score_description',
         'prop_jaccard', 'prop_predict_ratio', 'prop_item_ratio',
         'match_category_proportion', 'match_property_proportion', 
         'predict_category_number', 'predict_property_number',
         'isLastCategoryIn', 'isFirstCategoryIn',
         'category_number', 'property_number',
         'all_item_brand_id_click_number',
         'all_item_brand_id_buy_number',
         'history_item_brand_id_smooth_rate', 'all_shop_id_click_number',
         'all_shop_id_buy_number',
         'history_shop_id_smooth_rate', 'all_item_id_click_number',
         'all_item_id_buy_number',
         'history_item_id_smooth_rate', 'lastOneHour_sameItem_count',
         'lastOneHour_sameFirstCategory_count', 'lastOneHour_sameLastCategory_count',
         'lastOneHour_sameBrand_count', 'lastOneHour_sameShop_count',
         'isLastOneHour_firstClickItem',
    #      'is_special', 'hour',
         'hour_rate',
         'userItem_lastClickDeltaTime',
         'userBrand_lastClickDeltaTime', 'userShop_lastClickDeltaTime',
         'userFirstCategory_lastClickDeltaTime',
         'userLastCategory_lastClickDeltaTime',
         'user_id_converse_smooth_rate', 'user_id_total_number', 'user_id_buy_number',
         'item_brand_id_converse_smooth_rate', 'item_brand_id_total_number',
         'item_brand_id_buy_number', 'item_id_converse_smooth_rate',
         'item_id_total_number', 'item_id_buy_number', 'shop_id_converse_smooth_rate',
         'shop_id_total_number', 'shop_id_buy_number',
         'real_last_category_converse_smooth_rate',
         'real_last_category_total_number', 'real_last_category_buy_number',
         'is_later_clickSameItem', 'is_later_clickSameLastCategory',
         'later_clickSameItem_count', 'later_clickSameLastCategory_count',
         'later_clickSameItem_deltaTime', 'later_clickSameLastCategory_deltaTime',
         'shop_item_classNumber', 'brand_item_classNumber', 'city_item_classNumber',
         'shop_user_classNumber', 'brand_user_classNumber', 'city_user_classNumber',
         'item_price_level_mode','item_sales_level_mode', 
         'item_price_level_mean', 'item_sales_level_mean', 
         'item_price_level_max', 'item_sales_level_max', 
         'item_price_level_min', 'item_sales_level_min',  
        'item_sales_level_median', 'item_price_level_median', 
         ]


In [7]:
train_df_7 = train_df[train_df.day == 7]
print(len(train_df_7))
print(len(train_df_7[fea].columns.values))


1077175
86


In [8]:
xgbModel = XgbModel(feaNames=fea)
modelName = "xgb_fusai_a"

# 正式模型
startTime = datetime.now()
xgbModel.trainCV(train_df_7[fea].values, train_df_7['is_trade'].values)
xgbModel.getFeaScore(show=True)
print('training time: ', datetime.now()-startTime)


[0]	train-logloss:0.684119+3.26599e-06	test-logloss:0.684119+7.34847e-06
[1]	train-logloss:0.675191+6.94422e-06	test-logloss:0.675191+1.47045e-05
[2]	train-logloss:0.666362+1.06249e-05	test-logloss:0.666362+2.20656e-05
[3]	train-logloss:0.657634+1.43062e-05	test-logloss:0.657634+2.89943e-05
[4]	train-logloss:0.649005+1.79691e-05	test-logloss:0.649006+3.63532e-05
[5]	train-logloss:0.640477+2.16487e-05	test-logloss:0.640477+4.36985e-05
[6]	train-logloss:0.632048+2.53289e-05	test-logloss:0.632047+5.10577e-05
[7]	train-logloss:0.623718+2.90096e-05	test-logloss:0.623718+5.8418e-05
[8]	train-logloss:0.615488+3.26735e-05	test-logloss:0.615488+6.57791e-05
[9]	train-logloss:0.607356+3.63532e-05	test-logloss:0.607357+7.31224e-05
[10]	train-logloss:0.599325+4.00333e-05	test-logloss:0.599325+8.00666e-05
[11]	train-logloss:0.591392+4.37137e-05	test-logloss:0.591392+8.74274e-05
[12]	train-logloss:0.583557+4.69775e-05	test-logloss:0.583557+9.51875e-05
[13]	train-logloss:0.57582+5.06579e-05	test-loglo

[112]	train-logloss:0.208508+0.000371179	test-logloss:0.208718+0.000621371
[113]	train-logloss:0.207546+0.000372846	test-logloss:0.207759+0.000624628
[114]	train-logloss:0.206609+0.000372767	test-logloss:0.206826+0.000628449
[115]	train-logloss:0.205697+0.000374434	test-logloss:0.205917+0.000632485
[116]	train-logloss:0.204808+0.00037517	test-logloss:0.205031+0.000636202
[117]	train-logloss:0.203943+0.000374634	test-logloss:0.20417+0.000641619
[118]	train-logloss:0.2031+0.000375768	test-logloss:0.203331+0.000644246
[119]	train-logloss:0.202281+0.000377363	test-logloss:0.202515+0.000646337
[120]	train-logloss:0.201484+0.000378503	test-logloss:0.201722+0.000649282
[121]	train-logloss:0.200707+0.000380197	test-logloss:0.200949+0.000651336
[122]	train-logloss:0.199952+0.000381435	test-logloss:0.200197+0.000654552
[123]	train-logloss:0.199216+0.000382184	test-logloss:0.199464+0.000656335
[124]	train-logloss:0.1985+0.000383328	test-logloss:0.198751+0.000658429
[125]	train-logloss:0.197804+0.

[222]	train-logloss:0.174095+0.000420497	test-logloss:0.174789+0.000848211
[223]	train-logloss:0.174035+0.000422945	test-logloss:0.174734+0.000844105
[224]	train-logloss:0.173978+0.000423357	test-logloss:0.174682+0.000846228
[225]	train-logloss:0.173921+0.000423354	test-logloss:0.174628+0.000845563
[226]	train-logloss:0.173867+0.000423357	test-logloss:0.174579+0.000845497
[227]	train-logloss:0.173812+0.000424595	test-logloss:0.174528+0.000845273
[228]	train-logloss:0.173758+0.000421735	test-logloss:0.174479+0.000848658
[229]	train-logloss:0.173709+0.000422146	test-logloss:0.174435+0.000850679
[230]	train-logloss:0.17366+0.000423377	test-logloss:0.17439+0.000851493
[231]	train-logloss:0.173609+0.000424175	test-logloss:0.174346+0.000851589
[232]	train-logloss:0.173561+0.000424588	test-logloss:0.174302+0.000853251
[233]	train-logloss:0.173514+0.000424994	test-logloss:0.174261+0.000854424
[234]	train-logloss:0.173468+0.000424178	test-logloss:0.174219+0.000855271
[235]	train-logloss:0.17342

[332]	train-logloss:0.171059+0.000442228	test-logloss:0.172261+0.000889244
[333]	train-logloss:0.171045+0.000442272	test-logloss:0.172251+0.000888971
[334]	train-logloss:0.17103+0.00044187	test-logloss:0.172242+0.000889816
[335]	train-logloss:0.171013+0.000439835	test-logloss:0.172229+0.000890692
[336]	train-logloss:0.171+0.000441053	test-logloss:0.172221+0.000889909
[337]	train-logloss:0.170986+0.000440664	test-logloss:0.172212+0.000891146
[338]	train-logloss:0.170972+0.000441078	test-logloss:0.172203+0.000890332
[339]	train-logloss:0.170959+0.000441135	test-logloss:0.172194+0.000889613
[340]	train-logloss:0.170945+0.000439134	test-logloss:0.172184+0.000890101
[341]	train-logloss:0.170932+0.000440002	test-logloss:0.172176+0.000888996
[342]	train-logloss:0.170917+0.000442033	test-logloss:0.172166+0.000887334
[343]	train-logloss:0.170905+0.000442033	test-logloss:0.172158+0.000886943
[344]	train-logloss:0.170893+0.000442449	test-logloss:0.17215+0.000887758
[345]	train-logloss:0.170878+0.

[442]	train-logloss:0.169825+0.000449085	test-logloss:0.171531+0.00085563
[443]	train-logloss:0.169817+0.000448272	test-logloss:0.171528+0.00085563
[444]	train-logloss:0.169808+0.000449098	test-logloss:0.171523+0.000854392
[445]	train-logloss:0.169799+0.000447866	test-logloss:0.171519+0.00085488
[446]	train-logloss:0.169791+0.000447871	test-logloss:0.171515+0.000854489
[447]	train-logloss:0.169783+0.000449093	test-logloss:0.171511+0.000854132
[448]	train-logloss:0.169774+0.000446644	test-logloss:0.171506+0.000856086
[449]	train-logloss:0.169764+0.000445421	test-logloss:0.171502+0.000856053
[450]	train-logloss:0.169755+0.00044504	test-logloss:0.171498+0.000855728
[451]	train-logloss:0.169748+0.000443819	test-logloss:0.171495+0.00085576
[452]	train-logloss:0.169739+0.000444642	test-logloss:0.17149+0.000854555
[453]	train-logloss:0.16973+0.00044421	test-logloss:0.171485+0.000853284
[454]	train-logloss:0.169721+0.000445438	test-logloss:0.171481+0.000852437
[455]	train-logloss:0.169712+0.00

[552]	train-logloss:0.168972+0.000432363	test-logloss:0.171134+0.000846246
[553]	train-logloss:0.168964+0.000433591	test-logloss:0.17113+0.000845497
[554]	train-logloss:0.168957+0.000432363	test-logloss:0.171127+0.000846311
[555]	train-logloss:0.16895+0.000431977	test-logloss:0.171125+0.000847126
[556]	train-logloss:0.168943+0.000433585	test-logloss:0.171122+0.000846311
[557]	train-logloss:0.168937+0.00043197	test-logloss:0.171119+0.000847583
[558]	train-logloss:0.168931+0.000432405	test-logloss:0.171117+0.000846835
[559]	train-logloss:0.168926+0.000433205	test-logloss:0.171114+0.000846411
[560]	train-logloss:0.168919+0.000432801	test-logloss:0.171111+0.000846869
[561]	train-logloss:0.168912+0.000432382	test-logloss:0.171109+0.000846835
[562]	train-logloss:0.168905+0.000431985	test-logloss:0.171106+0.000847683
[563]	train-logloss:0.168898+0.000431154	test-logloss:0.171102+0.000847225
[564]	train-logloss:0.168891+0.000431589	test-logloss:0.171099+0.000846869
[565]	train-logloss:0.168885

[662]	train-logloss:0.168303+0.000419739	test-logloss:0.170878+0.000842798
[663]	train-logloss:0.168297+0.000421768	test-logloss:0.170875+0.000840323
[664]	train-logloss:0.16829+0.000422997	test-logloss:0.170873+0.000840385
[665]	train-logloss:0.168286+0.000422578	test-logloss:0.170871+0.000840777
[666]	train-logloss:0.16828+0.000423783	test-logloss:0.170869+0.0008399
[667]	train-logloss:0.168274+0.000422971	test-logloss:0.170867+0.000840354
[668]	train-logloss:0.168268+0.000422962	test-logloss:0.170865+0.000840323
[669]	train-logloss:0.168261+0.000423365	test-logloss:0.170862+0.000840354
[670]	train-logloss:0.168258+0.000422958	test-logloss:0.170861+0.000840746
[671]	train-logloss:0.168251+0.000423382	test-logloss:0.170859+0.000839931
[672]	train-logloss:0.168245+0.000423799	test-logloss:0.170856+0.0008399
[673]	train-logloss:0.168239+0.000424225	test-logloss:0.170854+0.00083957
[674]	train-logloss:0.168235+0.000423409	test-logloss:0.170852+0.000839179
[675]	train-logloss:0.168228+0.0

[772]	train-logloss:0.167732+0.000407032	test-logloss:0.170684+0.000839508
[773]	train-logloss:0.167725+0.000407039	test-logloss:0.170682+0.000839147
[774]	train-logloss:0.16772+0.000405399	test-logloss:0.170681+0.00083957
[775]	train-logloss:0.167716+0.000405399	test-logloss:0.170679+0.000839508
[776]	train-logloss:0.16771+0.000404176	test-logloss:0.170678+0.0008399
[777]	train-logloss:0.167704+0.000404188	test-logloss:0.170676+0.0008399
[778]	train-logloss:0.167699+0.000404989	test-logloss:0.170675+0.000839508
[779]	train-logloss:0.167696+0.00040458	test-logloss:0.170675+0.000839478
[780]	train-logloss:0.167692+0.000402943	test-logloss:0.170672+0.000841047
[781]	train-logloss:0.167687+0.000402535	test-logloss:0.170671+0.000841047
[782]	train-logloss:0.167683+0.000402125	test-logloss:0.17067+0.000841077
[783]	train-logloss:0.167678+0.000402537	test-logloss:0.170668+0.0008415
[784]	train-logloss:0.167671+0.000404577	test-logloss:0.170666+0.000839447
[785]	train-logloss:0.167665+0.00040

[883]	train-logloss:0.167224+0.000403766	test-logloss:0.170547+0.00083133
[884]	train-logloss:0.167219+0.00040459	test-logloss:0.170546+0.000830515
[885]	train-logloss:0.167215+0.000405005	test-logloss:0.170545+0.000830123
[886]	train-logloss:0.16721+0.000406219	test-logloss:0.170543+0.00082967
[887]	train-logloss:0.167206+0.000406616	test-logloss:0.170542+0.000830032
[888]	train-logloss:0.167203+0.00040662	test-logloss:0.170541+0.000830454
[889]	train-logloss:0.167198+0.000407445	test-logloss:0.170539+0.000829639
[890]	train-logloss:0.167195+0.000407848	test-logloss:0.170538+0.000829639
[891]	train-logloss:0.167189+0.000407434	test-logloss:0.170537+0.000829247
[892]	train-logloss:0.167185+0.00040621	test-logloss:0.170536+0.00082967
[893]	train-logloss:0.167181+0.000406622	test-logloss:0.170535+0.000829731
[894]	train-logloss:0.167177+0.000405809	test-logloss:0.170534+0.000829731
[895]	train-logloss:0.167173+0.000407029	test-logloss:0.170533+0.000829309
[896]	train-logloss:0.167167+0.0

[993]	train-logloss:0.166756+0.000415674	test-logloss:0.170442+0.000830877
[994]	train-logloss:0.166753+0.000415253	test-logloss:0.17044+0.000831631
[995]	train-logloss:0.166748+0.000414436	test-logloss:0.170439+0.000831209
[996]	train-logloss:0.166744+0.000413234	test-logloss:0.170438+0.000831631
[997]	train-logloss:0.16674+0.000414464	test-logloss:0.170438+0.000831602
[998]	train-logloss:0.166736+0.000414887	test-logloss:0.170437+0.000831631
[999]	train-logloss:0.166732+0.000414898	test-logloss:0.170436+0.000832024
[1000]	train-logloss:0.166729+0.00041491	test-logloss:0.170435+0.000831631
[1001]	train-logloss:0.166724+0.000415683	test-logloss:0.170434+0.000830394
[1002]	train-logloss:0.166719+0.00041652	test-logloss:0.170433+0.000829943
[1003]	train-logloss:0.166715+0.0004165	test-logloss:0.170432+0.000830757
[1004]	train-logloss:0.166711+0.00041649	test-logloss:0.170431+0.00083118
[1005]	train-logloss:0.166707+0.000416886	test-logloss:0.17043+0.000831121
[1006]	train-logloss:0.16670

[1102]	train-logloss:0.166327+0.000413991	test-logloss:0.170357+0.000830953
[1103]	train-logloss:0.166324+0.000413591	test-logloss:0.170357+0.000830926
[1104]	train-logloss:0.16632+0.000413174	test-logloss:0.170357+0.000831347
[1105]	train-logloss:0.166315+0.00041318	test-logloss:0.170356+0.000830478
[1106]	train-logloss:0.166311+0.000412763	test-logloss:0.170356+0.000830084
[1107]	train-logloss:0.166307+0.000411547	test-logloss:0.170354+0.000830505
[1108]	train-logloss:0.166303+0.000411952	test-logloss:0.170354+0.00083132
[1109]	train-logloss:0.166299+0.000410715	test-logloss:0.170353+0.000830926
[1110]	train-logloss:0.166294+0.000411117	test-logloss:0.170352+0.000830478
[1111]	train-logloss:0.166291+0.000411928	test-logloss:0.170352+0.000829636
[1112]	train-logloss:0.166289+0.000411519	test-logloss:0.170351+0.000830084
[1113]	train-logloss:0.166285+0.000412344	test-logloss:0.17035+0.00082969
[1114]	train-logloss:0.16628+0.000411532	test-logloss:0.170349+0.000830111
[1115]	train-loglo

[1211]	train-logloss:0.165913+0.000411155	test-logloss:0.170288+0.000832529
[1212]	train-logloss:0.16591+0.000412775	test-logloss:0.170287+0.000832135
[1213]	train-logloss:0.165906+0.000414813	test-logloss:0.170287+0.000831714
[1214]	train-logloss:0.165903+0.000413986	test-logloss:0.170287+0.000832082
[1215]	train-logloss:0.1659+0.000412754	test-logloss:0.170286+0.000832503
[1216]	train-logloss:0.165898+0.000413571	test-logloss:0.170286+0.000832923
[1217]	train-logloss:0.165893+0.000412353	test-logloss:0.170286+0.000832897
[1218]	train-logloss:0.165889+0.000413169	test-logloss:0.170285+0.000832056
[1219]	train-logloss:0.165885+0.000413575	test-logloss:0.170285+0.000832056
[1220]	train-logloss:0.165881+0.000414009	test-logloss:0.170285+0.000832082
[1221]	train-logloss:0.165875+0.000414421	test-logloss:0.170283+0.000831293
[1222]	train-logloss:0.16587+0.000415674	test-logloss:0.170282+0.000831741
[1223]	train-logloss:0.165865+0.000415674	test-logloss:0.170281+0.000832108
[1224]	train-log

[1320]	train-logloss:0.165507+0.000417839	test-logloss:0.170235+0.000835647
[1321]	train-logloss:0.165504+0.000418302	test-logloss:0.170236+0.00083604
[1322]	train-logloss:0.1655+0.000418655	test-logloss:0.170235+0.000836011
[1323]	train-logloss:0.165496+0.000417811	test-logloss:0.170234+0.000835647
[1324]	train-logloss:0.165493+0.00041943	test-logloss:0.170234+0.000835225
[1325]	train-logloss:0.165489+0.000421076	test-logloss:0.170233+0.000835618
[1326]	train-logloss:0.165485+0.00042105	test-logloss:0.170233+0.000835254
[1327]	train-logloss:0.165482+0.000421015	test-logloss:0.170233+0.000835225
[1328]	train-logloss:0.165478+0.000421038	test-logloss:0.170232+0.000834803
[1329]	train-logloss:0.165474+0.000421062	test-logloss:0.170232+0.000834803
[1330]	train-logloss:0.165471+0.000421477	test-logloss:0.170231+0.000834803
[1331]	train-logloss:0.165466+0.00042149	test-logloss:0.170231+0.00083441
[1332]	train-logloss:0.165463+0.000421892	test-logloss:0.170231+0.00083441
[1333]	train-logloss

[1429]	train-logloss:0.16513+0.000427928	test-logloss:0.170193+0.000838514
[1430]	train-logloss:0.165127+0.000427948	test-logloss:0.170193+0.000838936
[1431]	train-logloss:0.165124+0.00042797	test-logloss:0.170192+0.000839329
[1432]	train-logloss:0.16512+0.000428372	test-logloss:0.170192+0.000839329
[1433]	train-logloss:0.165116+0.000427132	test-logloss:0.170192+0.000838907
[1434]	train-logloss:0.165113+0.000426296	test-logloss:0.170191+0.000838878
[1435]	train-logloss:0.165108+0.000426682	test-logloss:0.17019+0.000838907
[1436]	train-logloss:0.165105+0.000425453	test-logloss:0.17019+0.000839271
[1437]	train-logloss:0.165103+0.000425453	test-logloss:0.17019+0.000839271
[1438]	train-logloss:0.165099+0.000425034	test-logloss:0.17019+0.000838849
[1439]	train-logloss:0.165095+0.000424205	test-logloss:0.170189+0.000838849
[1440]	train-logloss:0.165091+0.000424194	test-logloss:0.170189+0.000838399
[1441]	train-logloss:0.165089+0.000425015	test-logloss:0.170188+0.000838428
[1442]	train-loglos

[1538]	train-logloss:0.164762+0.000405062	test-logloss:0.170159+0.000842949
[1539]	train-logloss:0.164757+0.000404658	test-logloss:0.170159+0.000843344
[1540]	train-logloss:0.164755+0.000405062	test-logloss:0.170158+0.000842529
[1541]	train-logloss:0.164752+0.000405092	test-logloss:0.170158+0.000842949
[1542]	train-logloss:0.164748+0.000405092	test-logloss:0.170157+0.000842924
[1543]	train-logloss:0.164745+0.000404701	test-logloss:0.170157+0.000842924
[1544]	train-logloss:0.16474+0.000404701	test-logloss:0.170157+0.00084332
[1545]	train-logloss:0.164736+0.000405092	test-logloss:0.170157+0.000843344
[1546]	train-logloss:0.164733+0.00040592	test-logloss:0.170157+0.000843344
[1547]	train-logloss:0.16473+0.000405898	test-logloss:0.170156+0.000843394
[1548]	train-logloss:0.164728+0.000404701	test-logloss:0.170156+0.000843369
[1549]	train-logloss:0.164724+0.000403862	test-logloss:0.170156+0.000843789
[1550]	train-logloss:0.164722+0.000403873	test-logloss:0.170156+0.000843789
[1551]	train-log

[1647]	train-logloss:0.164388+0.000416435	test-logloss:0.170127+0.00083984
[1648]	train-logloss:0.164384+0.000416851	test-logloss:0.170127+0.000839419
[1649]	train-logloss:0.164381+0.00041768	test-logloss:0.170126+0.000839393
[1650]	train-logloss:0.164378+0.000416446	test-logloss:0.170126+0.000839393
[1651]	train-logloss:0.164373+0.000417275	test-logloss:0.170126+0.000839762
[1652]	train-logloss:0.16437+0.000417694	test-logloss:0.170126+0.000840157
[1653]	train-logloss:0.164368+0.000418098	test-logloss:0.170125+0.000839737
[1654]	train-logloss:0.164365+0.000418939	test-logloss:0.170124+0.000839317
[1655]	train-logloss:0.164362+0.000418518	test-logloss:0.170124+0.000839317
[1656]	train-logloss:0.164358+0.00041771	test-logloss:0.170124+0.000840157
[1657]	train-logloss:0.164355+0.000418114	test-logloss:0.170124+0.000840157
[1658]	train-logloss:0.164351+0.000418132	test-logloss:0.170123+0.000840552
[1659]	train-logloss:0.164347+0.000418958	test-logloss:0.170124+0.000840947
[1660]	train-log

[1756]	train-logloss:0.164042+0.000424769	test-logloss:0.170107+0.000845742
[1757]	train-logloss:0.16404+0.000424368	test-logloss:0.170107+0.000845742
[1758]	train-logloss:0.164035+0.000424354	test-logloss:0.170107+0.000845766
[1759]	train-logloss:0.164032+0.000425156	test-logloss:0.170106+0.000845742
[1760]	train-logloss:0.164029+0.000426387	test-logloss:0.170106+0.000845323
[1761]	train-logloss:0.164026+0.000426802	test-logloss:0.170106+0.000845347
[1762]	train-logloss:0.164023+0.000428006	test-logloss:0.170106+0.000845323
[1763]	train-logloss:0.16402+0.00042802	test-logloss:0.170106+0.000844927
[1764]	train-logloss:0.164016+0.000428408	test-logloss:0.170106+0.000844927
[1765]	train-logloss:0.164012+0.000429602	test-logloss:0.170106+0.000844531
[1766]	train-logloss:0.164011+0.000430005	test-logloss:0.170106+0.000844531
[1767]	train-logloss:0.164008+0.00043043	test-logloss:0.170106+0.000844531
[1768]	train-logloss:0.164006+0.000430844	test-logloss:0.170105+0.000844927
[1769]	train-log

[1865]	train-logloss:0.163689+0.000437856	test-logloss:0.170086+0.000843226
[1866]	train-logloss:0.163686+0.000438672	test-logloss:0.170086+0.000842806
[1867]	train-logloss:0.163683+0.000438672	test-logloss:0.170085+0.00084241
[1868]	train-logloss:0.163679+0.000438672	test-logloss:0.170085+0.000842387
[1869]	train-logloss:0.163676+0.000439102	test-logloss:0.170085+0.000843203
[1870]	train-logloss:0.163672+0.000438718	test-logloss:0.170085+0.000843203
[1871]	train-logloss:0.16367+0.000438672	test-logloss:0.170084+0.00084241
[1872]	train-logloss:0.163666+0.000437011	test-logloss:0.170085+0.000842829
[1873]	train-logloss:0.163662+0.000437412	test-logloss:0.170085+0.000843249
[1874]	train-logloss:0.163658+0.00043744	test-logloss:0.170086+0.000843272
[1875]	train-logloss:0.163656+0.000437814	test-logloss:0.170085+0.000843272
[1876]	train-logloss:0.163653+0.000438242	test-logloss:0.170084+0.000843668
[1877]	train-logloss:0.16365+0.000438271	test-logloss:0.170085+0.000843692
[1878]	train-logl



In [9]:
# 开始预测
test_df.loc[:,'predicted_score'] = xgbModel.predict(test_df[fea].values)
print('predicting time: ', datetime.now()-startTime)
print("预测结果：\n",test_df[['instance_id','predicted_score']].head())
print('预测均值：', test_df['predicted_score'].mean())


predicting time:  1:50:03.052035
预测结果：
         instance_id  predicted_score
0    74080508196716         0.100885
1   204576715383250         0.041380
2   339754209266814         0.020792
3  1232471723234702         0.106504
4  1432327895640503         0.031453
预测均值： 0.04479018226265907


In [10]:
# 生成stacking数据集
train_df_7['predicted_score'] = np.nan
test_df['predicted_score'] = np.nan
train_df_7.loc[:,'predicted_score'], test_df.loc[:,'predicted_score'] = getOof(xgbModel, train_df_7[fea].values, train_df_7['is_trade'].values, test_df[fea].values, stratify=True)
print('oof training time: ', datetime.now()-startTime)
xgbModel.getFeaScore(show=True)
cost = metrics.log_loss(train_df_7['is_trade'].values, train_df_7['predicted_score'].values)
print('train loss: ', cost)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


oof training time:  6:26:38.766701




                                        importance
item_sales_level                              1576
later_clickSameLastCategory_deltaTime         1572
history_item_id_smooth_rate                   1437
userFirstCategory_lastClickDeltaTime          1300
shop_score_description                        1268
item_sales_level_mean                         1158
userItem_lastClickDeltaTime                   1142
real_last_category_converse_smooth_rate       1136
user_star_level                               1086
shop_score_delivery                           1030
history_shop_id_smooth_rate                   1018
item_brand_id                                  993
item_price_level_mean                          956
all_item_id_click_number                       920
user_age_level                                 860
shop_id_converse_smooth_rate                   826
property_number                                811
shop_review_positive_rate                      789
history_item_brand_id_smooth_ra

In [11]:
print('7th train loss', metrics.log_loss(train_df_7['is_trade'].values, train_df_7['predicted_score'].values))
print('7th train predict aver:', train_df_7['predicted_score'].mean())
print('test predict: \n',test_df[['instance_id','predicted_score']].head())
print('test predict aver:', test_df['predicted_score'].mean())


7th train loss 0.169973549569
7th train predict aver: 0.04605119214395441
test predict: 
         instance_id  predicted_score
0    74080508196716         0.096662
1   204576715383250         0.041262
2   339754209266814         0.021544
3  1232471723234702         0.110195
4  1432327895640503         0.031533
test predict aver: 0.044855700305955266


In [12]:
# 导出预测结果
def exportResult(df, fileName):
    df.to_csv('~/kengkeng/alimama/result/%s.txt' % fileName, sep=' ', header=True, index=False)

exportResult(test_df[['instance_id', 'predicted_score', 'hour']], 'fusai_b_xgb_5_15')
