In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy.stats import mode
import csv
import matplotlib.dates
from datetime import *
from sklearn.preprocessing import *
from sklearn import ensemble
import xgboost as xgb
from sklearn import metrics
from xgboost.sklearn import XGBClassifier  
from sklearn.model_selection import GridSearchCV,cross_val_score  
import matplotlib.pylab as plt  

from sklearn.preprocessing import *
import xgboost as xgb
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.externals import joblib


In [6]:
class XgbModel:
    def __init__(self, feaNames=None, params={}):
        self.feaNames = feaNames
        self.params = {
            'objective': 'binary:logistic',
            'eval_metric':'logloss',
            'silent': True,
            'eta': 0.02,
            'max_depth': 5,
            'gamma': 1,
            'subsample': 0.9,
            'colsample_bytree': 0.95,
            'min_child_weight': 2,
            'max_delta_step': 1,
            'lambda': 30,
#             'nthread': 20,
        }
        for k,v in params.items():
            self.params[k] = v
        self.clf = None

    def train(self, X, y, train_size=1, test_size=0.1, verbose=True, num_boost_round=1000, early_stopping_rounds=3):
        X = X.astype(float)
        if train_size==1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
            X_train, y_train = X, y
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)
        dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=self.feaNames)
        dval = xgb.DMatrix(X_test, label=y_test, feature_names=self.feaNames)
        watchlist = [(dtrain,'train'),(dval,'val')]
        clf = xgb.train(
            self.params, dtrain, 
            num_boost_round = num_boost_round, 
            evals = watchlist, 
            early_stopping_rounds = early_stopping_rounds,
            verbose_eval=verbose
        )
        self.clf = clf

    def trainCV(self, X, y, nFold=3, verbose=True, num_boost_round=8000, early_stopping_rounds=10):
        X = X.astype(float)
        dtrain = xgb.DMatrix(X, label=y, feature_names=self.feaNames)
        cvResult = xgb.cv(
            self.params, dtrain, 
            num_boost_round = num_boost_round, 
            nfold = nFold,
            early_stopping_rounds = early_stopping_rounds,
            verbose_eval=verbose
        )
        clf = xgb.train(
            self.params, dtrain, 
            num_boost_round = cvResult.shape[0], 
        )
        self.clf = clf

    def gridSearch(self, X, y, nFold=3, verbose=1, num_boost_round=130):
        paramsGrids = {
            # 'n_estimators': [50+5*i for i in range(0,30)],
            'gamma': [0,0.01,0.05,0.1,0.5,1,5,10,50,100],
            # 'max_depth': list(range(3,10)),
            'min_child_weight': list(range(0,10)),
            'subsample': [1-0.05*i for i in range(0,8)],
            'colsample_bytree': [1-0.05*i for i in range(0,10)],
            # 'reg_alpha': [0+2*i for i in range(0,10)],
            'reg_lambda': [0+50*i for i in range(0,10)],            
            'max_delta_step': [0+1*i for i in range(0,8)],
        }
        for k,v in paramsGrids.items():
            gsearch = GridSearchCV(
                estimator = xgb.XGBClassifier(
                    max_depth = self.params['max_depth'], 
                    gamma = self.params['gamma'],
                    learning_rate = self.params['eta'],
                    max_delta_step = self.params['max_delta_step'],
                    min_child_weight = self.params['min_child_weight'],
                    subsample = self.params['subsample'],
                    colsample_bytree = self.params['colsample_bytree'],
                    silent = self.params['silent'],
                    reg_lambda = self.params['lambda'],
                    n_estimators = num_boost_round
                ),
                # param_grid = paramsGrids,
                param_grid = {k:v},
                scoring = 'neg_log_loss',
                cv = nFold,
                verbose = verbose,
                n_jobs = 4
            )
            gsearch.fit(X, y)
            print(pd.DataFrame(gsearch.cv_results_))
            print(gsearch.best_params_)
        exit()

    def predict(self, X):
        X = X.astype(float)
        return self.clf.predict(xgb.DMatrix(X, feature_names=self.feaNames))

    def getFeaScore(self, show=False):
        fscore = self.clf.get_score()
        feaNames = fscore.keys()
        scoreDf = pd.DataFrame(index=feaNames, columns=['importance'])
        for k,v in fscore.items():
            scoreDf.loc[k, 'importance'] = v
        if show:
            print(scoreDf.sort_index(by=['importance'], ascending=False))
        return scoreDf

# 划分训练集和测试集
def trainTestSplit(df, splitDate=pd.to_datetime('2018-09-23'), trainPeriod=3, testPeriod=1):
    trainDf = df[(df.context_timestamp<splitDate)&(df.context_timestamp>=splitDate-timedelta(days=trainPeriod))]
    testDf = df[(df.context_timestamp>=splitDate)&(df.context_timestamp<splitDate+timedelta(days=testPeriod))]
    return (trainDf, testDf)


# 统计预测误差
def countDeltaY(predictSeries, labelSeries, show=True, title='', subplot=None):
    deltaSeries = predictSeries - labelSeries
    if subplot!=None:
        plt.subplot(subplot[0], subplot[1], subplot[2])
    deltaSeries.plot(style='b-')
    plt.title(title)
    if show:
        plt.show()
    return deltaSeries

# 获取stacking下一层数据集
def getOof(clf, trainX, trainY, testX, nFold=5, stratify=False):
    oofTrain = np.zeros(trainX.shape[0])
    oofTest = np.zeros(testX.shape[0])
    oofTestSkf = np.zeros((testX.shape[0], nFold))
    if stratify:
        kf = StratifiedKFold(n_splits=nFold, shuffle=True)
    else:
        kf = KFold(n_splits=nFold, shuffle=True)
    for i, (trainIdx, testIdx) in enumerate(kf.split(trainX, trainY)):
        kfTrainX = trainX[trainIdx]
        kfTrainY = trainY[trainIdx]
        kfTestX = trainX[testIdx]
        clf.trainCV(kfTrainX, kfTrainY, verbose=False)
        oofTrain[testIdx] = clf.predict(kfTestX)
        oofTestSkf[:,i] = clf.predict(testX)
    oofTest[:] = oofTestSkf.mean(axis=1)
    return oofTrain, oofTest


In [3]:
#首先导入滑窗训练集数据进行线下数据集划分
train_df = pd.read_csv('~/kengkeng/alimama/data/fusai_b_train_df_weilai.csv')
test_df = pd.read_csv('~/kengkeng/alimama/data/fusai_b_test_df_weilai.csv')

print(train_df.info())
print(test_df.info())

train_df['is_special'] = 0
train_df['is_special'][train_df.day == 7] = 1
test_df['is_special'] = 1

print(train_df.columns.values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1629767 entries, 0 to 1629766
Columns: 103 entries, instance_id to real_last_category_buy_number
dtypes: float64(53), int64(48), object(2)
memory usage: 1.3+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1729656 entries, 0 to 1729655
Columns: 102 entries, instance_id to real_last_category_buy_number
dtypes: float64(53), int64(47), object(2)
memory usage: 1.3+ GB
None
['instance_id' 'item_id' 'item_brand_id' 'item_city_id' 'item_price_level'
 'item_sales_level' 'item_collected_level' 'item_pv_level' 'user_id'
 'user_gender_id' 'user_age_level' 'user_occupation_id' 'user_star_level'
 'context_id' 'context_timestamp' 'context_page_id'
 'predict_category_property' 'shop_id' 'shop_review_num_level'
 'shop_review_positive_rate' 'shop_star_level' 'shop_score_service'
 'shop_score_delivery' 'shop_score_description' 'is_trade' 'date' 'weekday'
 'day' 'hour' 'prop_jaccard' 'prop_predict_ratio' 'prop_item_ratio'
 'match_category_propor

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [4]:
fea = [

         'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
         'item_collected_level', 'item_pv_level', 'user_gender_id',
         'user_age_level', 'user_occupation_id', 'user_star_level',
         'context_page_id',
         'shop_review_num_level', 'shop_review_positive_rate',
         'shop_star_level', 'shop_score_service', 'shop_score_delivery',
         'shop_score_description',
         'prop_jaccard', 'prop_predict_ratio', 'prop_item_ratio',
         'match_category_proportion', 'match_property_proportion', 
         'predict_category_number', 'predict_property_number',
         'isLastCategoryIn', 'isFirstCategoryIn',
         'category_number', 'property_number',
         'all_item_brand_id_click_number',
         'all_item_brand_id_buy_number',
         'history_item_brand_id_smooth_rate', 'all_shop_id_click_number',
         'all_shop_id_buy_number',
         'history_shop_id_smooth_rate', 'all_item_id_click_number',
         'all_item_id_buy_number',
         'history_item_id_smooth_rate', 'lastOneHour_sameItem_count',
         'lastOneHour_sameFirstCategory_count', 'lastOneHour_sameLastCategory_count',
         'lastOneHour_sameBrand_count', 'lastOneHour_sameShop_count',
         'isLastOneHour_firstClickItem',
         'is_special', 'hour',
         'hour_rate',
         'userItem_lastClickDeltaTime',
         'userBrand_lastClickDeltaTime', 'userShop_lastClickDeltaTime',
         'userFirstCategory_lastClickDeltaTime',
         'userLastCategory_lastClickDeltaTime',
         'user_id_converse_smooth_rate', 'user_id_total_number', 'user_id_buy_number',
         'item_brand_id_converse_smooth_rate', 'item_brand_id_total_number',
         'item_brand_id_buy_number', 'item_id_converse_smooth_rate',
         'item_id_total_number', 'item_id_buy_number', 'shop_id_converse_smooth_rate',
         'shop_id_total_number', 'shop_id_buy_number',
         'real_last_category_converse_smooth_rate',
         'real_last_category_total_number', 'real_last_category_buy_number',
         'is_later_clickSameItem', 'is_later_clickSameLastCategory',
         'later_clickSameItem_count', 'later_clickSameLastCategory_count',
         'later_clickSameItem_deltaTime', 'later_clickSameLastCategory_deltaTime',
         'shop_item_classNumber', 'brand_item_classNumber', 'city_item_classNumber',
         'shop_user_classNumber', 'brand_user_classNumber', 'city_user_classNumber',
         'item_price_level_mode','item_sales_level_mode', 
         'item_price_level_mean', 'item_sales_level_mean', 
         'item_price_level_max', 'item_sales_level_max', 
         'item_price_level_min', 'item_sales_level_min',  
        'item_sales_level_median', 'item_price_level_median', 
         ]


In [7]:
xgbModel = XgbModel(feaNames=fea)
modelName = "xgb_fusai_a"

# 正式模型
startTime = datetime.now()
xgbModel.trainCV(train_df[fea].values, train_df['is_trade'].values)
xgbModel.getFeaScore(show=True)
print('training time: ', datetime.now()-startTime)


[0]	train-logloss:0.68389+2.16025e-06	test-logloss:0.68389+4.32049e-06
[1]	train-logloss:0.674732+4.32049e-06	test-logloss:0.674732+8.80656e-06
[2]	train-logloss:0.665674+6.18241e-06	test-logloss:0.665675+1.26579e-05
[3]	train-logloss:0.656717+8.52447e-06	test-logloss:0.656717+1.71464e-05
[4]	train-logloss:0.647859+1.06771e-05	test-logloss:0.64786+2.11713e-05
[5]	train-logloss:0.639101+1.28323e-05	test-logloss:0.639102+2.54864e-05
[6]	train-logloss:0.630443+1.4522e-05	test-logloss:0.630443+2.99778e-05
[7]	train-logloss:0.621884+1.668e-05	test-logloss:0.621884+3.38264e-05
[8]	train-logloss:0.613424+1.90146e-05	test-logloss:0.613425+3.83174e-05
[9]	train-logloss:0.605064+2.08859e-05	test-logloss:0.605064+4.23425e-05
[10]	train-logloss:0.596803+2.3041e-05	test-logloss:0.596803+4.66571e-05
[11]	train-logloss:0.58864+2.5197e-05	test-logloss:0.588641+5.1149e-05
[12]	train-logloss:0.580576+2.73537e-05	test-logloss:0.580577+5.54637e-05
[13]	train-logloss:0.57261+2.92233e-05	test-logloss:0.5726

[112]	train-logloss:0.181187+0.000271088	test-logloss:0.181299+0.000441014
[113]	train-logloss:0.180018+0.000274435	test-logloss:0.180132+0.000442916
[114]	train-logloss:0.178878+0.000276773	test-logloss:0.178993+0.000445728
[115]	train-logloss:0.177765+0.000278451	test-logloss:0.177882+0.000449275
[116]	train-logloss:0.176681+0.00027913	test-logloss:0.1768+0.000454446
[117]	train-logloss:0.175622+0.000281033	test-logloss:0.175742+0.000457041
[118]	train-logloss:0.174587+0.000283163	test-logloss:0.174708+0.000459638
[119]	train-logloss:0.173578+0.000286012	test-logloss:0.1737+0.000461755
[120]	train-logloss:0.172592+0.000289311	test-logloss:0.172716+0.000463164
[121]	train-logloss:0.171629+0.000291938	test-logloss:0.171755+0.00046718
[122]	train-logloss:0.170691+0.000294765	test-logloss:0.170818+0.000468833
[123]	train-logloss:0.169775+0.000296178	test-logloss:0.169905+0.000472126
[124]	train-logloss:0.168882+0.000297097	test-logloss:0.169012+0.000475182
[125]	train-logloss:0.168009+0.

[222]	train-logloss:0.13626+0.000392739	test-logloss:0.136602+0.00070123
[223]	train-logloss:0.136179+0.000393511	test-logloss:0.136524+0.000702869
[224]	train-logloss:0.1361+0.000394027	test-logloss:0.136447+0.000704507
[225]	train-logloss:0.136022+0.000393345	test-logloss:0.136372+0.000707356
[226]	train-logloss:0.135948+0.000395439	test-logloss:0.1363+0.000708074
[227]	train-logloss:0.135872+0.000395181	test-logloss:0.136227+0.000710677
[228]	train-logloss:0.135798+0.000396548	test-logloss:0.136155+0.000711171
[229]	train-logloss:0.135726+0.000397793	test-logloss:0.136087+0.000711597
[230]	train-logloss:0.135657+0.000398005	test-logloss:0.13602+0.000713011
[231]	train-logloss:0.135586+0.000399511	test-logloss:0.135951+0.000713707
[232]	train-logloss:0.13552+0.00040003	test-logloss:0.135888+0.000713461
[233]	train-logloss:0.135456+0.000400761	test-logloss:0.135827+0.000714178
[234]	train-logloss:0.135392+0.000401182	test-logloss:0.135765+0.000714222
[235]	train-logloss:0.135327+0.000

[332]	train-logloss:0.132396+0.00041208	test-logloss:0.133061+0.000786384
[333]	train-logloss:0.13238+0.000413904	test-logloss:0.133049+0.000785749
[334]	train-logloss:0.132364+0.000414655	test-logloss:0.133035+0.000785702
[335]	train-logloss:0.132348+0.000412331	test-logloss:0.133022+0.000786809
[336]	train-logloss:0.132335+0.000412331	test-logloss:0.133012+0.00078728
[337]	train-logloss:0.132322+0.000410666	test-logloss:0.133002+0.000787326
[338]	train-logloss:0.132307+0.000411608	test-logloss:0.132991+0.000786855
[339]	train-logloss:0.132294+0.000408531	test-logloss:0.13298+0.000789467
[340]	train-logloss:0.132278+0.000409002	test-logloss:0.132967+0.00078895
[341]	train-logloss:0.132266+0.000409976	test-logloss:0.132957+0.000788479
[342]	train-logloss:0.132254+0.000409976	test-logloss:0.132947+0.00078895
[343]	train-logloss:0.132239+0.000410228	test-logloss:0.132937+0.000790362
[344]	train-logloss:0.132224+0.00041202	test-logloss:0.132926+0.000789513
[345]	train-logloss:0.132209+0.0

[442]	train-logloss:0.131242+0.000427431	test-logloss:0.132238+0.00079302
[443]	train-logloss:0.131233+0.000425773	test-logloss:0.132232+0.00079422
[444]	train-logloss:0.131225+0.000426753	test-logloss:0.132227+0.000793444
[445]	train-logloss:0.131217+0.000425094	test-logloss:0.132223+0.000794644
[446]	train-logloss:0.131209+0.000425546	test-logloss:0.132218+0.00079422
[447]	train-logloss:0.131202+0.000423888	test-logloss:0.132213+0.000794949
[448]	train-logloss:0.131193+0.000423722	test-logloss:0.132208+0.000794644
[449]	train-logloss:0.131185+0.000426573	test-logloss:0.132204+0.000792883
[450]	train-logloss:0.131177+0.000425879	test-logloss:0.132198+0.000793354
[451]	train-logloss:0.131168+0.000424889	test-logloss:0.132194+0.00079434
[452]	train-logloss:0.131163+0.000423947	test-logloss:0.132191+0.00079434
[453]	train-logloss:0.131155+0.000423004	test-logloss:0.132187+0.000795069
[454]	train-logloss:0.131148+0.000423722	test-logloss:0.132182+0.00079434
[455]	train-logloss:0.131141+0.

[552]	train-logloss:0.130543+0.000423562	test-logloss:0.131854+0.000800173
[553]	train-logloss:0.130537+0.000424038	test-logloss:0.131851+0.000799492
[554]	train-logloss:0.130531+0.000423796	test-logloss:0.131848+0.000800173
[555]	train-logloss:0.130526+0.000424265	test-logloss:0.131846+0.000800222
[556]	train-logloss:0.130519+0.000424029	test-logloss:0.131842+0.000800222
[557]	train-logloss:0.130513+0.000422614	test-logloss:0.131839+0.000801633
[558]	train-logloss:0.130508+0.000422614	test-logloss:0.131837+0.000801633
[559]	train-logloss:0.130503+0.000424029	test-logloss:0.131834+0.000801423
[560]	train-logloss:0.130498+0.000423557	test-logloss:0.131832+0.000801683
[561]	train-logloss:0.130492+0.000422378	test-logloss:0.131828+0.000801893
[562]	train-logloss:0.130486+0.000423087	test-logloss:0.131825+0.000801633
[563]	train-logloss:0.130481+0.000423322	test-logloss:0.131823+0.000801633
[564]	train-logloss:0.130476+0.000422619	test-logloss:0.13182+0.000801374
[565]	train-logloss:0.1304

[663]	train-logloss:0.130007+0.000415782	test-logloss:0.131609+0.000810692
[664]	train-logloss:0.130003+0.000414843	test-logloss:0.131608+0.000810904
[665]	train-logloss:0.129999+0.000415088	test-logloss:0.131607+0.000810692
[666]	train-logloss:0.129996+0.000415799	test-logloss:0.131605+0.00081048
[667]	train-logloss:0.129992+0.000416981	test-logloss:0.131604+0.000810221
[668]	train-logloss:0.129988+0.000416734	test-logloss:0.131602+0.000810221
[669]	train-logloss:0.129983+0.000417196	test-logloss:0.131599+0.000809705
[670]	train-logloss:0.129979+0.000417434	test-logloss:0.131598+0.000810646
[671]	train-logloss:0.129975+0.000416019	test-logloss:0.131597+0.000810646
[672]	train-logloss:0.12997+0.000415553	test-logloss:0.131595+0.000810692
[673]	train-logloss:0.129966+0.000415096	test-logloss:0.131594+0.000810738
[674]	train-logloss:0.129962+0.000415577	test-logloss:0.131592+0.000810997
[675]	train-logloss:0.129959+0.000415818	test-logloss:0.131591+0.000811516
[676]	train-logloss:0.12995

[773]	train-logloss:0.129573+0.000416253	test-logloss:0.131443+0.000813728
[774]	train-logloss:0.12957+0.000416019	test-logloss:0.131442+0.000813728
[775]	train-logloss:0.129566+0.000416024	test-logloss:0.131441+0.000814245
[776]	train-logloss:0.129563+0.000416734	test-logloss:0.13144+0.000813774
[777]	train-logloss:0.129559+0.000417905	test-logloss:0.131439+0.000813516
[778]	train-logloss:0.129556+0.000418143	test-logloss:0.131437+0.000813941
[779]	train-logloss:0.129552+0.000418381	test-logloss:0.131436+0.000813728
[780]	train-logloss:0.129548+0.00041862	test-logloss:0.131434+0.000813258
[781]	train-logloss:0.129544+0.000419802	test-logloss:0.131434+0.00081347
[782]	train-logloss:0.129541+0.000420034	test-logloss:0.131432+0.000813258
[783]	train-logloss:0.129537+0.000420506	test-logloss:0.131431+0.000813
[784]	train-logloss:0.129533+0.000420506	test-logloss:0.13143+0.000813213
[785]	train-logloss:0.129529+0.000420281	test-logloss:0.131428+0.000813
[786]	train-logloss:0.129523+0.00041

[883]	train-logloss:0.129179+0.000411788	test-logloss:0.13131+0.000812826
[884]	train-logloss:0.129174+0.000411549	test-logloss:0.131308+0.000812355
[885]	train-logloss:0.129172+0.000411549	test-logloss:0.131308+0.000811884
[886]	train-logloss:0.12917+0.000411549	test-logloss:0.131308+0.000811884
[887]	train-logloss:0.129167+0.000410839	test-logloss:0.131307+0.000812355
[888]	train-logloss:0.129163+0.000410125	test-logloss:0.131305+0.000813082
[889]	train-logloss:0.129159+0.000411306	test-logloss:0.131304+0.000813082
[890]	train-logloss:0.129157+0.000411782	test-logloss:0.131304+0.000812612
[891]	train-logloss:0.129154+0.000411777	test-logloss:0.131303+0.000812868
[892]	train-logloss:0.129151+0.000411782	test-logloss:0.131302+0.000813082
[893]	train-logloss:0.129148+0.000411777	test-logloss:0.131301+0.000812612
[894]	train-logloss:0.129145+0.000410839	test-logloss:0.1313+0.000812397
[895]	train-logloss:0.129141+0.000410368	test-logloss:0.131299+0.000812868
[896]	train-logloss:0.129138+

[994]	train-logloss:0.12883+0.000419079	test-logloss:0.131209+0.00081197
[995]	train-logloss:0.128828+0.000420257	test-logloss:0.131208+0.000811499
[996]	train-logloss:0.128824+0.000420022	test-logloss:0.131207+0.00081197
[997]	train-logloss:0.128821+0.000418608	test-logloss:0.131206+0.000811499
[998]	train-logloss:0.128819+0.000418845	test-logloss:0.131206+0.000811499
[999]	train-logloss:0.128814+0.000418372	test-logloss:0.131205+0.000811242
[1000]	train-logloss:0.128811+0.000418137	test-logloss:0.131203+0.000810943
[1001]	train-logloss:0.128808+0.000419315	test-logloss:0.131203+0.000811199
[1002]	train-logloss:0.128804+0.000419079	test-logloss:0.131201+0.00081167
[1003]	train-logloss:0.128801+0.000420493	test-logloss:0.1312+0.000810687
[1004]	train-logloss:0.128798+0.000420494	test-logloss:0.131199+0.000810901
[1005]	train-logloss:0.128794+0.0004212	test-logloss:0.131198+0.00080996
[1006]	train-logloss:0.128793+0.000420729	test-logloss:0.131198+0.00080996
[1007]	train-logloss:0.12879

[1103]	train-logloss:0.128495+0.000419579	test-logloss:0.131121+0.000810273
[1104]	train-logloss:0.128492+0.000420273	test-logloss:0.13112+0.000810489
[1105]	train-logloss:0.128488+0.000420273	test-logloss:0.131119+0.000810489
[1106]	train-logloss:0.128484+0.0004205	test-logloss:0.131118+0.00080951
[1107]	train-logloss:0.128481+0.000420971	test-logloss:0.131117+0.000809981
[1108]	train-logloss:0.128478+0.000421676	test-logloss:0.131116+0.000809945
[1109]	train-logloss:0.128475+0.000422152	test-logloss:0.131116+0.000809728
[1110]	train-logloss:0.128472+0.00042121	test-logloss:0.131116+0.000809474
[1111]	train-logloss:0.12847+0.000420029	test-logloss:0.131115+0.000809692
[1112]	train-logloss:0.128467+0.000421681	test-logloss:0.131114+0.000809692
[1113]	train-logloss:0.128465+0.000422391	test-logloss:0.131114+0.000809439
[1114]	train-logloss:0.128461+0.00042192	test-logloss:0.131113+0.000809945
[1115]	train-logloss:0.128458+0.00042121	test-logloss:0.131112+0.000810416
[1116]	train-logloss

[1212]	train-logloss:0.128176+0.000421098	test-logloss:0.13105+0.000811723
[1213]	train-logloss:0.128173+0.000420402	test-logloss:0.131049+0.000811978
[1214]	train-logloss:0.12817+0.000421838	test-logloss:0.131048+0.000811292
[1215]	train-logloss:0.128166+0.000422062	test-logloss:0.131047+0.000811076
[1216]	train-logloss:0.128164+0.000422287	test-logloss:0.131047+0.000811076
[1217]	train-logloss:0.128161+0.000421838	test-logloss:0.131046+0.000811292
[1218]	train-logloss:0.128157+0.000422085	test-logloss:0.131045+0.000810861
[1219]	train-logloss:0.128155+0.000420424	test-logloss:0.131045+0.000810861
[1220]	train-logloss:0.128152+0.000421815	test-logloss:0.131044+0.000810861
[1221]	train-logloss:0.128149+0.000422287	test-logloss:0.131044+0.000810605
[1222]	train-logloss:0.128145+0.000421569	test-logloss:0.131043+0.000810821
[1223]	train-logloss:0.128142+0.000422041	test-logloss:0.131042+0.000810821
[1224]	train-logloss:0.12814+0.000421795	test-logloss:0.131042+0.000810821
[1225]	train-lo

[1321]	train-logloss:0.127874+0.000427096	test-logloss:0.130989+0.000808462
[1322]	train-logloss:0.127871+0.000426622	test-logloss:0.130989+0.000807991
[1323]	train-logloss:0.127868+0.000426859	test-logloss:0.130989+0.000808244
[1324]	train-logloss:0.127866+0.000427096	test-logloss:0.130989+0.000808026
[1325]	train-logloss:0.127863+0.000427567	test-logloss:0.130988+0.000807773
[1326]	train-logloss:0.127862+0.00042733	test-logloss:0.130988+0.000808026
[1327]	train-logloss:0.127859+0.00042851	test-logloss:0.130987+0.000807808
[1328]	train-logloss:0.127857+0.000428273	test-logloss:0.130987+0.00080759
[1329]	train-logloss:0.127854+0.000428271	test-logloss:0.130986+0.000807808
[1330]	train-logloss:0.127851+0.000427565	test-logloss:0.130986+0.000807555
[1331]	train-logloss:0.127848+0.000427096	test-logloss:0.130985+0.000808497
[1332]	train-logloss:0.127846+0.000428042	test-logloss:0.130985+0.000808026
[1333]	train-logloss:0.127844+0.000428744	test-logloss:0.130985+0.000808026
[1334]	train-lo

[1430]	train-logloss:0.12759+0.000424976	test-logloss:0.130938+0.000816178
[1431]	train-logloss:0.127587+0.000425447	test-logloss:0.130938+0.000815961
[1432]	train-logloss:0.127585+0.000424973	test-logloss:0.130937+0.000816178
[1433]	train-logloss:0.127583+0.0004245	test-logloss:0.130937+0.000815923
[1434]	train-logloss:0.12758+0.000423557	test-logloss:0.130936+0.000816178
[1435]	train-logloss:0.127577+0.000423557	test-logloss:0.130935+0.000816394
[1436]	train-logloss:0.127575+0.000423086	test-logloss:0.130935+0.000816178
[1437]	train-logloss:0.127572+0.000423557	test-logloss:0.130935+0.000815961
[1438]	train-logloss:0.127569+0.000423324	test-logloss:0.130935+0.000815961
[1439]	train-logloss:0.127566+0.000423328	test-logloss:0.130933+0.000815745
[1440]	train-logloss:0.127563+0.000424509	test-logloss:0.130932+0.00081502
[1441]	train-logloss:0.12756+0.000425219	test-logloss:0.130932+0.000814803
[1442]	train-logloss:0.127557+0.000424277	test-logloss:0.130931+0.00081502
[1443]	train-loglos

[1539]	train-logloss:0.127308+0.000422659	test-logloss:0.13089+0.000816756
[1540]	train-logloss:0.127306+0.000422187	test-logloss:0.130889+0.000817009
[1541]	train-logloss:0.127304+0.000422429	test-logloss:0.130889+0.000817227
[1542]	train-logloss:0.127301+0.000422214	test-logloss:0.130888+0.000816756
[1543]	train-logloss:0.127298+0.00042223	test-logloss:0.130888+0.000816503
[1544]	train-logloss:0.127295+0.000422003	test-logloss:0.130887+0.000816974
[1545]	train-logloss:0.127292+0.000422929	test-logloss:0.130886+0.000816067
[1546]	train-logloss:0.12729+0.000422701	test-logloss:0.130885+0.000816067
[1547]	train-logloss:0.127288+0.000422457	test-logloss:0.130885+0.000815814
[1548]	train-logloss:0.127285+0.000423628	test-logloss:0.130884+0.000815596
[1549]	train-logloss:0.127282+0.000424328	test-logloss:0.130884+0.000815125
[1550]	train-logloss:0.12728+0.00042527	test-logloss:0.130883+0.000814655
[1551]	train-logloss:0.127277+0.000424343	test-logloss:0.130883+0.000815125
[1552]	train-logl

[1648]	train-logloss:0.127036+0.000422945	test-logloss:0.13085+0.000816828
[1649]	train-logloss:0.127035+0.000422474	test-logloss:0.130851+0.000817299
[1650]	train-logloss:0.127032+0.000422457	test-logloss:0.13085+0.000817516
[1651]	train-logloss:0.12703+0.000423856	test-logloss:0.13085+0.000817299
[1652]	train-logloss:0.127027+0.000423614	test-logloss:0.13085+0.000817299
[1653]	train-logloss:0.127023+0.000423372	test-logloss:0.130849+0.000817553
[1654]	train-logloss:0.127021+0.000422685	test-logloss:0.130849+0.000817553
[1655]	train-logloss:0.127019+0.000422442	test-logloss:0.130849+0.000817553
[1656]	train-logloss:0.127016+0.000421271	test-logloss:0.130848+0.00081777
[1657]	train-logloss:0.127013+0.0004215	test-logloss:0.130848+0.00081777
[1658]	train-logloss:0.127011+0.000422457	test-logloss:0.130848+0.00081777
[1659]	train-logloss:0.127008+0.0004234	test-logloss:0.130848+0.000817299
[1660]	train-logloss:0.127006+0.0004234	test-logloss:0.130848+0.000818024
[1661]	train-logloss:0.127

[1757]	train-logloss:0.126774+0.000428315	test-logloss:0.130822+0.000814803
[1758]	train-logloss:0.126771+0.000429257	test-logloss:0.130822+0.000814803
[1759]	train-logloss:0.126768+0.000429971	test-logloss:0.130822+0.000814511
[1760]	train-logloss:0.126765+0.00043043	test-logloss:0.130821+0.000814294
[1761]	train-logloss:0.126763+0.00043042	test-logloss:0.130821+0.000814765
[1762]	train-logloss:0.126761+0.000429949	test-logloss:0.130821+0.000814765
[1763]	train-logloss:0.126759+0.000429488	test-logloss:0.130821+0.000814765
[1764]	train-logloss:0.126757+0.000429949	test-logloss:0.130821+0.000814549
[1765]	train-logloss:0.126754+0.000429006	test-logloss:0.13082+0.000814765
[1766]	train-logloss:0.126752+0.000429718	test-logloss:0.130819+0.00081404
[1767]	train-logloss:0.12675+0.000428775	test-logloss:0.130819+0.000815198
[1768]	train-logloss:0.126748+0.000428063	test-logloss:0.130818+0.000815198
[1769]	train-logloss:0.126746+0.000428055	test-logloss:0.130817+0.000814982
[1770]	train-logl

[1866]	train-logloss:0.126511+0.000423628	test-logloss:0.130791+0.000816255
[1867]	train-logloss:0.126509+0.000424343	test-logloss:0.130791+0.000816981
[1868]	train-logloss:0.126506+0.000424328	test-logloss:0.130791+0.000816726
[1869]	train-logloss:0.126504+0.00042527	test-logloss:0.13079+0.000816511
[1870]	train-logloss:0.126501+0.000425042	test-logloss:0.13079+0.000816981
[1871]	train-logloss:0.126499+0.000425513	test-logloss:0.13079+0.000816981
[1872]	train-logloss:0.126497+0.000426228	test-logloss:0.13079+0.000816981
[1873]	train-logloss:0.126494+0.000426489	test-logloss:0.130789+0.000816255
[1874]	train-logloss:0.126491+0.000426228	test-logloss:0.130789+0.000816726
[1875]	train-logloss:0.126491+0.000426	test-logloss:0.130789+0.000816
[1876]	train-logloss:0.126487+0.000426244	test-logloss:0.130789+0.000816726
[1877]	train-logloss:0.126485+0.000426244	test-logloss:0.130788+0.000816471
[1878]	train-logloss:0.126482+0.000424604	test-logloss:0.130787+0.00081861
[1879]	train-logloss:0.1

[1975]	train-logloss:0.126248+0.000416953	test-logloss:0.130765+0.000822027
[1976]	train-logloss:0.126246+0.00041601	test-logloss:0.130765+0.000822754
[1977]	train-logloss:0.126243+0.000416704	test-logloss:0.130765+0.000822498
[1978]	train-logloss:0.12624+0.000416482	test-logloss:0.130765+0.000822498
[1979]	train-logloss:0.126238+0.000416455	test-logloss:0.130765+0.000822498
[1980]	train-logloss:0.126236+0.000417397	test-logloss:0.130764+0.000822027
[1981]	train-logloss:0.126234+0.000418117	test-logloss:0.130764+0.000822027
[1982]	train-logloss:0.126232+0.000417175	test-logloss:0.130764+0.000822284
[1983]	train-logloss:0.12623+0.000416731	test-logloss:0.130764+0.000821556
[1984]	train-logloss:0.126229+0.000417646	test-logloss:0.130765+0.000821813
[1985]	train-logloss:0.126227+0.000416953	test-logloss:0.130765+0.000821813
[1986]	train-logloss:0.126224+0.00041626	test-logloss:0.130764+0.000821556
[1987]	train-logloss:0.126222+0.00041601	test-logloss:0.130764+0.000821556
[1988]	train-logl



In [8]:
# 开始预测
test_df.loc[:,'predicted_score'] = xgbModel.predict(test_df[fea].values)
print('predicting time: ', datetime.now()-startTime)
print("预测结果：\n",test_df[['instance_id','predicted_score']].head())
print('预测均值：', test_df['predicted_score'].mean())


predicting time:  5:14:29.383014
预测结果：
        instance_id  predicted_score
0   93294255633855         0.056790
1  558322259509454         0.007797
2  594573634113186         0.024006
3  667327653735176         0.071207
4  697732672924394         0.017587
预测均值： 0.04385653883218765


In [9]:
# 生成stacking数据集
train_df['predicted_score'] = np.nan
test_df['predicted_score'] = np.nan
train_df.loc[:,'predicted_score'], test_df.loc[:,'predicted_score'] = getOof(xgbModel, train_df[fea].values, train_df['is_trade'].values, test_df[fea].values, stratify=True)
print('oof training time: ', datetime.now()-startTime)
xgbModel.getFeaScore(show=True)
cost = metrics.log_loss(train_df['is_trade'].values, train_df['predicted_score'].values)
print('train loss: ', cost)



KeyboardInterrupt: 

In [None]:
print('7th train loss', metrics.log_loss(train_df.loc[train_df.is_special == 1,'is_trade'].values, train_df.loc[train_df.is_special == 1,'predicted_score'].values))
print('train predict: \n',train_df[['instance_id','predicted_score']].head())
print('train predict aver:', train_df['predicted_score'].mean())
print('7th train predict aver:', train_df.loc[train_df.is_special == 1,'predicted_score'].mean())
print('test predict: \n',test_df[['instance_id','predicted_score']].head())
print('test predict aver:', test_df['predicted_score'].mean())


In [11]:
test_df_b = pd.read_csv('~/yuna/alimama/data/round2_ijcai_18_test_b_20180510.txt', sep=' ')
print(len(test_df_b))
print(len(test_df_b.columns.values))


1209768
26


In [12]:
print(len(test_df.columns.values))
test_df_b = pd.merge(test_df_b[['instance_id']], test_df, on=['instance_id'], how='left')
print(len(test_df_b))
print(len(test_df_b.columns.values))
print(test_df_b.columns.values)


104
1209768
104
['instance_id' 'item_id' 'item_brand_id' 'item_city_id' 'item_price_level'
 'item_sales_level' 'item_collected_level' 'item_pv_level' 'user_id'
 'user_gender_id' 'user_age_level' 'user_occupation_id' 'user_star_level'
 'context_id' 'context_timestamp' 'context_page_id'
 'predict_category_property' 'shop_id' 'shop_review_num_level'
 'shop_review_positive_rate' 'shop_star_level' 'shop_score_service'
 'shop_score_delivery' 'shop_score_description' 'date' 'weekday' 'day'
 'hour' 'prop_jaccard' 'prop_predict_ratio' 'prop_item_ratio'
 'match_category_proportion' 'match_property_proportion'
 'predict_category_number' 'predict_property_number' 'isFirstCategoryIn'
 'isLastCategoryIn' 'category_number' 'property_number'
 'real_first_category' 'real_last_category'
 'all_item_brand_id_click_number' 'all_item_brand_id_buy_number'
 'history_item_brand_id_rate' 'history_item_brand_id_smooth_rate'
 'all_shop_id_click_number' 'all_shop_id_buy_number' 'history_shop_id_rate'
 'history_sho

In [13]:
# 导出预测结果
def exportResult(df, fileName):
    df.to_csv('~/kengkeng/alimama/result/%s.txt' % fileName, sep=' ', header=True, index=False)

exportResult(test_df_b[['instance_id', 'predicted_score', 'hour']], 'fusai_b_xgb_5_15_normal')


In [15]:
print(test_df_b[['instance_id', 'predicted_score', 'hour']].head(10))

        instance_id  predicted_score  hour
0    74080508196716              NaN    17
1   204576715383250              NaN    16
2   339754209266814              NaN    21
3  1232471723234702              NaN    14
4  1432327895640503              NaN    21
5  1582628758469245              NaN    23
6  1779564246041746              NaN    18
7  1796710634742147              NaN    17
8  1842191994190388              NaN    21
9  1923760687276623              NaN    18
