In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy.stats import mode
import csv
import matplotlib.dates
from datetime import *
from sklearn.preprocessing import *
from sklearn import ensemble
import xgboost as xgb
from sklearn import metrics
from xgboost.sklearn import XGBClassifier  
from sklearn.model_selection import GridSearchCV,cross_val_score  
import matplotlib.pylab as plt  

from sklearn.preprocessing import *
import xgboost as xgb
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.externals import joblib


In [2]:
class XgbModel:
    def __init__(self, feaNames=None, params={}):
        self.feaNames = feaNames
        self.params = {
            'objective': 'binary:logistic',
            'eval_metric':'logloss',
            'silent': True,
            'eta': 0.1,
            'max_depth': 4,
            'gamma': 0.5,
            'subsample': 0.95,
            'colsample_bytree': 1,
            'min_child_weight': 8,
            'max_delta_step': 5,
            'lambda': 100,
        }
        for k,v in params.items():
            self.params[k] = v
        self.clf = None

    def train(self, X, y, train_size=1, test_size=0.1, verbose=True, num_boost_round=1000, early_stopping_rounds=3):
        X = X.astype(float)
        if train_size==1:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
            X_train, y_train = X, y
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)
        dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=self.feaNames)
        dval = xgb.DMatrix(X_test, label=y_test, feature_names=self.feaNames)
        watchlist = [(dtrain,'train'),(dval,'val')]
        clf = xgb.train(
            self.params, dtrain, 
            num_boost_round = num_boost_round, 
            evals = watchlist, 
            early_stopping_rounds = early_stopping_rounds,
            verbose_eval=verbose
        )
        self.clf = clf

    def trainCV(self, X, y, nFold=3, verbose=True, num_boost_round=1500, early_stopping_rounds=10):
        X = X.astype(float)
        dtrain = xgb.DMatrix(X, label=y, feature_names=self.feaNames)
        cvResult = xgb.cv(
            self.params, dtrain, 
            num_boost_round = num_boost_round, 
            nfold = nFold,
            early_stopping_rounds = early_stopping_rounds,
            verbose_eval=verbose
        )
        clf = xgb.train(
            self.params, dtrain, 
            num_boost_round = cvResult.shape[0], 
        )
        self.clf = clf

    def gridSearch(self, X, y, nFold=3, verbose=1, num_boost_round=130):
        paramsGrids = {
            # 'n_estimators': [50+5*i for i in range(0,30)],
            'gamma': [0,0.01,0.05,0.1,0.5,1,5,10,50,100],
            # 'max_depth': list(range(3,10)),
            'min_child_weight': list(range(0,10)),
            'subsample': [1-0.05*i for i in range(0,8)],
            'colsample_bytree': [1-0.05*i for i in range(0,10)],
            # 'reg_alpha': [0+2*i for i in range(0,10)],
            'reg_lambda': [0+50*i for i in range(0,10)],            
            'max_delta_step': [0+1*i for i in range(0,8)],
        }
        for k,v in paramsGrids.items():
            gsearch = GridSearchCV(
                estimator = xgb.XGBClassifier(
                    max_depth = self.params['max_depth'], 
                    gamma = self.params['gamma'],
                    learning_rate = self.params['eta'],
                    max_delta_step = self.params['max_delta_step'],
                    min_child_weight = self.params['min_child_weight'],
                    subsample = self.params['subsample'],
                    colsample_bytree = self.params['colsample_bytree'],
                    silent = self.params['silent'],
                    reg_lambda = self.params['lambda'],
                    n_estimators = num_boost_round
                ),
                # param_grid = paramsGrids,
                param_grid = {k:v},
                scoring = 'neg_log_loss',
                cv = nFold,
                verbose = verbose,
                n_jobs = 4
            )
            gsearch.fit(X, y)
            print(pd.DataFrame(gsearch.cv_results_))
            print(gsearch.best_params_)
        exit()

    def predict(self, X):
        X = X.astype(float)
        return self.clf.predict(xgb.DMatrix(X, feature_names=self.feaNames))

    def getFeaScore(self, show=False):
        fscore = self.clf.get_score()
        feaNames = fscore.keys()
        scoreDf = pd.DataFrame(index=feaNames, columns=['importance'])
        for k,v in fscore.items():
            scoreDf.loc[k, 'importance'] = v
        if show:
            print(scoreDf.sort_index(by=['importance'], ascending=False))
        return scoreDf

# 划分训练集和测试集
def trainTestSplit(df, splitDate=pd.to_datetime('2018-09-23'), trainPeriod=3, testPeriod=1):
    trainDf = df[(df.context_timestamp<splitDate)&(df.context_timestamp>=splitDate-timedelta(days=trainPeriod))]
    testDf = df[(df.context_timestamp>=splitDate)&(df.context_timestamp<splitDate+timedelta(days=testPeriod))]
    return (trainDf, testDf)


# 统计预测误差
def countDeltaY(predictSeries, labelSeries, show=True, title='', subplot=None):
    deltaSeries = predictSeries - labelSeries
    if subplot!=None:
        plt.subplot(subplot[0], subplot[1], subplot[2])
    deltaSeries.plot(style='b-')
    plt.title(title)
    if show:
        plt.show()
    return deltaSeries

# 获取stacking下一层数据集
def getOof(clf, trainX, trainY, testX, nFold=5, stratify=False):
    oofTrain = np.zeros(trainX.shape[0])
    oofTest = np.zeros(testX.shape[0])
    oofTestSkf = np.zeros((testX.shape[0], nFold))
    if stratify:
        kf = StratifiedKFold(n_splits=nFold, shuffle=True)
    else:
        kf = KFold(n_splits=nFold, shuffle=True)
    for i, (trainIdx, testIdx) in enumerate(kf.split(trainX, trainY)):
        kfTrainX = trainX[trainIdx]
        kfTrainY = trainY[trainIdx]
        kfTestX = trainX[testIdx]
        clf.trainCV(kfTrainX, kfTrainY, verbose=False)
        oofTrain[testIdx] = clf.predict(kfTestX)
        oofTestSkf[:,i] = clf.predict(testX)
    oofTest[:] = oofTestSkf.mean(axis=1)
    return oofTrain, oofTest


In [3]:
#首先导入滑窗训练集数据进行线下数据集划分
train_df = pd.read_csv('~/kengkeng/alimama/data/fusai_a_train_df_weilai.csv')
test_df = pd.read_csv('~/kengkeng/alimama/data/fusai_a_test_df_weilai.csv')

print(train_df.info())
print(test_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1629767 entries, 0 to 1629766
Data columns (total 89 columns):
instance_id                                1629767 non-null int64
item_id                                    1629767 non-null int64
item_brand_id                              1629767 non-null int64
item_city_id                               1629767 non-null int64
item_price_level                           1629767 non-null int64
item_sales_level                           1629767 non-null int64
item_collected_level                       1629767 non-null int64
item_pv_level                              1629767 non-null int64
user_id                                    1629767 non-null int64
user_gender_id                             1629767 non-null int64
user_age_level                             1629767 non-null int64
user_occupation_id                         1629767 non-null int64
user_star_level                            1629767 non-null int64
context_id                   

In [None]:
print(train_df.columns.values)


In [4]:
fea = [
       
     'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
     'item_collected_level', 'item_pv_level', 'user_gender_id',
     'user_age_level', 'user_occupation_id', 'user_star_level',
     'context_page_id', 
     'shop_review_num_level', 'shop_review_positive_rate',
     'shop_star_level', 'shop_score_service', 'shop_score_delivery',
     'shop_score_description',
     'match_category_proportion', 'match_property_proportion', 'predict_category_number',
     'predict_property_number', #'isLastCategoryIn', 'isFirstCategoryIn',
     'category_number', 'property_number',
     'all_item_brand_id_click_number',
     'all_item_brand_id_buy_number',
     'history_item_brand_id_smooth_rate', 'all_shop_id_click_number',
     'all_shop_id_buy_number',
     'history_shop_id_smooth_rate', 'all_item_id_click_number',
     'all_item_id_buy_number',
     'history_item_id_smooth_rate', 'lastOneHour_sameItem_count',
     'lastOneHour_sameFirstCategory_count', 'lastOneHour_sameLastCategory_count',
     'lastOneHour_sameBrand_count', 'lastOneHour_sameShop_count',
     'isLastOneHour_firstClickItem',
#      'is_special', 'hour', 
     'hour_rate', 
     'userItem_lastClickDeltaTime',
     'userBrand_lastClickDeltaTime', 'userShop_lastClickDeltaTime',
     'userFirstCategory_lastClickDeltaTime',
     'userLastCategory_lastClickDeltaTime', 
     'user_id_converse_smooth_rate', 'user_id_total_number', 'user_id_buy_number',
     'item_brand_id_converse_smooth_rate', 'item_brand_id_total_number',
     'item_brand_id_buy_number', 'item_id_converse_smooth_rate',
     'item_id_total_number', 'item_id_buy_number', 'shop_id_converse_smooth_rate',
     'shop_id_total_number', 'shop_id_buy_number',
     'real_last_category_converse_smooth_rate',
     'real_last_category_total_number', 'real_last_category_buy_number',
     'is_later_clickSameItem', 'is_later_clickSameLastCategory', 
     'later_clickSameItem_count', 'later_clickSameLastCategory_count',
     'later_clickSameItem_deltaTime', 'later_clickSameLastCategory_deltaTime',
     'shop_item_classNumber', 'brand_item_classNumber', 'city_item_classNumber',
     'shop_user_classNumber', 'brand_user_classNumber', 'city_user_classNumber',
     'item_price_level_mean', 'item_price_level_max', 'item_price_level_min', 
     'item_price_level_median', 'item_price_level_mode', 'item_sales_level_mean',
     'item_sales_level_max', 'item_sales_level_min', 'item_sales_level_median', 'item_sales_level_mode'
    
      ]

print(len(fea))


66


In [5]:
train_df_7 = train_df[train_df.day == 7]
print(len(train_df_7))


1077175


In [6]:
xgbModel = XgbModel(feaNames=fea)
modelName = "xgb_fusai_a"

# 正式模型
startTime = datetime.now()
xgbModel.trainCV(train_df_7[fea].values, train_df_7['is_trade'].values)
xgbModel.getFeaScore(show=True)
print('training time: ', datetime.now()-startTime)


[0]	train-logloss:0.614744+5.40206e-05	test-logloss:0.614745+4.32538e-05
[1]	train-logloss:0.550636+0.000113528	test-logloss:0.550643+7.32985e-05
[2]	train-logloss:0.49741+0.0001625	test-logloss:0.497421+0.000106199
[3]	train-logloss:0.452716+0.000199707	test-logloss:0.452736+0.000139774
[4]	train-logloss:0.414861+0.000224559	test-logloss:0.41488+0.000188981
[5]	train-logloss:0.382594+0.000250754	test-logloss:0.382618+0.000228598
[6]	train-logloss:0.354939+0.000272906	test-logloss:0.354969+0.000265974
[7]	train-logloss:0.331136+0.00029379	test-logloss:0.331174+0.000299823
[8]	train-logloss:0.310588+0.000312513	test-logloss:0.31063+0.000337127
[9]	train-logloss:0.292796+0.000328297	test-logloss:0.292846+0.000365911
[10]	train-logloss:0.277372+0.000340743	test-logloss:0.277431+0.000405451
[11]	train-logloss:0.26398+0.000358986	test-logloss:0.26404+0.000431023
[12]	train-logloss:0.252336+0.000368446	test-logloss:0.252401+0.000465365
[13]	train-logloss:0.242211+0.000385939	test-logloss:0.2

[112]	train-logloss:0.171347+0.000453683	test-logloss:0.172418+0.000916507
[113]	train-logloss:0.171325+0.000457669	test-logloss:0.172402+0.00091412
[114]	train-logloss:0.1713+0.000455821	test-logloss:0.172387+0.000912488
[115]	train-logloss:0.171272+0.000463247	test-logloss:0.172366+0.000902579
[116]	train-logloss:0.171251+0.000463751	test-logloss:0.172351+0.000903036
[117]	train-logloss:0.17123+0.000465859	test-logloss:0.172334+0.000898008
[118]	train-logloss:0.171206+0.000468093	test-logloss:0.172324+0.000896888
[119]	train-logloss:0.171191+0.00046547	test-logloss:0.172316+0.000898918
[120]	train-logloss:0.171171+0.000457223	test-logloss:0.172304+0.000900589
[121]	train-logloss:0.171149+0.000458264	test-logloss:0.172293+0.00090055
[122]	train-logloss:0.171126+0.000455527	test-logloss:0.172277+0.000901982
[123]	train-logloss:0.171098+0.000460761	test-logloss:0.17226+0.000895773
[124]	train-logloss:0.171075+0.000455527	test-logloss:0.172242+0.000901455
[125]	train-logloss:0.171053+0.0

[222]	train-logloss:0.169719+0.000448724	test-logloss:0.171596+0.00088835
[223]	train-logloss:0.169705+0.000449228	test-logloss:0.171591+0.000889175
[224]	train-logloss:0.169694+0.000450175	test-logloss:0.171587+0.000888783
[225]	train-logloss:0.169679+0.000454289	test-logloss:0.17158+0.0008847
[226]	train-logloss:0.169669+0.000453793	test-logloss:0.171577+0.000883884
[227]	train-logloss:0.169657+0.000455658	test-logloss:0.171574+0.000881018
[228]	train-logloss:0.169646+0.000455328	test-logloss:0.171572+0.000880603
[229]	train-logloss:0.169634+0.00045353	test-logloss:0.171566+0.000881442
[230]	train-logloss:0.169624+0.000450636	test-logloss:0.171565+0.000881849
[231]	train-logloss:0.169615+0.000451085	test-logloss:0.171562+0.000880212
[232]	train-logloss:0.169605+0.000450557	test-logloss:0.17156+0.000879392
[233]	train-logloss:0.16959+0.00045072	test-logloss:0.171553+0.000879403
[234]	train-logloss:0.169581+0.000452688	test-logloss:0.17155+0.000879403
[235]	train-logloss:0.169573+0.000

[332]	train-logloss:0.168682+0.000438963	test-logloss:0.171304+0.000880603
[333]	train-logloss:0.168673+0.000437744	test-logloss:0.171302+0.00088101
[334]	train-logloss:0.168663+0.000437827	test-logloss:0.171299+0.000881003
[335]	train-logloss:0.168656+0.000434563	test-logloss:0.171298+0.000880595
[336]	train-logloss:0.16865+0.000434563	test-logloss:0.171297+0.000883045
[337]	train-logloss:0.168639+0.000434606	test-logloss:0.171296+0.000881824
[338]	train-logloss:0.16863+0.000430974	test-logloss:0.171293+0.000882645
[339]	train-logloss:0.168622+0.000432298	test-logloss:0.171292+0.000883459
[340]	train-logloss:0.168617+0.000431463	test-logloss:0.17129+0.000883465
[341]	train-logloss:0.16861+0.000435054	test-logloss:0.171288+0.000882651
[342]	train-logloss:0.168598+0.000433114	test-logloss:0.171287+0.000881012
[343]	train-logloss:0.168595+0.000434347	test-logloss:0.171287+0.000881018
[344]	train-logloss:0.168584+0.000432677	test-logloss:0.171286+0.000878982
[345]	train-logloss:0.168576+0

[442]	train-logloss:0.167866+0.000416795	test-logloss:0.171156+0.000875628
[443]	train-logloss:0.167859+0.000418686	test-logloss:0.171154+0.000875226
[444]	train-logloss:0.16785+0.000417952	test-logloss:0.171153+0.000873594
[445]	train-logloss:0.167841+0.000417098	test-logloss:0.171151+0.000873635
[446]	train-logloss:0.167835+0.000417019	test-logloss:0.17115+0.000872819
[447]	train-logloss:0.167828+0.000418806	test-logloss:0.171149+0.000872432
[448]	train-logloss:0.16782+0.000420643	test-logloss:0.171147+0.000870815
[449]	train-logloss:0.167812+0.000422446	test-logloss:0.171148+0.000872819
[450]	train-logloss:0.167805+0.000421028	test-logloss:0.171146+0.000872017
[451]	train-logloss:0.167795+0.000420392	test-logloss:0.171145+0.000871631
[452]	train-logloss:0.167789+0.000417833	test-logloss:0.171143+0.000872877
[453]	train-logloss:0.167781+0.000419112	test-logloss:0.171144+0.000872908
[454]	train-logloss:0.167775+0.000418379	test-logloss:0.171143+0.000873293
[455]	train-logloss:0.167767

[552]	train-logloss:0.167092+0.000411428	test-logloss:0.171061+0.000869816
                                      importance
item_sales_level                             368
later_clickSameLastCategory_deltaTime        346
history_item_id_smooth_rate                  313
property_number                              256
shop_score_description                       248
userFirstCategory_lastClickDeltaTime         245
shop_score_service                           214
all_item_id_click_number                     212
shop_score_delivery                          209
user_star_level                              207
history_shop_id_smooth_rate                  206
item_brand_id                                201
userItem_lastClickDeltaTime                  198
shop_review_positive_rate                    196
shop_id_converse_smooth_rate                 194
item_id_converse_smooth_rate                 189
item_collected_level                         187
all_shop_id_click_number                   



In [7]:
# 开始预测
test_df.loc[:,'predicted_score'] = xgbModel.predict(test_df[fea].values)
print('predicting time: ', datetime.now()-startTime)
print("预测结果：\n",test_df[['instance_id','predicted_score']].head())
print('预测均值：', test_df['predicted_score'].mean())


predicting time:  0:19:15.879917
预测结果：
        instance_id  predicted_score
0   93294255633855         0.056465
1  558322259509454         0.010791
2  594573634113186         0.034347
3  667327653735176         0.044280
4  697732672924394         0.038116
预测均值： 0.05340076610445976


In [8]:
# 生成stacking数据集
train_df_7['predicted_score'] = np.nan
test_df['predicted_score'] = np.nan
train_df_7.loc[:,'predicted_score'], test_df.loc[:,'predicted_score'] = getOof(xgbModel, train_df_7[fea].values, train_df_7['is_trade'].values, test_df[fea].values, stratify=True)
print('oof training time: ', datetime.now()-startTime)
xgbModel.getFeaScore(show=True)
cost = metrics.log_loss(train_df_7['is_trade'].values, train_df_7['predicted_score'].values)
print('train loss: ', cost)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


oof training time:  1:22:06.595173
                                      importance
item_sales_level                             357
history_item_id_smooth_rate                  350
later_clickSameLastCategory_deltaTime        348
shop_score_description                       292
shop_score_delivery                          270
property_number                              247
item_brand_id                                241
userFirstCategory_lastClickDeltaTime         236
history_shop_id_smooth_rate                  233
shop_review_positive_rate                    226
all_item_id_click_number                     221
user_star_level                              218
userItem_lastClickDeltaTime                  213
shop_id_converse_smooth_rate                 210
shop_score_service                           204
item_id_converse_smooth_rate                 190
item_collected_level                         183
all_shop_id_click_number                     173
history_item_brand_id_smooth_rate 

In [9]:
print('7th train loss', metrics.log_loss(train_df_7['is_trade'].values, train_df_7['predicted_score'].values))
print('7th train predict aver:', train_df_7['predicted_score'].mean())
print('test predict: \n',test_df[['instance_id','predicted_score']].head())
print('test predict aver:', test_df['predicted_score'].mean())


7th train loss 0.170961561088
7th train predict aver: 0.04608211032625805
test predict: 
        instance_id  predicted_score
0   93294255633855         0.053101
1  558322259509454         0.010546
2  594573634113186         0.032651
3  667327653735176         0.043970
4  697732672924394         0.035290
test predict aver: 0.05341672552241463


In [10]:
# 导出预测结果
def exportResult(df, fileName):
    df.to_csv('~/kengkeng/alimama/result/%s.txt' % fileName, sep=' ', header=True, index=False)

exportResult(test_df[['instance_id', 'predicted_score']], 'fusai_xgb_5_7_wen')
