In [1]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [2]:
# Label column shared by every model trained in this notebook.
target = 'is_trade'

# File (inside feature_data_path) that stores the table with all engineered features.
ALL_DATA_FILENAME = 'all_data_all_features.pkl'

all_data_path = feature_data_path + ALL_DATA_FILENAME
all_data = load_pickle(all_data_path)

In [3]:
# Names of the columns of `all_data` that are fed to the models below.
# Entries that are commented out are candidate features that are currently excluded.
# The section headers ("2_1", "2_2", ...) refer to the processing step that produced the group.
features = [
    # Basic features after preprocessing ------------------------------
#     'instance_id',
     'item_id',
     'item_brand_id',
     'item_city_id',
     'item_price_level',
     'item_sales_level',
     'item_collected_level',
     'item_pv_level',
     'user_id',
     'user_gender_id',
     'user_age_level',
     'user_occupation_id',
     'user_star_level',
     'context_id',
     'context_timestamp',
     'context_page_id',
     'shop_id',
     'shop_review_num_level',
     'shop_review_positive_rate',
     'shop_star_level',
     'shop_score_service',
     'shop_score_delivery',
     'shop_score_description',
#     'is_trade',
     'day',
     'hour',
#     'minute',
     'category2_label',
     'item_property_list0',
     'item_property_list1',
     'item_property_list2',
     'item_property_list3',
     'item_property_list4',
     'item_property_list5',
     'item_property_list6',
     'item_property_list7',
    #  Features after 2_1 processing
    #  Per-user click counts on each attribute for the current day
    #     ['item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]
     'user_item_id_click_day',
     'user_item_brand_id_click_day',
#     'user_item_city_id_click_day',
     'user_category2_label_click_day',
     'user_item_price_level_click_day',
     'user_item_sales_level_click_day',
#     'user_item_collected_level_click_day',
#     'user_item_pv_level_click_day',
     'user_context_page_id_click_day',
     'user_shop_id_click_day',
#     'user_shop_review_num_level_click_day',
#     'user_shop_star_level_click_day',
    
    #  Features after 2_5 processing
    #  Number of clicks by user_id on each listed attribute during the previous day
    #  ['item_id', 'item_brand_id', 'shop_id', 'category2_label',]
     'user_id_item_id_day_I',
     'user_id_item_id_day_C',
     'user_id_item_brand_id_day_I',
     'user_id_item_brand_id_day_C',
     'user_id_shop_id_day_I',
     'user_id_shop_id_day_C',
     'user_id_category2_label_day_I',
     'user_id_category2_label_day_C',
     'user_id_item_price_level_day_I',
     'user_id_item_price_level_day_C',
    #  Features after 2_5 processing
    #  Historical number of clicks by user_id on each listed attribute
    #  ['item_id', 'item_brand_id', 'shop_id', 'category2_label',]
     'user_id_item_id_history_I',
     'user_id_item_id_history_C',
     'user_id_item_brand_id_history_I',
     'user_id_item_brand_id_history_C',
     'user_id_shop_id_history_I',
     'user_id_shop_id_history_C',
     'user_id_category2_label_history_I',
     'user_id_category2_label_history_C',
     'user_id_item_price_level_history_I',
     'user_id_item_price_level_history_C',
    
    #  Features after 2_1 processing
    #  Per-user click counts on each attribute for the current day and hour
    #     ['item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]
     'user_item_id_click_hour',
     'user_item_brand_id_click_hour',
#     'user_item_city_id_click_hour',
     'user_category2_label_click_hour',
     'user_item_price_level_click_hour',
     'user_item_sales_level_click_hour',
#     'user_item_collected_level_click_hour',
#     'user_item_pv_level_click_hour',
     'user_context_page_id_click_hour',
     'user_shop_id_click_hour',
#     'user_shop_review_num_level_click_hour',
#     'user_shop_star_level_click_hour',
    #  Features after 2_1 processing
    #  Statistical features of the user's click counts on a single attribute
     'user_item_id_click_day_mean',
     'user_item_id_click_day_min',
     'user_item_id_click_day_max',
     'user_item_brand_id_click_day_mean',
     'user_item_brand_id_click_day_min',
     'user_item_brand_id_click_day_max',
     'user_shop_id_click_day_mean',
     'user_shop_id_click_day_min',
     'user_shop_id_click_day_max',
     'user_category2_label_click_day_mean',
     'user_category2_label_click_day_min',
     'user_category2_label_click_day_max',
    
    #  Features after 2_2 processing
    #  Statistics of the daily click counts of single features    stats_feature = ['user_id', 'item_id', 'item_brand_id', 'shop_id']
     'user_id_click_day_mean',
     'user_id_click_day_max',
     'user_id_click_day_min',
     'item_id_click_day_mean',
     'item_id_click_day_max',
     'item_id_click_day_min',
     'item_brand_id_click_day_mean',
     'item_brand_id_click_day_max',
     'item_brand_id_click_day_min',
     'shop_id_click_day_mean',
     'shop_id_click_day_max',
     'shop_id_click_day_min',
    
    #  Features after 2_3 processing
    #  User's within-day click time-gap features
     'user_click_rank_day',
#     'user_first_click_day',
#     'user_last_click_day',
     'user_click_interval_first_day',
     'user_click_interval_last_day',
#     'user_click_interval_diff_day',
#     'user_click_interval_prob',
     'time_gap_before',
     'time_gap_after',
     'user_click_true_rank_day',
    #  Features after 2_3 processing
    #  User's global click time-gap features
     'user_click_interval_mean_hour',
     'time_gap_before_total',
     'time_gap_after_total',
    #  Features after 2_3 processing
    #  User-by-attribute global click time-gap features
    #   ['item_id', 'item_brand_id', 'shop_id', 'context_page_id', 'category2_label',]    
#     'user_item_id_first_click',
#     'user_item_id_last_click',
#     'user_item_id_click_rank',
     'user_item_id_first_click_interval',
     'user_item_id_last_click_interval',
#     'user_item_id_diff_click_interval',
#     'user_item_id_prob_click_interval',
     'item_id_time_gap_before',
     'item_id_time_gap_after',
#     'user_item_id_click_true_rank',
#     'user_item_brand_id_first_click',
#     'user_item_brand_id_last_click',
#     'user_item_brand_id_click_rank',
     'user_item_brand_id_first_click_interval',
     'user_item_brand_id_last_click_interval',
#     'user_item_brand_id_diff_click_interval',
#     'user_item_brand_id_prob_click_interval',
     'item_brand_id_time_gap_before',
     'item_brand_id_time_gap_after',
#     'user_item_brand_id_click_true_rank',
#     'user_shop_id_first_click',
#     'user_shop_id_last_click',
#     'user_shop_id_click_rank',
     'user_shop_id_first_click_interval',
     'user_shop_id_last_click_interval',
#     'user_shop_id_diff_click_interval',
#     'user_shop_id_prob_click_interval',
     'shop_id_time_gap_before',
     'shop_id_time_gap_after',
#     'user_shop_id_click_true_rank',
#      'user_context_page_id_first_click',
#      'user_context_page_id_last_click',
#      'user_context_page_id_click_rank',
#      'user_context_page_id_first_click_interval',
#      'user_context_page_id_last_click_interval',
# #     'user_context_page_id_diff_click_interval',
# #     'user_context_page_id_prob_click_interval',
#      'context_page_id_time_gap_before',
#      'context_page_id_time_gap_after',
#      'user_context_page_id_click_true_rank',
#     'user_category2_label_first_click',
#     'user_category2_label_last_click',
#     'user_category2_label_click_rank',
     'user_category2_label_first_click_interval',
     'user_category2_label_last_click_interval',
#     'user_category2_label_diff_click_interval',
#     'user_category2_label_prob_click_interval',
     'category2_label_time_gap_before',
     'category2_label_time_gap_after',
#     'user_category2_label_click_true_rank',
    #  Features after 2_3 processing
    #  User-by-attribute same-day click time-gap features
    #   ['item_id', 'item_brand_id', 'shop_id', 'context_page_id', 'category2_label',]    
#     'user_item_id_first_click_day',
#     'user_item_id_last_click_day',
     'user_item_id_click_rank_day',
     'user_item_id_first_click_interval_day',
     'user_item_id_last_click_interval_day',
#     'user_item_id_diff_click_interval_day',
#     'user_item_id_prob_click_interval_day',
     'item_id_time_gap_before_day',
     'item_id_time_gap_after_day',
     'user_item_id_click_true_rank_day',
#     'user_item_brand_id_first_click_day',
#     'user_item_brand_id_last_click_day',
     'user_item_brand_id_click_rank_day',
     'user_item_brand_id_first_click_interval_day',
     'user_item_brand_id_last_click_interval_day',
#     'user_item_brand_id_diff_click_interval_day',
#     'user_item_brand_id_prob_click_interval_day',
     'item_brand_id_time_gap_before_day',
     'item_brand_id_time_gap_after_day',
     'user_item_brand_id_click_true_rank_day',
#     'user_shop_id_first_click_day',
#     'user_shop_id_last_click_day',
     'user_shop_id_click_rank_day',
     'user_shop_id_first_click_interval_day',
     'user_shop_id_last_click_interval_day',
#     'user_shop_id_diff_click_interval_day',
#     'user_shop_id_prob_click_interval_day',
     'shop_id_time_gap_before_day',
     'shop_id_time_gap_after_day',
     'user_shop_id_click_true_rank_day',
#      'user_context_page_id_first_click_day',
#      'user_context_page_id_last_click_day',
#      'user_context_page_id_click_rank_day',
#      'user_context_page_id_first_click_interval_day',
#      'user_context_page_id_last_click_interval_day',
# #     'user_context_page_id_diff_click_interval_day',
# #     'user_context_page_id_prob_click_interval_day',
#      'context_page_id_time_gap_before_day',
#      'context_page_id_time_gap_after_day',
#      'user_context_page_id_click_true_rank_day',
#     'user_category2_label_first_click_day',
#     'user_category2_label_last_click_day',
     'user_category2_label_click_rank_day',
     'user_category2_label_first_click_interval_day',
     'user_category2_label_last_click_interval_day',
#     'user_category2_label_diff_click_interval_day',
#     'user_category2_label_prob_click_interval_day',
     'category2_label_time_gap_before_day',
     'category2_label_time_gap_after_day',
     'user_category2_label_click_true_rank_day',
    
    #  Features after 2_4 processing
     'property_sim',
     'category_predict_rank',
     'category_3',
    
    #  Features after 2_5 processing
    #  Historical click-through rate per single feature; click counts and click times are to be removed
    #     ['user_id', 'category_predict_rank', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #     'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]    
     'user_id_smooth_CTR',
     'category_predict_rank_smooth_CTR',
     'user_occupation_id_smooth_CTR',
     'user_age_level_smooth_CTR',
     'user_gender_id_smooth_CTR',
     'user_star_level_smooth_CTR',
     'item_id_smooth_CTR',
     'item_brand_id_smooth_CTR',
#     'item_city_id_smooth_CTR',
     'category2_label_smooth_CTR',
     'item_price_level_smooth_CTR',
     'item_sales_level_smooth_CTR',
     'item_collected_level_smooth_CTR',
#     'item_pv_level_smooth_CTR',
     'context_page_id_smooth_CTR',
     'shop_id_smooth_CTR',
#     'shop_review_num_level_smooth_CTR',
#     'shop_star_level_smooth_CTR',
    #  Features after 2_5 processing
    #  Previous-day CTR, previous-day click count and previous-day purchase count per single feature; CTR removed
    #     ['user_id', 'category_predict_rank', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #     'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]
     'user_id_day_I',
     'user_id_day_C',
#     'category_predict_rank_day_I',
#     'category_predict_rank_day_C',
#     'user_occupation_id_day_I',
#     'user_occupation_id_day_C',
#     'user_age_level_day_I',
#     'user_age_level_day_C',
#     'user_gender_id_day_I',
#     'user_gender_id_day_C',
#     'user_star_level_day_I',
#     'user_star_level_day_C',
     'item_id_day_I',
     'item_id_day_C',
     'item_brand_id_day_I',
     'item_brand_id_day_C',
#     'item_city_id_day_I',
#     'item_city_id_day_C',
     'category2_label_day_I',
     'category2_label_day_C',
#     'item_price_level_day_I',
#     'item_price_level_day_C',
#     'item_sales_level_day_I',
#     'item_sales_level_day_C',
#     'item_collected_level_day_I',
#     'item_collected_level_day_C',
#     'item_pv_level_day_I',
#     'item_pv_level_day_C',
#     'context_page_id_day_I',
#     'context_page_id_day_C',
     'shop_id_day_I',
     'shop_id_day_C',
#     'shop_review_num_level_day_I',
#     'shop_review_num_level_day_C',
#     'shop_star_level_day_I',
#     'shop_star_level_day_C',

    #  Features after 2_5 processing
    #  Historical cross-feature CTR, previous-day click count and previous-day purchase count; only CTR kept
    #    ['user_gender_id', 'user_age_level', 'user_occupation_id']
    #    ['item_id', 'item_brand_id', 'shop_id']    

     'user_gender_id_item_id_smooth_CTR',
     'user_gender_id_item_brand_id_smooth_CTR',
     'user_gender_id_shop_id_smooth_CTR',
     'user_gender_id_item_price_level_smooth_CTR',
     'user_age_level_item_id_smooth_CTR',
     'user_age_level_item_brand_id_smooth_CTR',
     'user_age_level_shop_id_smooth_CTR',
     'user_age_level_item_price_level_smooth_CTR',
     'user_occupation_id_item_id_smooth_CTR',
     'user_occupation_id_item_brand_id_smooth_CTR',
     'user_occupation_id_shop_id_smooth_CTR',
     'user_occupation_id_item_price_level_smooth_CTR',
#     'user_star_level_item_id_smooth_CTR',
#     'user_star_level_item_brand_id_smooth_CTR',
#     'user_star_level_shop_id_smooth_CTR',
     'user_star_level_item_price_level_smooth_CTR',

    
    #  Features after 2_6 processing
    #  Group by each of ['shop_id', 'item_id', 'item_brand_id', 'item_price_level']
    #    and compute the click volume under each of the attributes ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
     'shop_id_user_gender_id_click_rate',
     'shop_id_user_age_level_click_rate',
     'shop_id_user_occupation_id_click_rate',
#     'shop_id_user_star_level_click_rate',
     'item_id_user_gender_id_click_rate',
     'item_id_user_age_level_click_rate',
     'item_id_user_occupation_id_click_rate',
#     'item_id_user_star_level_click_rate',
     'item_brand_id_user_gender_id_click_rate',
     'item_brand_id_user_age_level_click_rate',
     'item_brand_id_user_occupation_id_click_rate',
#     'item_brand_id_user_star_level_click_rate',
     'item_price_level_user_gender_id_click_rate',
     'item_price_level_user_age_level_click_rate',
     'item_price_level_user_occupation_id_click_rate',
     'item_price_level_user_star_level_click_rate',
    
    #  Features after 2_7 processing
    #  Click counts per day
    #     ['user_id', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #      'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #      'item_collected_level', 'item_pv_level',
    #      'context_page_id',
    #      'shop_id', 'shop_review_num_level', 'shop_star_level',]
     'user_id_click_day',
     'user_occupation_id_click_day',
     'user_age_level_click_day',
     'user_gender_id_click_day',
     'user_star_level_click_day',
     'item_id_click_day',
     'item_brand_id_click_day',
#     'item_city_id_click_day',
     'category2_label_click_day',
     'item_price_level_click_day',
     'item_sales_level_click_day',
     'item_collected_level_click_day',
#     'item_pv_level_click_day',
     'context_page_id_click_day',
     'shop_id_click_day',
#     'shop_review_num_level_click_day',
#     'shop_star_level_click_day',
    #  Features after 2_7 processing
    #  Click counts per hour of each day
    #     ['user_id', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #      'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #      'item_collected_level', 'item_pv_level',
    #      'context_page_id',
    #      'shop_id', 'shop_review_num_level', 'shop_star_level',]
     'user_id_click_hour_x',
     'user_occupation_id_click_hour_x',
     'user_age_level_click_hour_x',
     'user_gender_id_click_hour_x',
     'user_star_level_click_hour_x',
     'item_id_click_hour_x',
     'item_brand_id_click_hour_x',
#     'item_city_id_click_hour_x',
     'category2_label_click_hour_x',
     'item_price_level_click_hour_x',
     'item_sales_level_click_hour_x',
     'item_collected_level_click_hour_x',
#     'item_pv_level_click_hour_x',
#     'context_page_id_click_hour_x',
     'shop_id_click_hour_x',
#     'shop_review_num_level_click_hour_x',
#     'shop_star_level_click_hour_x',
    #  Features after 2_7 processing
    #  Click counts per hour
    #     ['user_id', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #      'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #      'item_collected_level', 'item_pv_level',
    #      'context_page_id',
    #      'shop_id', 'shop_review_num_level', 'shop_star_level',]
     'user_id_click_hour_y',
     'user_occupation_id_click_hour_y',
     'user_age_level_click_hour_y',
     'user_gender_id_click_hour_y',
#     'user_star_level_click_hour_y',
     'item_id_click_hour_y',
     'item_brand_id_click_hour_y',
#     'item_city_id_click_hour_y',
     'category2_label_click_hour_y',
#     'item_price_level_click_hour_y',
#     'item_sales_level_click_hour_y',
#     'item_collected_level_click_hour_y',
#     'item_pv_level_click_hour_y',
#     'context_page_id_click_hour_y',
     'shop_id_click_hour_y',
#     'shop_review_num_level_click_hour_y',
#     'shop_star_level_click_hour_y'
]
# Display how many features are currently selected (sanity check after editing the list).
len(features)

226

In [18]:
from sklearn.metrics import log_loss
import xgboost as xgb

# Time-based split: fit on days 19-23, hold out day 24 for validation.
train_data = all_data[(all_data.day >= 19) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

dtrain = xgb.DMatrix(train_data[features], train_data[target])
dtest = xgb.DMatrix(test_data[features], test_data[target])

# Early stopping monitors the LAST entry of `evals`, so the hold-out set must stay last.
watchlist = [(dtrain, 'train'), (dtest, 'val')]

# Upper bound on the number of boosting rounds. The native `xgb.train` API takes the
# round count from `num_boost_round`; the sklearn-style 'n_estimators' key that used to
# sit in `params` (with a conflicting value of 2000) is not read by it and was removed.
NUM_BOOST_ROUND = 1000
# Stop when the validation log loss has not improved for this many rounds.
EARLY_STOPPING_ROUNDS = 200

params = {
    'max_depth': 5,
    'eta': 0.02,
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'subsample': 1.0,
    'colsample_bytree': 0.7,
#     'random_state': 1123,
#     'min_child_weight': 10
    #'scale_pos_weight':0.5
}

# NOTE(review): day 24 is used both for early stopping and for the loss reported in the
# next cell, so that loss is an optimistic estimate.
xgb_a = xgb.train(params, dtrain,
                  num_boost_round=NUM_BOOST_ROUND,
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  evals=watchlist,
                  verbose_eval=50)




[0]	train-logloss:0.674789	val-logloss:0.674734
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 200 rounds.
[50]	train-logloss:0.238424	val-logloss:0.236086
[100]	train-logloss:0.128134	val-logloss:0.124244
[150]	train-logloss:0.096399	val-logloss:0.091686
[200]	train-logloss:0.087346	val-logloss:0.082471
[250]	train-logloss:0.084415	val-logloss:0.079883
[300]	train-logloss:0.082987	val-logloss:0.078975
[350]	train-logloss:0.082004	val-logloss:0.078608
[400]	train-logloss:0.081177	val-logloss:0.078427
[450]	train-logloss:0.080461	val-logloss:0.078311
[500]	train-logloss:0.079804	val-logloss:0.078245
[550]	train-logloss:0.079223	val-logloss:0.078234
[600]	train-logloss:0.078648	val-logloss:0.078253
[650]	train-logloss:0.078089	val-logloss:0.07824
[700]	train-logloss:0.077584	val-logloss:0.078235
[750]	train-logloss:0.077127	val-logloss:0.078236
[800]	train-logloss:0.076721	val-logloss:0.078264
[850]	

In [19]:
# Score each split once and reuse the predictions for both the metric and the export
# (the hold-out set used to be predicted twice).
# NOTE(review): depending on the xgboost version, Booster.predict may use every trained
# tree rather than only the trees up to the early-stopping best iteration — confirm.
train_pred = xgb_a.predict(dtrain)
test_pred = xgb_a.predict(dtest)

loss_train = log_loss(train_data[target], train_pred)
loss_test = log_loss(test_data[target], test_pred)

# Build the export as its own frame instead of adding a column to `test_data`, which is a
# slice of `all_data` and triggered pandas' SettingWithCopyWarning.
submission = test_data[['instance_id']].assign(predicted_score=test_pred)
submission.to_csv('24_226_xgboost.txt', index=False, sep=' ')

loss_train, loss_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


(0.07615493807464839, 0.07828151363584884)

In [None]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb

# Time-based split: fit on days 19-23, hold out day 24 for validation.
day = all_data.day
train_data = all_data[(day >= 19) & (day <= 23)]
test_data = all_data[day == 24]

# Hyper-parameters of the classifier, grouped by purpose.
lgb_params = {
    'objective': 'binary',
    # boosting schedule
    'n_estimators': 2000,
    'learning_rate': 0.02,
    # tree shape
    'max_depth': 4,
    'num_leaves': 15,
    'min_child_samples': 70,
    'min_child_weight': 1e-3,
    # column / row sampling
    'colsample_bytree': 0.9,
    'subsample': 0.7,
    'subsample_freq': 1,
    # regularisation
    'reg_lambda': 12,
    'min_split_gain': 0.,
    # runtime
    'n_jobs': -1,
    'silent': False,
}
lgb_clf = lgb.LGBMClassifier(**lgb_params)

# Candidate categorical columns (currently not passed to fit):
#cate_features = ['user_gender_id', 'user_occupation_id', 'hour']

X_train, y_train = train_data[features], train_data[target]
X_valid, y_valid = test_data[features], test_data[target]

# Early stopping is evaluated on the hold-out day; progress is logged every 50 rounds.
fit_options = dict(
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=200,
    feature_name=features,
#     categorical_feature=cate_features,
    verbose=50,
)
lgb_clf.fit(X_train, y_train, **fit_options)

loss_train = log_loss(y_train, lgb_clf.predict_proba(X_train))
loss_test = log_loss(y_valid, lgb_clf.predict_proba(X_valid))


loss_train, loss_test

In [21]:
from sklearn.metrics import log_loss
import lightgbm as lgb

# Time-based split: fit on days 19-22, hold out day 23 for validation.
train_data = all_data[(all_data.day >= 19) & (all_data.day <= 22)]
test_data = all_data[all_data.day == 23]

# NOTE(review): `silent` and the `early_stopping_rounds` / `verbose` arguments of fit()
# are old-style LightGBM options; newer releases expect callbacks instead — confirm
# against the installed version before upgrading.
lgb_clf = lgb.LGBMClassifier(objective='binary',

                             n_estimators=2000,
                             learning_rate=0.02,

                             max_depth=4,
                             num_leaves=15,
                             min_child_samples=100,
                             min_child_weight=1e-3,

                             colsample_bytree=1.0,
                             subsample=0.7,
                             subsample_freq=1,

                             reg_lambda=15,
                             min_split_gain=0.,

                             max_bin=63,

                             n_jobs=-1,
                             silent=False,

                             #device='gpu',
                             gpu_use_dp=False,
                             )

# Early stopping is evaluated on the hold-out day; progress is logged every 50 rounds.
lgb_clf.fit(train_data[features], train_data[target],
            eval_set=[(test_data[features], test_data[target])],
            early_stopping_rounds=200,
            feature_name=features,
#             categorical_feature=cate_features,
            verbose=50,
            )

# Predict the hold-out set once and reuse it for the metric and the export
# (it used to be predicted twice).
test_proba = lgb_clf.predict_proba(test_data[features])

loss_train = log_loss(train_data[target], lgb_clf.predict_proba(train_data[features]))
loss_test = log_loss(test_data[target], test_proba)

# Build the export as its own frame instead of adding a column to `test_data`, which is a
# slice of `all_data` and triggered pandas' SettingWithCopyWarning.
# Column 1 of predict_proba is the probability of the positive class.
submission = test_data[['instance_id']].assign(predicted_score=test_proba[:, 1])
submission.to_csv('23_226.txt', index=False, sep=' ')


loss_train, loss_test

Training until validation scores don't improve for 200 rounds.
[50]	valid_0's binary_logloss: 0.241304
[100]	valid_0's binary_logloss: 0.127046
[150]	valid_0's binary_logloss: 0.0936575
[200]	valid_0's binary_logloss: 0.0840902
[250]	valid_0's binary_logloss: 0.0813023
[300]	valid_0's binary_logloss: 0.0803174
[350]	valid_0's binary_logloss: 0.0798454
[400]	valid_0's binary_logloss: 0.0795709
[450]	valid_0's binary_logloss: 0.0793839
[500]	valid_0's binary_logloss: 0.0792878
[550]	valid_0's binary_logloss: 0.0791922
[600]	valid_0's binary_logloss: 0.0791346
[650]	valid_0's binary_logloss: 0.0790822
[700]	valid_0's binary_logloss: 0.0790708
[750]	valid_0's binary_logloss: 0.0790699
[800]	valid_0's binary_logloss: 0.0790485
[850]	valid_0's binary_logloss: 0.079025
[900]	valid_0's binary_logloss: 0.0790363
[950]	valid_0's binary_logloss: 0.079021
[1000]	valid_0's binary_logloss: 0.0790158
[1050]	valid_0's binary_logloss: 0.0789942
[1100]	valid_0's binary_logloss: 0.0789948
[1150]	valid_0'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(0.08026625494912704, 0.07898206400049905)

In [None]:

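# Experiment log for the LightGBM runs above (all values are log loss).
# NOTE(review): the third column looks like the early-stopping best iteration — confirm.
#
# train_days  val_day  best_iter  train_loss  val_loss    iter:val_loss     iter:val_loss
# 19-22       23       1120       0.0802662   0.078982    950:0.079021      1300:0.079049
#
# 19-23       24       1207       0.0794017   0.077929    900:0.07799       1400:0.0779677
# 20-23       24       900        0.0792304   0.078160    750:0.07819       1100:0.0781925
# 21-23       24       768        0.0782709   0.078262    750:0.07827       950:0.0782948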
        

In [None]:
from sklearn.metrics import log_loss
import lightgbm as lgb

# Value of `is_trade` that marks the rows to be predicted (they carry no real label).
UNLABELED = -2

# Final fit: train on days 19-24, then score the unlabeled rows of day 25.
train_data = all_data[(all_data.day >= 19) & (all_data.day <= 24)]
print(train_data.shape)

# Guard against unlabeled placeholder rows leaking into the training set: the binary
# objective needs 0/1 labels only.
assert train_data[target].isin([0, 1]).all(), \
    'training rows must carry a 0/1 label in %r' % target

test_data = all_data[(all_data.day == 25) & (all_data.is_trade == UNLABELED)]

# No validation set is available here, so the number of trees is fixed instead of
# being chosen by early stopping.
lgb_clf = lgb.LGBMClassifier(objective='binary',

                             n_estimators=1340,
                             learning_rate=0.02,

                             max_depth=4,
                             num_leaves=15,
                             min_child_samples=100,
                             min_child_weight=1e-3,

                             colsample_bytree=1.0,
                             subsample=0.7,
                             subsample_freq=1,

                             reg_lambda=15,
                             min_split_gain=0.,

                             max_bin=63,

                             n_jobs=-1,
                             silent=False,

                             #device='gpu',
                             gpu_use_dp=False,
                             )


lgb_clf.fit(train_data[features], train_data[target],feature_name=features)

loss_train = log_loss(train_data[target], lgb_clf.predict_proba(train_data[features]))

# Build the export as its own frame instead of adding a column to `test_data`, which is a
# slice of `all_data` and triggers pandas' SettingWithCopyWarning.
# Column 1 of predict_proba is the probability of the positive class.
test_scores = lgb_clf.predict_proba(test_data[features])[:, 1]
submission = test_data[['instance_id']].assign(predicted_score=test_scores)
submission.to_csv('20180421_1340.txt', index=False, sep=' ')

loss_train

(399850, 472)


In [None]:
# Rank the features by the importance reported by the most recently fitted LightGBM model.
# The cell used to read `bst`, which is not defined anywhere in the visible notebook and
# raised NameError on a fresh kernel; `lgb_clf` is the model fitted in the cells above.
importance = pd.DataFrame({'importance_20': lgb_clf.feature_importances_})

# Keep `features` a plain list: the training cells index `all_data` with it, so it must
# not be overwritten with a DataFrame here.
feature_names = pd.DataFrame({'features_20': list(features)})

# Use the fully-qualified option name; the bare 'max_rows' pattern can match more than
# one pandas option.
pd.set_option('display.max_rows', 500)

# Both frames share the default 0..n-1 index, so the join pairs each name with its score.
merge = feature_names.join(importance)
merge = merge.sort_values(by=['importance_20'], ascending=False).reset_index()
merge

In [24]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb

# Same split as the day-23 validation run: days 19-22 for training, day 23 held out.
day = all_data.day
train_data = all_data[(day >= 19) & (day <= 22)]
test_data = all_data[day == 23]


def _read_scores(path):
    """Return the 'predicted_score' column of a space-separated prediction file."""
    return pd.read_csv(path, sep=' ')['predicted_score']


# Prediction files; only the two day-23 files are used by the weight search below.
qwcxg_predict = _read_scores("24_226_xgboost.txt")
qwc_predict = _read_scores("25_226.txt")
yym = _read_scores("features-273-depth-5-with_25.txt")
yym_23 = _read_scores("yym_23.txt")
qwc_23 = _read_scores("23_226.txt")

# Try blend weights 0.0, 0.1, ..., 1.0 for yym_23 (the rest goes to qwc_23) and print
# the day-23 log loss of each blend.
for step in range(11):
    yym_weight = 0.1 * step
    qwc_weight = 1 - yym_weight
    blended = yym_23 * yym_weight + qwc_23 * qwc_weight
    loss_test = log_loss(test_data[target], blended)
    print(loss_test)

0.07898206400049905
0.07896555775084205
0.07895825162791338
0.07895952163050796
0.07896894320019182
0.07898621946888613
0.07901114730307535
0.07904359967726714
0.07908351654637066
0.07913090101163017
0.0791858194837445
