In [3]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [4]:
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

# all_data.drop(['user_click_rank_day', 'user_first_click_day', 'user_last_click_day'], axis=1, inplace=True)
# all_data.drop(['user_click_interval_first_day', 'user_click_interval_last_day'], axis=1, inplace=True)


train_data = all_data[(all_data.day >= 19) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

features = list(all_data.columns)
features.remove('is_trade')
features.remove('instance_id')
target = ['is_trade']
len(features)

426

In [8]:
features = [
    # 预处理后的基础特征----------------------------------------------
    'item_id',
    'item_brand_id',
    'item_city_id',
    'item_price_level',
    'item_sales_level',
    'item_collected_level',
    'item_pv_level',
    'user_id',
    'user_gender_id',
    'user_age_level',
    'user_occupation_id',
    'user_star_level',
    'context_id',
    'context_timestamp',
    'context_page_id',
    'shop_id',
    'shop_review_num_level',
    'shop_review_positive_rate',
    'shop_star_level',
    'shop_score_service',
    'shop_score_delivery',
    'shop_score_description',
    'day',
    'hour',
    'minute',
    'category2_label',
    'item_property_list0',
    'item_property_list1',
    'item_property_list2',
    'item_property_list3',
    'item_property_list4',
    'item_property_list5',
    'item_property_list6',
    'item_property_list7',

    #  2_1处理后的特征
    #  生成用户对当天属性的点击量
    #     ['item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]
    'user_item_id_click_day',
    'user_item_id_click_day',
    'user_item_brand_id_click_day',
    'user_item_city_id_click_day',
    'user_category2_label_click_day',
    'user_item_price_level_click_day',
    'user_item_sales_level_click_day',
    'user_item_collected_level_click_day',
    'user_item_pv_level_click_day',
    'user_context_page_id_click_day',
    'user_shop_id_click_day',
    'user_shop_review_num_level_click_day',
    'user_shop_star_level_click_day',
    #  2_1处理后的特征
    #  生成用户对当天当小时属性的点击量
    #     ['item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]
    'user_item_id_click_hour',
    'user_item_brand_id_click_hour',
    'user_item_city_id_click_hour',
    'user_category2_label_click_hour',
    'user_item_price_level_click_hour',
    'user_item_sales_level_click_hour',
    'user_item_collected_level_click_hour',
    'user_item_pv_level_click_hour',
    'user_context_page_id_click_hour',
    'user_shop_id_click_hour',
    'user_shop_review_num_level_click_hour',
    'user_shop_star_level_click_hour',
    #  2_1处理后的特征
    #  生成用户对单一特征点击数据的统计特征
    'user_item_id_click_day_mean',
    'user_item_id_click_day_min',
    'user_item_id_click_day_max',
    'user_item_brand_id_click_day_mean',
    'user_item_brand_id_click_day_min',
    'user_item_brand_id_click_day_max',
    'user_shop_id_click_day_mean',
    'user_shop_id_click_day_min',
    'user_shop_id_click_day_max',
    'user_category2_label_click_day_mean',
    'user_category2_label_click_day_min',
    'user_category2_label_click_day_max',

    #  2_2处理后的特征
    #  生成单一特征，日点击量的统计特征    stats_feature = ['user_id', 'item_id', 'item_brand_id', 'shop_id']
    'user_id_click_day_mean',
    'user_id_click_day_max',
    'user_id_click_day_min',
    'item_id_click_day_mean',
    'item_id_click_day_max',
    'item_id_click_day_min',
    'item_brand_id_click_day_mean',
    'item_brand_id_click_day_max',
    'item_brand_id_click_day_min',
    'shop_id_click_day_mean',
    'shop_id_click_day_max',
    'shop_id_click_day_min',

    #  2_3处理后的特征
    #  生成用户日点击时间差特征
    # 'user_click_rank_day',
    'user_first_click_day',
    'user_last_click_day',
    'user_click_interval_first_day',
    'user_click_interval_last_day',
    # 'user_click_interval_diff_day',
    # 'user_click_interval_prob',
    'time_gap_before',
    'time_gap_after',
    'user_click_true_rank_day',
    #  2_3处理后的特征
    #  生成用户对属性全局点击时间差特征
    #   ['item_id', 'item_brand_id', 'shop_id', 'context_page_id', 'category2_label',]
    'user_item_id_first_click',
    'user_item_id_last_click',
    'user_item_id_click_rank',
    'user_item_id_first_click_interval',
    'user_item_id_last_click_interval',
    # 'user_item_id_diff_click_interval',
    # 'user_item_id_prob_click_interval',
    'item_id_time_gap_before',
    'item_id_time_gap_after',
    'user_item_id_click_true_rank',
    'user_item_brand_id_first_click',
    'user_item_brand_id_last_click',
    'user_item_brand_id_click_rank',
    'user_item_brand_id_first_click_interval',
    'user_item_brand_id_last_click_interval',
    #  'user_item_brand_id_diff_click_interval',
    #  'user_item_brand_id_prob_click_interval',
    'item_brand_id_time_gap_before',
    'item_brand_id_time_gap_after',
    'user_item_brand_id_click_true_rank',
    'user_shop_id_first_click',
    'user_shop_id_last_click',
    'user_shop_id_click_rank',
    'user_shop_id_first_click_interval',
    'user_shop_id_last_click_interval',
    #  'user_shop_id_diff_click_interval',
    #  'user_shop_id_prob_click_interval',
    'shop_id_time_gap_before',
    'shop_id_time_gap_after',
    'user_shop_id_click_true_rank',
    'user_context_page_id_first_click',
    'user_context_page_id_last_click',
    'user_context_page_id_click_rank',
    'user_context_page_id_first_click_interval',
    'user_context_page_id_last_click_interval',
    #  'user_context_page_id_diff_click_interval',
    #  'user_context_page_id_prob_click_interval',
    'context_page_id_time_gap_before',
    'context_page_id_time_gap_after',
    'user_context_page_id_click_true_rank',
    'user_category2_label_first_click',
    'user_category2_label_last_click',
    'user_category2_label_click_rank',
    'user_category2_label_first_click_interval',
    'user_category2_label_last_click_interval',
    #  'user_category2_label_diff_click_interval',
    #  'user_category2_label_prob_click_interval',
    'category2_label_time_gap_before',
    'category2_label_time_gap_after',
    'user_category2_label_click_true_rank',
    #  2_3处理后的特征
    #  生成用户对属性当天点击时间差特征
    #   ['item_id', 'item_brand_id', 'shop_id', 'context_page_id', 'category2_label',]
    'user_item_id_first_click_day',
    'user_item_id_last_click_day',
    'user_item_id_click_rank_day',
    'user_item_id_first_click_interval_day',
    'user_item_id_last_click_interval_day',
    #  'user_item_id_diff_click_interval_day',
    #  'user_item_id_prob_click_interval_day',
    'item_id_time_gap_before_day',
    'item_id_time_gap_after_day',
    'user_item_id_click_true_rank_day',
    'user_item_brand_id_first_click_day',
    'user_item_brand_id_last_click_day',
    'user_item_brand_id_click_rank_day',
    'user_item_brand_id_first_click_interval_day',
    'user_item_brand_id_last_click_interval_day',
    #  'user_item_brand_id_diff_click_interval_day',
    #  'user_item_brand_id_prob_click_interval_day',
    'item_brand_id_time_gap_before_day',
    'item_brand_id_time_gap_after_day',
    'user_item_brand_id_click_true_rank_day',
    'user_shop_id_first_click_day',
    'user_shop_id_last_click_day',
    'user_shop_id_click_rank_day',
    'user_shop_id_first_click_interval_day',
    'user_shop_id_last_click_interval_day',
    #  'user_shop_id_diff_click_interval_day',
    #  'user_shop_id_prob_click_interval_day',
    'shop_id_time_gap_before_day',
    'shop_id_time_gap_after_day',
    'user_shop_id_click_true_rank_day',
    'user_context_page_id_first_click_day',
    'user_context_page_id_last_click_day',
    'user_context_page_id_click_rank_day',
    'user_context_page_id_first_click_interval_day',
    'user_context_page_id_last_click_interval_day',
    #  'user_context_page_id_diff_click_interval_day',
    #  'user_context_page_id_prob_click_interval_day',
    'context_page_id_time_gap_before_day',
    'context_page_id_time_gap_after_day',
    'user_context_page_id_click_true_rank_day',
    'user_category2_label_first_click_day',
    'user_category2_label_last_click_day',
    'user_category2_label_click_rank_day',
    'user_category2_label_first_click_interval_day',
    'user_category2_label_last_click_interval_day',
    #  'user_category2_label_diff_click_interval_day',
    #  'user_category2_label_prob_click_interval_day',
    'category2_label_time_gap_before_day',
    'category2_label_time_gap_after_day',
    'user_category2_label_click_true_rank_day',

    #  2_4处理后的特征
    'property_sim',
    'category_predict_rank',
    'category_3',

    #  2_5处理后的特征
    #  生成单特征历史点击率，要去除点击次数和点击时间
    #     ['user_id', 'category_predict_rank', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #     'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]
    # 'user_id_smooth_I',
    # 'user_id_smooth_C',
    'user_id_smooth_CTR',
    # 'category_predict_rank_smooth_I',
    # 'category_predict_rank_smooth_C',
    'category_predict_rank_smooth_CTR',
    # 'user_occupation_id_smooth_I',
    # 'user_occupation_id_smooth_C',
    'user_occupation_id_smooth_CTR',
    # 'user_age_level_smooth_I',
    # 'user_age_level_smooth_C',
    'user_age_level_smooth_CTR',
    # 'user_gender_id_smooth_I',
    # 'user_gender_id_smooth_C',
    'user_gender_id_smooth_CTR',
    # 'user_star_level_smooth_I',
    # 'user_star_level_smooth_C',
    'user_star_level_smooth_CTR',
    # 'item_id_smooth_I',
    # 'item_id_smooth_C',
    'item_id_smooth_CTR',
    # 'item_brand_id_smooth_I',
    # 'item_brand_id_smooth_C',
    'item_brand_id_smooth_CTR',
    # 'item_city_id_smooth_I',
    # 'item_city_id_smooth_C',
    'item_city_id_smooth_CTR',
    # 'category2_label_smooth_I',
    # 'category2_label_smooth_C',
    'category2_label_smooth_CTR',
    # 'item_price_level_smooth_I',
    # 'item_price_level_smooth_C',
    'item_price_level_smooth_CTR',
    # 'item_sales_level_smooth_I',
    # 'item_sales_level_smooth_C',
    'item_sales_level_smooth_CTR',
    # 'item_collected_level_smooth_I',
    # 'item_collected_level_smooth_C',
    'item_collected_level_smooth_CTR',
    # 'item_pv_level_smooth_I',
    # 'item_pv_level_smooth_C',
    'item_pv_level_smooth_CTR',
    # 'context_page_id_smooth_I',
    # 'context_page_id_smooth_C',
    'context_page_id_smooth_CTR',
    # 'shop_id_smooth_I',
    # 'shop_id_smooth_C',
    'shop_id_smooth_CTR',
    # 'shop_review_num_level_smooth_I',
    # 'shop_review_num_level_smooth_C',
    'shop_review_num_level_smooth_CTR',
    # 'shop_star_level_smooth_I',
    # 'shop_star_level_smooth_C',
    'shop_star_level_smooth_CTR',
    #  2_5处理后的特征
    #  生成单特征前一天点击率，前一天的点击次数，前一天的购买次数， 去除ctr
    #     ['user_id', 'category_predict_rank', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #     'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #     'item_collected_level', 'item_pv_level',
    #     'context_page_id',
    #     'shop_id', 'shop_review_num_level', 'shop_star_level',]
    'user_id_day_I',
    'user_id_day_C',
    # 'user_id_CTR',
    'category_predict_rank_day_I',
    'category_predict_rank_day_C',
    # 'category_predict_rank_CTR',
    'user_occupation_id_day_I',
    'user_occupation_id_day_C',
    # 'user_occupation_id_CTR',
    'user_age_level_day_I',
    'user_age_level_day_C',
    # 'user_age_level_CTR',
    'user_gender_id_day_I',
    'user_gender_id_day_C',
    # 'user_gender_id_CTR',
    'user_star_level_day_I',
    'user_star_level_day_C',
    # 'user_star_level_CTR',
    'item_id_day_I',
    'item_id_day_C',
    # 'item_id_CTR',
    'item_brand_id_day_I',
    'item_brand_id_day_C',
    # 'item_brand_id_CTR',
    'item_city_id_day_I',
    'item_city_id_day_C',
    # 'item_city_id_CTR',
    'category2_label_day_I',
    'category2_label_day_C',
    # 'category2_label_CTR',
    'item_price_level_day_I',
    'item_price_level_day_C',
    # 'item_price_level_CTR',
    'item_sales_level_day_I',
    'item_sales_level_day_C',
    # 'item_sales_level_CTR',
    'item_collected_level_day_I',
    'item_collected_level_day_C',
    # 'item_collected_level_CTR',
    'item_pv_level_day_I',
    'item_pv_level_day_C',
    # 'item_pv_level_CTR',
    'context_page_id_day_I',
    'context_page_id_day_C',
    # 'context_page_id_CTR',
    'shop_id_day_I',
    'shop_id_day_C',
    # 'shop_id_CTR',
    'shop_review_num_level_day_I',
    'shop_review_num_level_day_C',
    # 'shop_review_num_level_CTR',
    'shop_star_level_day_I',
    'shop_star_level_day_C',
    # 'shop_star_level_CTR',

    #  2_5处理后的特征
    #  user_id前一天点击某某某的数量
    #  ['item_id', 'item_brand_id', 'shop_id', 'category2_label',]
    'user_id_item_id_day_I',
    'user_id_item_id_day_C',
    'user_id_item_brand_id_day_I',
    'user_id_item_brand_id_day_C',
    'user_id_shop_id_day_I',
    'user_id_shop_id_day_C',
    'user_id_category2_label_day_I',
    'user_id_category2_label_day_C',

    #  2_5处理后的特征
    #  生成历史交叉点击率，前一天的点击次数，前一天的购买次数，只保留ctr
    #    ['user_gender_id', 'user_age_level', 'user_occupation_id']
    #    ['item_id', 'item_brand_id', 'shop_id']


    # 'user_gender_id_item_id_smooth_I',
    # 'user_gender_id_item_id_smooth_C',
    'user_gender_id_item_id_smooth_CTR',
    # 'user_gender_id_item_brand_id_smooth_I',
    # 'user_gender_id_item_brand_id_smooth_C',
    'user_gender_id_item_brand_id_smooth_CTR',
    # 'user_gender_id_shop_id_smooth_I',
    # 'user_gender_id_shop_id_smooth_C',
    'user_gender_id_shop_id_smooth_CTR',
    # 'user_gender_id_item_price_level_smooth_I',
    # 'user_gender_id_item_price_level_smooth_C',
#    'user_gender_id_item_price_level_smooth_CTR',
    # 'user_age_level_item_id_smooth_I',
    # 'user_age_level_item_id_smooth_C',
    'user_age_level_item_id_smooth_CTR',
    # 'user_age_level_item_brand_id_smooth_I',
    # 'user_age_level_item_brand_id_smooth_C',
    'user_age_level_item_brand_id_smooth_CTR',
    # 'user_age_level_shop_id_smooth_I',
    # 'user_age_level_shop_id_smooth_C',
    'user_age_level_shop_id_smooth_CTR',
    # 'user_age_level_item_price_level_smooth_I',
    # 'user_age_level_item_price_level_smooth_C',
#    'user_age_level_item_price_level_smooth_CTR',
    # 'user_occupation_id_item_id_smooth_I',
    # 'user_occupation_id_item_id_smooth_C',
    'user_occupation_id_item_id_smooth_CTR',
    # 'user_occupation_id_item_brand_id_smooth_I',
    # 'user_occupation_id_item_brand_id_smooth_C',
    'user_occupation_id_item_brand_id_smooth_CTR',
    # 'user_occupation_id_shop_id_smooth_I',
    # 'user_occupation_id_shop_id_smooth_C',
    'user_occupation_id_shop_id_smooth_CTR',
    # 'user_occupation_id_item_price_level_smooth_I',
    # 'user_occupation_id_item_price_level_smooth_C',
#    'user_occupation_id_item_price_level_smooth_CTR',
    # 'user_star_level_item_id_smooth_I',
    # 'user_star_level_item_id_smooth_C',
#    'user_star_level_item_id_smooth_CTR',
    # 'user_star_level_item_brand_id_smooth_I',
    # 'user_star_level_item_brand_id_smooth_C',
#    'user_star_level_item_brand_id_smooth_CTR',
    # 'user_star_level_shop_id_smooth_I',
    # 'user_star_level_shop_id_smooth_C',
#    'user_star_level_shop_id_smooth_CTR',
    # 'user_star_level_item_price_level_smooth_I',
    # 'user_star_level_item_price_level_smooth_C',
#    'user_star_level_item_price_level_smooth_CTR',



    #  2_6处理后的特征
    #  分别groupby['shop_id', 'item_id', 'item_brand_id', 'item_price_level']
    #    计算item在['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']几个属性下的点击量
    'shop_id_user_gender_id_click_rate',
    'shop_id_user_age_level_click_rate',
    'shop_id_user_occupation_id_click_rate',
    'shop_id_user_star_level_click_rate',
    'item_id_user_gender_id_click_rate',
    'item_id_user_age_level_click_rate',
    'item_id_user_occupation_id_click_rate',
    'item_id_user_star_level_click_rate',
    'item_brand_id_user_gender_id_click_rate',
    'item_brand_id_user_age_level_click_rate',
    'item_brand_id_user_occupation_id_click_rate',
    'item_brand_id_user_star_level_click_rate',
    'item_price_level_user_gender_id_click_rate',
    'item_price_level_user_age_level_click_rate',
    'item_price_level_user_occupation_id_click_rate',
    'item_price_level_user_star_level_click_rate',

    #  2_7处理后的特征
    #  计算每天的点击量
    #     ['user_id', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #      'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #      'item_collected_level', 'item_pv_level',
    #      'context_page_id',
    #      'shop_id', 'shop_review_num_level', 'shop_star_level',]
    'user_id_click_day',
    'user_occupation_id_click_day',
    'user_age_level_click_day',
    'user_gender_id_click_day',
    'user_star_level_click_day',
    'item_id_click_day',
    'item_brand_id_click_day',
    'item_city_id_click_day',
    'category2_label_click_day',
    'item_price_level_click_day',
    'item_sales_level_click_day',
    'item_collected_level_click_day',
    'item_pv_level_click_day',
    'context_page_id_click_day',
    'shop_id_click_day',
    'shop_review_num_level_click_day',
    'shop_star_level_click_day',
    #  2_7处理后的特征
    #  计算每天每小时的点击量
    #     ['user_id', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #      'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #      'item_collected_level', 'item_pv_level',
    #      'context_page_id',
    #      'shop_id', 'shop_review_num_level', 'shop_star_level',]
    'user_id_click_hour_x',
    'user_occupation_id_click_hour_x',
    'user_age_level_click_hour_x',
    'user_gender_id_click_hour_x',
    'user_star_level_click_hour_x',
    'item_id_click_hour_x',
    'item_brand_id_click_hour_x',
    'item_city_id_click_hour_x',
    'category2_label_click_hour_x',
    'item_price_level_click_hour_x',
    'item_sales_level_click_hour_x',
    'item_collected_level_click_hour_x',
    'item_pv_level_click_hour_x',
    'context_page_id_click_hour_x',
    'shop_id_click_hour_x',
    'shop_review_num_level_click_hour_x',
    'shop_star_level_click_hour_x',
    #  2_7处理后的特征
    #  计算每小时的点击量
    #     ['user_id', 'user_occupation_id', 'user_age_level', 'user_gender_id', 'user_star_level',
    #      'item_id', 'item_brand_id', 'item_city_id', 'category2_label','item_price_level','item_sales_level',
    #      'item_collected_level', 'item_pv_level',
    #      'context_page_id',
    #      'shop_id', 'shop_review_num_level', 'shop_star_level',]
    'user_id_click_hour_y',
    'user_occupation_id_click_hour_y',
    'user_age_level_click_hour_y',
    'user_gender_id_click_hour_y',
    'user_star_level_click_hour_y',
    'item_id_click_hour_y',
    'item_brand_id_click_hour_y',
    'item_city_id_click_hour_y',
    'category2_label_click_hour_y',
    'item_price_level_click_hour_y',
    'item_sales_level_click_hour_y',
    'item_collected_level_click_hour_y',
    'item_pv_level_click_hour_y',
    'context_page_id_click_hour_y',
    'shop_id_click_hour_y',
    'shop_review_num_level_click_hour_y',
    'shop_star_level_click_hour_y']


In [9]:
all_data[features].shape
# features

(496482, 311)

In [10]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb

train_data = all_data[(all_data.day >= 19) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

lgb_clf = lgb.LGBMClassifier(objective='binary',

                             n_estimators=2000,
                             learning_rate=0.02,

                             max_depth=4,
                             num_leaves=15,
                             min_child_samples=70,
                             min_child_weight=1e-3,

                             colsample_bytree=0.9,
                             subsample=0.7,
                             subsample_freq=1,

                             reg_lambda=10,
                             min_split_gain=0.,

                             n_jobs=-1,
                             silent=False
                             )


cate_features = ['user_gender_id', 'user_occupation_id', 'hour']

lgb_clf.fit(train_data[features], train_data[target],
            eval_set=[(test_data[features], test_data[target])],
            early_stopping_rounds=100,
            feature_name=features,
#             categorical_feature=cate_features,
            verbose=50,
            )

loss_train = log_loss(train_data[target],
                      lgb_clf.predict_proba(train_data[features]))
loss_test = log_loss(
    test_data[target], lgb_clf.predict_proba(test_data[features]))



loss_train, loss_test

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Training until validation scores don't improve for 100 rounds.
[50]	valid_0's binary_logloss: 0.240174
[100]	valid_0's binary_logloss: 0.125672
[150]	valid_0's binary_logloss: 0.0924428
[200]	valid_0's binary_logloss: 0.0831214
[250]	valid_0's binary_logloss: 0.0804786
[300]	valid_0's binary_logloss: 0.079536
[350]	valid_0's binary_logloss: 0.0791161
[400]	valid_0's binary_logloss: 0.078903
[450]	valid_0's binary_logloss: 0.0787356
[500]	valid_0's binary_logloss: 0.0786414
[550]	valid_0's binary_logloss: 0.0785863
[600]	valid_0's binary_logloss: 0.0785452
[650]	valid_0's binary_logloss: 0.0785254
[700]	valid_0's binary_logloss: 0.0784814
[750]	valid_0's binary_logloss: 0.0784615
[800]	valid_0's binary_logloss: 0.0784523
[850]	valid_0's binary_logloss: 0.0784584
Early stopping, best iteration is:
[797]	valid_0's binary_logloss: 0.0784503


(0.08109315354097049, 0.07845031181587162)

In [None]:
importance = pd.DataFrame(gbm.feature_importance())
importance.columns = ['importance_20']
features = pd.DataFrame(features)
features.columns = ['features_20']
pd.set_option('max_rows',500)
merge = features.join(importance)
merge = merge.sort_values(by=['importance_20'], ascending=False).reset_index()
merge

In [None]:
dump_pickle(merge, feature_data_path+'importance_20.pkl')

In [None]:
merge24 = load_pickle(feature_data_path+'importance_24.pkl')
merge24 = merge24.rename(columns={'index':'index24'})
merge23 = load_pickle(feature_data_path+'importance_23.pkl')
merge23 = merge23.rename(columns={'index':'index23'})
merge22 = load_pickle(feature_data_path+'importance_22.pkl')
merge22 = merge22.rename(columns={'index':'index22'})
merge20 = load_pickle(feature_data_path+'importance_20.pkl')
merge20 = merge20.rename(columns={'index':'index20'})
merge = merge24.join(merge23)
merge = merge.join(merge22)
merge = merge.join(merge20)

writer = pd.ExcelWriter('merge.xlsx')
merge.to_excel(writer,'Sheet1')
writer.save()

In [None]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', max_depth=5, learning_rate=0.01,
                        n_estimators=3000, objective='binary', 
                        subsample=0.9, subsample_freq=5, colsample_bytree=0.9, 
                         n_jobs=-1, silent=True)
cate_features = ['user_gender_id', 'user_occupation_id']
lgb_clf.fit(train_data[features], train_data['is_trade'],
          eval_set=[(test_data[features],test_data[target])],
          early_stopping_rounds=20,
          feature_name=features,
          categorical_feature=cate_features,
          verbose=20,
          )
loss_train = log_loss(train_data[target], lgb_clf.predict_proba(train_data[features]))
loss_test = log_loss(test_data[target], lgb_clf.predict_proba(test_data[features]))

loss_train, loss_test

\## 提交结果

In [7]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb

train_data = all_data[(all_data.day >= 19) & (all_data.day <= 24)]
test_data = all_data[all_data.day == 25]

lgb_train = lgb.Dataset(train_data[features], train_data['is_trade'])
lgb_test = lgb.Dataset(test_data[features], test_data['is_trade'], reference=lgb_train)
cate_features = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    #'num_leaves': 31,
    'learning_rate': 0.01,
     'feature_fraction': 0.9,
     'bagging_fraction': 0.9,
     'bagging_freq': 5,
     'verbose': 0,
    'lambda_l2':10,
    'max_depth':4
}

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1234,
                valid_sets=[lgb_train],
                valid_names = ['train'] ,
                categorical_feature=cate_features,
                early_stopping_rounds=20,
                verbose_eval = 5)

loss_train = log_loss(train_data[target], gbm.predict(train_data[features]))

test_data['predicted_score'] = gbm.predict(test_data[features])

test_data[['instance_id', 'predicted_score']].to_csv('20180413.txt', index=False, sep=' ')

loss_train

Start training...




Training until validation scores don't improve for 20 rounds.
[5]	train's binary_logloss: 0.648793
[10]	train's binary_logloss: 0.608513
[15]	train's binary_logloss: 0.571806
[20]	train's binary_logloss: 0.538214
[25]	train's binary_logloss: 0.507408
[30]	train's binary_logloss: 0.479077
[35]	train's binary_logloss: 0.452949
[40]	train's binary_logloss: 0.428815
[45]	train's binary_logloss: 0.406484
[50]	train's binary_logloss: 0.385781
[55]	train's binary_logloss: 0.366556
[60]	train's binary_logloss: 0.348681
[65]	train's binary_logloss: 0.332045
[70]	train's binary_logloss: 0.316545
[75]	train's binary_logloss: 0.302079
[80]	train's binary_logloss: 0.288573
[85]	train's binary_logloss: 0.275961
[90]	train's binary_logloss: 0.264162
[95]	train's binary_logloss: 0.253129
[100]	train's binary_logloss: 0.242792
[105]	train's binary_logloss: 0.233107
[110]	train's binary_logloss: 0.224039
[115]	train's binary_logloss: 0.215527
[120]	train's binary_logloss: 0.207548
[125]	train's binary_l

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.08153040041657691

# 提交结果先用这个

In [None]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb
train_data = all_data[(all_data.day >= 19) & (all_data.day <= 24)]
print(train_data.shape)
test_data = all_data[all_data.day == 25]
cate_features = ['user_gender_id', 'user_occupation_id',]
clf = LGBMClassifier(n_estimators=200, max_depth=3)
                     
clf.fit(train_data[features], train_data['is_trade'],
          feature_name=features,
          categorical_feature=cate_features,
          )

loss_train = log_loss(train_data[target], clf.predict_proba(train_data[features]))

test_data['predicted_score'] = clf.predict_proba(test_data[features])[:, 1]

test_data[['instance_id', 'predicted_score']].to_csv(
    '20180410.txt', index=False, sep=' ')

loss_train