In [1]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

from sklearn.metrics import log_loss
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

In [7]:
def CustomCV(data,):
    """Yield one hour-based (train, test) split usable as a ``cv`` iterable.

    Rows with ``1 < hour < 11`` form the training fold and rows with
    ``hour >= 11`` form the test fold; rows with ``hour <= 1`` land in
    neither fold.

    The yielded arrays hold *positional* row numbers rather than index
    labels, because scikit-learn applies the indices from a ``cv`` iterable
    positionally. For a frame with the default 0..n-1 index the values are
    the same as the labels, so existing callers that reset the index first
    see no difference.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame exposing an ``hour`` column.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Positional indices of the training rows and of the test rows.
    """
    hour = data.hour
    train_mask = ((hour < 11) & (hour > 1)).values
    test_mask = (hour >= 11).values
    # flatnonzero turns each boolean mask into row positions, which stay
    # valid whatever the frame's index labels happen to be.
    yield np.flatnonzero(train_mask), np.flatnonzero(test_mask)
    
    
def CustomCV_6_7(data,):
    """Yield one (train, test) split spanning days 6 and 7.

    The training fold is every row of day 6 plus the rows of day 7 with
    ``hour < 11``; the test fold is the rows of day 7 with ``hour >= 11``.
    Rows from any other day land in neither fold.

    The yielded arrays hold *positional* row numbers rather than index
    labels, because scikit-learn applies the indices from a ``cv`` iterable
    positionally. For a frame with the default 0..n-1 index the values are
    the same as the labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame exposing ``day`` and ``hour`` columns.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        Positional indices of the training rows and of the test rows.
    """
    is_day_7 = data.day == 7
    before_cutoff = data.hour < 11
    train_mask = ((is_day_7 & before_cutoff) | (data.day == 6)).values
    test_mask = (is_day_7 & (data.hour >= 11)).values
    # flatnonzero turns each boolean mask into row positions, which stay
    # valid whatever the frame's index labels happen to be.
    yield np.flatnonzero(train_mask), np.flatnonzero(test_mask)

In [8]:
# Name of the label column used as the training target.
target = 'is_trade'

# Feature-name lists; these pickle paths are relative to the working directory.
features, categorical_feature = (
    load_pickle(pkl_name)
    for pkl_name in ('all_features_day_7.pkl', 'categorical_feature.pkl')
)

# Full table with every engineered feature, stored under feature_data_path.
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

# Show how many features, and how many categorical ones, were loaded.
tuple(map(len, (features, categorical_feature)))

(261, 2)

In [10]:
if __name__ == '__main__':
    # Grid-search LightGBM hyper-parameters on the day-7 rows using the single
    # hour-based split from CustomCV, then pickle the resulting cv_results_.

    # To search on days 6 and 7 instead, select
    #   ((all_data.day == 7) | (all_data.day == 6)) & (all_data.is_trade != -1)
    # below and pass cv=CustomCV_6_7(data) to GridSearchCV.

    # Keep day-7 rows whose label is not -1
    # (presumably -1 marks unlabelled rows -- TODO confirm).
    data = all_data[(all_data.day == 7) & (all_data.is_trade != -1)]
    # Renumber rows 0..n-1 so the indices yielded by CustomCV line up with
    # row positions, which is how scikit-learn applies them.
    data = data.reset_index()

    # Release the full table before training to reduce memory pressure.
    # NOTE(review): this removes the notebook-level `all_data`, so re-running
    # this cell requires re-running the loading cell first.
    del all_data
    gc.collect()

    # Hold-out rows monitored by LightGBM for early stopping.
    # NOTE(review): these are the same rows as the CustomCV test fold
    # (hour >= 11), so the cross-validation score is measured on the data that
    # also picked the stopping round and is likely optimistic.
    eval_data = data[(data.day == 7) & (data.hour >= 11)]
    eval_set = [(eval_data[features], eval_data[target])]

    lgb_clf = lgb.LGBMClassifier(objective='binary', device='gpu',  n_jobs=4, silent=False)

    # Parameter combinations to search; single-element tuples pin a value,
    # so only min_child_samples and subsample are actually varied.
    lgb_param_grad = {'n_estimators': (2000, ),
                      'learning_rate': (0.03, ),

                      'max_depth': (5, ),
                      'num_leaves': (20, ),
                      'min_child_samples': (100, 50, 20),
                      'min_child_weight': (0.001, ),
                      'min_split_gain': (0.0, ),

                      'colsample_bytree': (0.9,),
                      'subsample': (0.7, 0.8, 0.6),
                      'subsample_freq': (1,),

                      'reg_lambda': (10, ),

                      'max_bin': (63, ),

                      'gpu_use_dp': (True, ),
                      }

    # refit=False: no final model is trained on the full data; only the
    # per-candidate scores are wanted.
    clf = GridSearchCV(lgb_clf, param_grid=lgb_param_grad, scoring='neg_log_loss',
                       cv=CustomCV(data), n_jobs=4, verbose=1, refit=False, return_train_score=True)

    # Extra keyword arguments are forwarded to LGBMClassifier.fit for every
    # candidate.
    clf.fit(data[features], data[target],
            feature_name=features,
            categorical_feature=categorical_feature,
            early_stopping_rounds=300, eval_set=eval_set, verbose=50
           )

    print('=====')
    print("Best parameters set found on development set:")
    print(clf.best_params_)

    print('=====')
    # This banner previously repeated the "Best parameters" text even though
    # the value printed below is the best score (negative log-loss).
    print("Best score found on development set:")
    print(clf.best_score_)

    # Persist the full results table so it can be inspected later without
    # repeating the search.
    dump_pickle(clf.cv_results_, '0511_grid_search_depth_5_leaves_20.pkl')


Fitting 1 folds for each of 6 candidates, totalling 6 fits




Training until validation scores don't improve for 300 rounds.




Training until validation scores don't improve for 300 rounds.
Training until validation scores don't improve for 300 rounds.
[50]	valid_0's binary_logloss: 0.182822
Training until validation scores don't improve for 300 rounds.
[50]	valid_0's binary_logloss: 0.182822
[50]	valid_0's binary_logloss: 0.182822
[100]	valid_0's binary_logloss: 0.166324
[50]	valid_0's binary_logloss: 0.182865
[100]	valid_0's binary_logloss: 0.166323
[150]	valid_0's binary_logloss: 0.164455
[100]	valid_0's binary_logloss: 0.166302
[150]	valid_0's binary_logloss: 0.164455
[100]	valid_0's binary_logloss: 0.166314
[150]	valid_0's binary_logloss: 0.164446
[200]	valid_0's binary_logloss: 0.163694
[200]	valid_0's binary_logloss: 0.163675
[150]	valid_0's binary_logloss: 0.16447
[250]	valid_0's binary_logloss: 0.163196
[200]	valid_0's binary_logloss: 0.163658
[250]	valid_0's binary_logloss: 0.163198
[200]	valid_0's binary_logloss: 0.163687
[300]	valid_0's binary_logloss: 0.16285
[250]	valid_0's binary_logloss: 0.1631



Training until validation scores don't improve for 300 rounds.
Training until validation scores don't improve for 300 rounds.
[50]	valid_0's binary_logloss: 0.182865
[100]	valid_0's binary_logloss: 0.166314
[50]	valid_0's binary_logloss: 0.182865
[150]	valid_0's binary_logloss: 0.164473
[100]	valid_0's binary_logloss: 0.166314
[200]	valid_0's binary_logloss: 0.163699
[150]	valid_0's binary_logloss: 0.164461
[250]	valid_0's binary_logloss: 0.163202
[200]	valid_0's binary_logloss: 0.163698
[300]	valid_0's binary_logloss: 0.162889
[250]	valid_0's binary_logloss: 0.163224
[350]	valid_0's binary_logloss: 0.162631
[300]	valid_0's binary_logloss: 0.162903
[400]	valid_0's binary_logloss: 0.162468
[350]	valid_0's binary_logloss: 0.162655
[450]	valid_0's binary_logloss: 0.162335
[400]	valid_0's binary_logloss: 0.162485
[500]	valid_0's binary_logloss: 0.162236
[450]	valid_0's binary_logloss: 0.162371
[550]	valid_0's binary_logloss: 0.162109
[500]	valid_0's binary_logloss: 0.162267
[600]	valid_0's

[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed: 13.7min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed: 13.7min finished


=====
Best parameters set found on development set:
{'colsample_bytree': 0.8, 'gpu_use_dp': True, 'learning_rate': 0.05, 'max_bin': 63, 'max_depth': 3, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.1, 'n_estimators': 2000, 'num_leaves': 18, 'reg_lambda': 10, 'subsample': 0.7, 'subsample_freq': 1}
=====
Best parameters set found on development set:
-0.16136358768696385


In [2]:
# Columns worth comparing across candidates: ranking, test/train scores and
# the hyper-parameter values of each candidate.
_summary_columns = [
    'rank_test_score', 'mean_test_score', 'mean_train_score',
    'param_reg_lambda', 'param_min_child_weight',
    'param_min_child_samples', 'param_num_leaves',
    'param_subsample', 'param_colsample_bytree',
    'param_min_split_gain', 'param_subsample_freq',
    'param_max_bin', 'param_gpu_use_dp',
]

# Results of a previously saved search
# (presumably a pickled GridSearchCV cv_results_ dict -- TODO confirm).
gird = load_pickle('0511_grid_search_leaves_20_nan_yym.pkl')

# The same view can be built from a live search by passing clf.cv_results_
# to pd.DataFrame instead of the pickled object.
pd.DataFrame(gird).loc[:, _summary_columns]



Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_reg_lambda,param_min_child_weight,param_min_child_samples,param_num_leaves,param_subsample,param_colsample_bytree,param_min_split_gain,param_subsample_freq,param_max_bin,param_gpu_use_dp
0,21,-0.160736,-0.157921,10,0.001,200,20,0.7,0.9,0.0,1,63,True
1,31,-0.160801,-0.159299,10,0.001,200,20,0.8,0.9,0.0,1,63,True
2,9,-0.160698,-0.15932,10,0.001,200,20,0.7,0.9,0.1,1,63,True
3,10,-0.160703,-0.158634,10,0.001,200,20,0.8,0.9,0.1,1,63,True
4,8,-0.160687,-0.160235,10,0.001,100,20,0.7,0.9,0.0,1,63,True
5,5,-0.160677,-0.158563,10,0.001,100,20,0.8,0.9,0.0,1,63,True
6,12,-0.160717,-0.15756,10,0.001,100,20,0.7,0.9,0.1,1,63,True
7,19,-0.160732,-0.158674,10,0.001,100,20,0.8,0.9,0.1,1,63,True
8,14,-0.16072,-0.159634,10,0.001,50,20,0.7,0.9,0.0,1,63,True
9,11,-0.160711,-0.158485,10,0.001,50,20,0.8,0.9,0.0,1,63,True
