In [1]:
%matplotlib inline

import gc
import os
import sys
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns
from scipy.stats import mode
import pickle

from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score
import platform
from collections import Counter
from sklearn.metrics import recall_score, matthews_corrcoef
from lightgbm import LGBMClassifier, LGBMRegressor, log_evaluation
from lightgbm import early_stopping
from sklearn.linear_model import LogisticRegression

import matplotlib
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.metrics import recall_score, precision_score

from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize

#from utils import prepare_df, transform_datetime_features

# Render every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# If you have CUDA, it will be needed for the catboost experiments.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# task_type is 'GPU' everywhere except on the host named 'VLAD2016'.
task_type = 'CPU' if platform.node() == 'VLAD2016' else 'GPU'

import warnings

# Ignore these warning categories wholesale (registered in this order).
for _category in (FutureWarning, DeprecationWarning, UserWarning, RuntimeWarning):
    warnings.simplefilter(action = 'ignore', category = _category)

# Additionally ignore these two numpy messages, matched by their text.
for _message in ("numpy.dtype size changed", "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message = _message)

# Disable the pandas chained-assignment check.
pd.options.mode.chained_assignment = None

# Notebook-wide constants: debug switch, the two target columns and a version tag.
DEBUG = False
target = 'time_to_under'
target2 = 'label'
ver = '03-reg'

In [2]:
# Read the pickled training data from the working directory into `dtrain`.
# NOTE(review): unpickling can execute code stored in the file, so
# 'dtrain.pkl' must come from a trusted source.
with open('dtrain.pkl', mode = 'rb') as train_file:
    dtrain = pickle.load(train_file)

In [3]:
# Display the column labels of the loaded training data.
dtrain.columns

Index(['id', 'ticket_id', 'entrance_id', 'entrance_nm', 'station_id',
       'station_nm', 'line_id', 'pass_dttm', 'time_to_under', 'label',
       ...
       '19_count_hour', '20_count_hour', '21_count_hour', '22_count_hour',
       '23_count_hour', '0_count_hour', '1_count_hour', '2_count_hour',
       '3_count_hour', '4_count_hour'],
      dtype='object', length=127)

In [8]:
# Feature list: every column of `dtrain` except 'id', 'ticket_id',
# 'pass_dttm' and the two target columns.
# NOTE(review): the recorded count (129) is larger than the 127 columns shown
# by the earlier `dtrain.columns` output, and the execution counter jumps from
# 3 to 8 -- the frame was probably modified by cells that are no longer in the
# notebook; confirm with Restart & Run All.
excluded_columns = {'id', 'ticket_id', 'pass_dttm', target, target2}
use = [column for column in dtrain.columns if column not in excluded_columns]
len(use)

129

In [9]:
# Hold out 10% of the rows for validation, stratified on the `target2` column,
# with a fixed random_state so the split is repeatable.
# Both parts keep all columns of `dtrain` (targets included); get_score picks
# features and target out of them by column label.
stratify_labels = dtrain[target2]
X_train, X_val = train_test_split(
    dtrain, test_size = 0.1, random_state = 1, stratify = stratify_labels
)

In [16]:
def get_score(use2, seeds = 1):
    """Fit LightGBM classifiers on a feature subset and report the validation loss.

    One ``LGBMClassifier`` is trained per seed on ``X_train[use2]`` against
    ``X_train[target2]``. Training stops early once the loss on the ``X_val``
    hold-out has not improved for 10 rounds. ``X_train``, ``X_val`` and
    ``target2`` are read from the notebook's global namespace.

    Parameters
    ----------
    use2 : list
        Column labels to use as features.
    seeds : int, default 1
        Number of models to train; model ``i`` uses ``random_state = i``.

    Returns
    -------
    models : list
        The fitted classifiers, one per seed, in seed order.
    score : float
        Mean over all seeds of the best validation ``'multi_logloss'``.
        With the default single seed this is that model's score.

    Raises
    ------
    ValueError
        If ``seeds`` is smaller than 1 (there would be no score to report).
    """
    if seeds < 1:
        raise ValueError(f'seeds must be at least 1, got {seeds!r}')

    # Hyper-parameters shared by every seed; random_state is supplied per model.
    params = {
        'max_depth': 5,
        'num_leaves': 31,
        'subsample': 0.98,
        'subsample_freq': 3,
        'colsample_bytree': 0.98,
        'reg_alpha': 0.01,
        'reg_lambda': 0.1,
        'n_estimators': 1000,
    }

    models = []
    scores = []
    for seed in range(seeds):
        model = LGBMClassifier(random_state = seed, **params)
        model.fit(
            X_train[use2],
            X_train[target2],
            # Documented form is a list of (X, y) pairs; LightGBM names the first one 'valid_0'.
            eval_set = [(X_val[use2], X_val[target2])],
            callbacks = [early_stopping(10, verbose = 1), log_evaluation(1)],
        )
        models.append(model)
        # Record each model's own score so the result reflects all seeds,
        # not only the last model trained.
        scores.append(model.best_score_['valid_0']['multi_logloss'])

    return models, sum(scores) / len(scores)

In [17]:
# Train on the full feature list `use`; keep the fitted models and display the
# validation multi_logloss returned by get_score.
models, best = get_score(use)
best

[1]	valid_0's multi_logloss: 5.15975
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 5.20643
[3]	valid_0's multi_logloss: 5.22487
[4]	valid_0's multi_logloss: 5.12956
[5]	valid_0's multi_logloss: 5.14375
[6]	valid_0's multi_logloss: 5.10786
[7]	valid_0's multi_logloss: 5.10076
[8]	valid_0's multi_logloss: 5.08569
[9]	valid_0's multi_logloss: 5.11778
[10]	valid_0's multi_logloss: 5.05303
[11]	valid_0's multi_logloss: 5.06429
[12]	valid_0's multi_logloss: 5.10037
[13]	valid_0's multi_logloss: 5.09983
[14]	valid_0's multi_logloss: 5.10231
[15]	valid_0's multi_logloss: 5.10234
[16]	valid_0's multi_logloss: 5.10666
[17]	valid_0's multi_logloss: 5.09051
[18]	valid_0's multi_logloss: 5.13931
[19]	valid_0's multi_logloss: 5.24149
[20]	valid_0's multi_logloss: 5.32564
Early stopping, best iteration is:
[10]	valid_0's multi_logloss: 5.05303


5.053031524071723

In [19]:
# Save the fitted models to 'models-clf.pkl' together with `remove` (empty
# here) and `use2`, a copy of the feature list the models were trained on.
remove = list()
use2 = list(use)
with open('models-clf.pkl', mode = 'wb') as out_file:
    pickle.dump([models, remove, use2], out_file)

https://lk.hacks-ai.ru/758300/champ