In [1]:
%matplotlib inline

import gc
import os
import sys
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns
from scipy.stats import mode
import pickle

from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score
import platform
from collections import Counter
from sklearn.metrics import recall_score, matthews_corrcoef
from lightgbm import LGBMClassifier, LGBMRegressor, log_evaluation
from lightgbm import early_stopping
from sklearn.linear_model import LogisticRegression

import matplotlib
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.metrics import recall_score, precision_score
    
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action = 'ignore', category = UserWarning)
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None

# Debug switch; not read by any of the cells shown here.
DEBUG = False
# Regression target column: the value get_score fits the LGBMRegressor on.
target = 'time_to_under'
# Secondary label column: used to stratify the train/validation split and
# excluded from the feature list.
target2 = 'label'
# Experiment/version tag; not read by any of the cells shown here.
ver = '05-reg'

In [2]:
# Load the training frame from disk.
# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load a 'dtrain.pkl' that comes from a trusted source.
with open('dtrain.pkl', mode='rb') as train_file:
    dtrain = pickle.load(train_file)

In [3]:
# Display the column index to see which fields the loaded frame provides.
dtrain.columns

Index(['id', 'ticket_id', 'entrance_id', 'entrance_nm', 'station_id',
       'station_nm', 'line_id', 'pass_dttm', 'time_to_under', 'label',
       ...
       '19_count_hour', '20_count_hour', '21_count_hour', '22_count_hour',
       '23_count_hour', '0_count_hour', '1_count_hour', '2_count_hour',
       '3_count_hour', '4_count_hour'],
      dtype='object', length=127)

In [9]:
# Feature list: every column except the identifiers, the timestamp and the
# two target columns, kept in the frame's original column order.
# NOTE(review): the saved output of this cell (129) is larger than the
# 127-column frame displayed earlier could produce, and the execution counts
# jump from 3 to 9 -- dtrain was presumably modified by cells that are no
# longer in the notebook. Re-run from a fresh kernel to confirm.
excluded_columns = {'id', 'ticket_id', 'pass_dttm', target, target2}
use = [column for column in dtrain.columns if column not in excluded_columns]
len(use)

129

In [10]:
# Hold out 10% of the rows for validation, stratified on the secondary label
# column so both parts keep the same label proportions. The split frames keep
# every column (features and targets); columns are selected later at fit time.
strata = dtrain[target2]
X_train, X_val = train_test_split(dtrain, test_size=0.1, random_state=1, stratify=strata)

In [19]:
def get_score(use2, seeds = 1):
    """Fit LightGBM regressors on the given features and report validation L2.

    One LGBMRegressor is trained per seed (random_state = 0 .. seeds - 1) on
    the notebook-level ``X_train`` frame, predicting the ``target`` column.
    ``X_val`` is used as the evaluation set: training stops early once the
    validation metric has not improved for 500 rounds, and progress is logged
    every 200 rounds.

    Note that the same validation frame drives early stopping and produces the
    reported score, so the score is not an independent hold-out estimate.

    Parameters
    ----------
    use2 : list
        Column names to use as model features.
    seeds : int, default 1
        Number of models to train, each with a different ``random_state``.

    Returns
    -------
    models : list
        The fitted LGBMRegressor instances, one per seed.
    score : float
        Best validation L2 ('valid_0' / 'l2'), averaged over all seeds. With
        the default single seed this is that model's best validation L2.

    Raises
    ------
    ValueError
        If ``seeds`` is smaller than 1 (there would be no model to score).
    """
    if seeds < 1:
        raise ValueError('seeds must be at least 1')

    # Hyper-parameters shared by every seed; random_state is supplied per
    # model below so this dict is never mutated.
    base_params = {
        'max_depth': 15,
        'num_leaves': 31,
        'subsample': 0.98,
        'subsample_freq': 3,
        'colsample_bytree': 0.98,
        'reg_alpha': 0.01,
        'reg_lambda': 0.1,
        'n_estimators': 10000,
    }

    models = []
    scores = []
    for seed in range(seeds):
        model = LGBMRegressor(**base_params, random_state = seed)
        model.fit(
            X_train[use2],
            X_train[target],
            eval_set = [(X_val[use2], X_val[target])],
            callbacks = [early_stopping(500, verbose = 0), log_evaluation(200)],
        )
        models.append(model)
        # Record every model's score, not just the last one trained.
        scores.append(dict(model.best_score_)['valid_0']['l2'])

    return models, sum(scores) / len(scores)

In [20]:
# Train on the full feature list; `best` is the validation L2 that get_score
# returns alongside the fitted models.
models, best = get_score(use)
best

[200]	valid_0's l2: 23084.8
[400]	valid_0's l2: 22976.3
[600]	valid_0's l2: 22897.3
[800]	valid_0's l2: 22840.1
[1000]	valid_0's l2: 22802.3
[1200]	valid_0's l2: 22763.3
[1400]	valid_0's l2: 22735.1
[1600]	valid_0's l2: 22717.9
[1800]	valid_0's l2: 22699.4
[2000]	valid_0's l2: 22680.9
[2200]	valid_0's l2: 22668.8
[2400]	valid_0's l2: 22653.6
[2600]	valid_0's l2: 22643.5
[2800]	valid_0's l2: 22633.9
[3000]	valid_0's l2: 22629.2
[3200]	valid_0's l2: 22627.8
[3400]	valid_0's l2: 22621.8
[3600]	valid_0's l2: 22618.3
[3800]	valid_0's l2: 22617.5
[4000]	valid_0's l2: 22615.2
[4200]	valid_0's l2: 22615.9
[4400]	valid_0's l2: 22617.2


22612.057694628984

In [21]:
# Persist the fitted models together with the feature bookkeeping:
# `remove` (presumably features dropped during selection; empty here) and
# `use2`, a copy of the feature list the models were trained on.
remove = []
use2 = list(use)
with open('models-reg.pkl', mode='wb') as out_file:
    pickle.dump([models, remove, use2], out_file)

https://lk.hacks-ai.ru/758300/champ