In [1]:
class CFG:
    """Experiment-level settings for the stacking run."""

    # Names of the upstream experiments whose out-of-fold predictions are stacked.
    exps = [
        '026',
        'deberta-v3-large-e1_exp032',
    ]
    n_fold = 4       # number of cross-validation folds
    seed = 42        # seed passed to LightGBM through `params`
    debug = False    # debug switch (not referenced in the visible cells)

# ====================================================
# Colab settings
# ====================================================
class ColabConfig:
    """Settings for running on Colab and publishing the results as a Kaggle dataset."""

    # Dataset identity: "<dataset_name>-<dataset_version>" is used as the dataset id.
    dataset_name = 'PPPM-stacking'
    dataset_version = f"exp{'-'.join(CFG.exps)}"

    dataset_new = True    # whether this is a brand-new dataset
    dataset_dir = None    # directory to upload to Kaggle Datasets (assigned after output_dir is created)
    dataset_note = '""'   # changes since the previous version

    in_colab = False      # download the input data onto the Colab machine itself

In [2]:
# Report the CUDA compiler, PyTorch and Python versions of this runtime.
!nvcc --version
!python -c 'import torch; print(torch.__version__) '
!python --version
print('')
# Show GPU status; nvidia-smi prints an error when no NVIDIA driver is reachable.
!nvidia-smi
print('')
# Report system memory in gigabytes (1e9 bytes).
# NOTE: the value printed is virtual_memory().total, i.e. total RAM, even though
# the message calls it "available".
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
1.11.0+cu113
Python 3.7.13

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.


Your runtime has 13.6 gigabytes of available RAM



In [3]:
%%time
import sys
COLAB = "google.colab" in sys.modules

if COLAB:
    import os
    print('This environment is Google Colab')

    # mount drive
    from google.colab import drive
    if not os.path.isdir('/content/drive'):
        drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/kaggle/PPPM/exps')

    # kaggle api token and update kaggle api
    from google.colab import files
    if not os.path.isfile('~/.kaggle/kaggle.json'):
        # files.upload()
        !mkdir -p ~/.kaggle
        !cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
        !pip install --upgrade --force-reinstall --no-deps -q kaggle

    if ColabConfig.in_colab:
        # make directory in colab
        !mkdir -p /content/input
        !mkdir -p /content/working

        # download dataset in colab
        import zipfile, glob
        os.chdir('/content/input')
        # !kaggle competitions download -qc birdclef-2022 -p birdclef-2022
        
        # !mkdir birdclef-2022
        # !cp /content/drive/MyDrive/kaggle/BirdCLEF2022/input/birdclef-2022/* ./birdclef-2022/

        for p in glob.glob('**/*.zip', recursive=True):
            print(p)
            d, f = os.path.split(p)
            # if f in ['']:
            #     continue
            with zipfile.ZipFile(p, 'r') as zipf:
                print('unzip: ', zipf)
                zipf.extractall(d)
                print('remove: ', f)
                os.remove(p)
        
        os.chdir('/content/working')

This environment is Google Colab
Mounted at /content/drive
[K     |████████████████████████████████| 58 kB 4.0 MB/s 
[?25h  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
CPU times: user 2.57 s, sys: 502 ms, total: 3.08 s
Wall time: 28.6 s


In [4]:
# Imports and runtime package installation for the whole notebook.
import os
import random
import pickle
from pathlib import Path
from psutil import cpu_count
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# pickle5 is installed at runtime and imported.
# NOTE(review): nothing in the visible cells references `pickle5` — confirm it is still needed.
os.system('python -m pip install -U pickle5')
import pickle5

import scipy as sp
import numpy as np
import pandas as pd
# Display wide frames in full: no column limit, very wide lines, untruncated cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 10000)
pd.set_option('display.max_colwidth', None)
from tqdm.auto import tqdm
# Registers the progress_apply / progress_map methods on pandas objects.
tqdm.pandas()
# Upgrade LightGBM to the latest release before importing it (version is not pinned).
os.system('python -m pip install -U lightgbm')
import lightgbm as lgb
print(f"lightgbm.__version__: {lgb.__version__}")
# Optuna's LightGBM integration provides the step-wise tuner used in tuning_params.
os.system('python -m pip install optuna')
import optuna.integration.lightgbm as lgb_optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
print(f"torch.__version__: {torch.__version__}")

# Replace the preinstalled transformers/tokenizers with the wheels stored in
# ../input/pppm-pip-wheels (installed offline via --no-index).
os.system('python -m pip install sentencepiece')
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# Sets the TOKENIZERS_PARALLELISM environment variable for this kernel.
%env TOKENIZERS_PARALLELISM=true

lightgbm.__version__: 3.3.2
torch.__version__: 1.11.0+cu113
tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [5]:
# Competition data directory, resolved relative to the current working directory.
input_dir = Path('../input/us-patent-phrase-to-phrase-matching')
# All artifacts of this run (encoders, feature list, models) are written under a
# Drive folder named after the stacked experiments.
output_dir = Path(f"/content/drive/MyDrive/kaggle/PPPM/output/stacking{'_'.join(CFG.exps)}")
# Only the leaf directory is created; the parent output folder must already exist.
output_dir.mkdir(exist_ok=True)

# The same folder is what gets uploaded as the Kaggle dataset.
ColabConfig.dataset_dir = str(output_dir)

In [6]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    Args:
        y_true: 1-D array-like of ground-truth scores.
        y_pred: 1-D array-like of predicted scores, same length as ``y_true``.

    Returns:
        The correlation coefficient (first element of ``pearsonr``'s result).
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not
    # guarantee that `sp.stats` is loaded on SciPy releases without lazy
    # submodule loading, so `sp.stats.pearsonr` only worked when another
    # package had already imported scipy.stats.
    from scipy.stats import pearsonr

    return pearsonr(y_true, y_pred)[0]


def get_logger(filename=output_dir / 'train'):
    """Return a logger that writes bare messages to the console and to a file.

    Args:
        filename: Path of the log file without extension; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under this module's ``__name__``,
        set to INFO level, with one stream handler and one file handler.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so re-running this
    # function used to stack another pair of handlers and duplicate every log
    # line. Drop (and close) the handlers from earlier calls first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # Console first, then file; both emit the message text only.
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

# LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each seeder takes the seed as its only argument and is independent of the others.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

In [7]:
# Feature engineering
def tok_len(sentence, tokenizer):
    """Return how many tokens ``tokenizer.tokenize`` produces for ``sentence``."""
    tokens = tokenizer.tokenize(sentence)
    return len(tokens)

def tokenizer_feature_engineering(df, tokenizer):
    """Build token-length features for the anchor, target and context text.

    The input frame is left untouched. The previous implementation added the
    feature columns to ``df`` in place, so calling it a second time on the same
    frame treated them as pre-existing columns and returned only ``id``.

    Args:
        df: DataFrame with the columns ``id``, ``anchor``, ``target`` and
            ``context_text``.
        tokenizer: Object exposing ``tokenize(str) -> sequence of tokens``.

    Returns:
        A new DataFrame, indexed like ``df``, holding the ten length features
        followed by the ``id`` column.
    """
    def _tok_len(sentence):
        return len(tokenizer.tokenize(sentence))

    anchor_len = df['anchor'].map(_tok_len)
    target_len = df['target'].map(_tok_len)
    context_len = df['context_text'].map(_tok_len)

    features = pd.DataFrame(
        {
            'anchor_tok_len': anchor_len,
            'target_tok_len': target_len,
            'context_tok_len': context_len,
            # +4 presumably accounts for the special tokens of the model input — TODO confirm
            'input_len': anchor_len + target_len + context_len + 4,

            'len_anc_tgt_diff': anchor_len - target_len,
            'len_anc_tgt_div': anchor_len / target_len,

            'len_anc_cnt_diff': anchor_len - context_len,
            'len_anc_cnt_div': anchor_len / context_len,

            'len_tgt_cnt_diff': target_len - context_len,
            'len_tgt_cnt_div': target_len / context_len,
        },
        index=df.index,
    )

    # Return only the created features plus the id column (used as the merge key).
    features['id'] = df['id']
    return features

In [8]:
def make_ensemble_datasets(cfg):
    """Assemble the stacking training table from each experiment's OOF output.

    For every experiment in ``cfg.exps`` the out-of-fold frame and tokenizer are
    loaded from that experiment's output folder, token-length features are
    computed, and both the features and the OOF prediction are left-joined onto
    ``train.csv`` by ``id``. Prints the best and the mean per-experiment score.

    Returns:
        The training frame with ``<feature>_<exp>`` and ``pred_<exp>`` columns
        added for every experiment.
    """
    # Experiments whose OOF frame is stored as CSV; all others are pickles.
    csv_exps = ['deberta-v3-large-e1_exp032', 'PatentSBERTa-exp035']

    merged = pd.read_csv(input_dir / 'train.csv')
    exp_scores = []

    for exp in cfg.exps:
        exp_dir = Path(f'/content/drive/MyDrive/kaggle/PPPM/output/{exp}')
        oof_df = (
            pd.read_csv(exp_dir / 'oof_df.csv')
            if exp in csv_exps
            else pd.read_pickle(exp_dir / 'oof_df.pkl')
        )
        tokenizer = AutoTokenizer.from_pretrained(exp_dir / 'tokenizer')

        # Feature and prediction columns are suffixed with the experiment
        # name, giving names such as 'pred_000'.
        feature_df = tokenizer_feature_engineering(oof_df, tokenizer)
        suffixed = {col: col+'_'+exp for col in feature_df.columns if col != 'id'}
        feature_df = feature_df.rename(columns=suffixed)

        pred_col = f'pred_{exp}'
        pred_df = oof_df[['id', 'pred']].rename(columns={'pred': pred_col})

        merged = merged.merge(feature_df, on='id', how='left')
        merged = merged.merge(pred_df, on='id', how='left')

        exp_scores.append(get_score(merged['score'], merged[pred_col]))

    print(f'max score: {max(exp_scores)}')
    print(f'avg score: {sum(exp_scores) / len(exp_scores)}')
    return merged

In [9]:
# Display the installed pandas version.
pd.__version__

'1.3.5'

In [10]:
# Build the stacking table (one feature set and one OOF prediction per experiment)
# and preview the first rows.
train = make_ensemble_datasets(CFG)
display(train.head())

max score: 0.832151085616115
avg score: 0.8237038950893069


Unnamed: 0,id,anchor,target,context,score,anchor_tok_len_026,target_tok_len_026,context_tok_len_026,input_len_026,len_anc_tgt_diff_026,len_anc_tgt_div_026,len_anc_cnt_diff_026,len_anc_cnt_div_026,len_tgt_cnt_diff_026,len_tgt_cnt_div_026,pred_026,anchor_tok_len_deberta-v3-large-e1_exp032,target_tok_len_deberta-v3-large-e1_exp032,context_tok_len_deberta-v3-large-e1_exp032,input_len_deberta-v3-large-e1_exp032,len_anc_tgt_diff_deberta-v3-large-e1_exp032,len_anc_tgt_div_deberta-v3-large-e1_exp032,len_anc_cnt_diff_deberta-v3-large-e1_exp032,len_anc_cnt_div_deberta-v3-large-e1_exp032,len_tgt_cnt_diff_deberta-v3-large-e1_exp032,len_tgt_cnt_div_deberta-v3-large-e1_exp032,pred_deberta-v3-large-e1_exp032
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.440476,1,3,29,37,-2,0.333333,-28,0.034483,-26,0.103448,0.358203
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.641294,1,5,29,39,-4,0.2,-28,0.034483,-24,0.172414,0.610718
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.247038,1,2,29,36,-1,0.5,-28,0.034483,-27,0.068966,0.258612
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.494176,1,2,29,36,-1,0.5,-28,0.034483,-27,0.068966,0.401216
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.011766,1,2,29,36,-1,0.5,-28,0.034483,-27,0.068966,0.047901


In [11]:
# foldに分ける
# credits to: https://www.kaggle.com/code/abhishek/creating-folds-properly-hopefully-p

!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train = train.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
print(train.fold.value_counts())

550 183
549 184
550 183
550 183
3    9622
0    9379
1    8860
2    8612
Name: fold, dtype: int64


In [12]:
# Encode categorical variables.
# `section` is the first character of the context code.
train['section'] = train['context'].map(lambda s: s[0])
cat_cols = ['section']
for c in cat_cols:
    # Fit one LabelEncoder per categorical column and store the integer codes
    # in a '<column>_encoded' column.
    le = LabelEncoder()
    le.fit(train[c])
    train[f'{c}_encoded'] = le.transform(train[c])
    # Persist the fitted encoder next to the other run artifacts.
    with open(output_dir / f'{c}_encoder.pkl', 'wb') as f:
        pickle.dump(le, f)

In [13]:
# List every column of the assembled training table.
print(train.columns)

Index(['id', 'anchor', 'target', 'context', 'score', 'anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_deberta-v3-large-e1_exp032', 'target_tok_len_deberta-v3-large-e1_exp032', 'context_tok_len_deberta-v3-large-e1_exp032', 'input_len_deberta-v3-large-e1_exp032', 'len_anc_tgt_diff_deberta-v3-large-e1_exp032', 'len_anc_tgt_div_deberta-v3-large-e1_exp032', 'len_anc_cnt_diff_deberta-v3-large-e1_exp032', 'len_anc_cnt_div_deberta-v3-large-e1_exp032', 'len_tgt_cnt_diff_deberta-v3-large-e1_exp032', 'len_tgt_cnt_div_deberta-v3-large-e1_exp032', 'pred_deberta-v3-large-e1_exp032', 'fold', 'section', 'section_encoded'], dtype='object')


In [14]:
# Model features: every column except identifiers, raw text, the target,
# the fold id and the un-encoded category.
excluded_columns = ['id', 'anchor', 'target', 'context', 'score', 'fold', 'section']
cols = [col for col in train.columns if col not in excluded_columns]
# Persist the feature list (and its order) alongside the other run artifacts.
with open(output_dir / 'cols.pkl', 'wb') as f:
    pickle.dump(cols, f)
print(cols)

['anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_deberta-v3-large-e1_exp032', 'target_tok_len_deberta-v3-large-e1_exp032', 'context_tok_len_deberta-v3-large-e1_exp032', 'input_len_deberta-v3-large-e1_exp032', 'len_anc_tgt_diff_deberta-v3-large-e1_exp032', 'len_anc_tgt_div_deberta-v3-large-e1_exp032', 'len_anc_cnt_diff_deberta-v3-large-e1_exp032', 'len_anc_cnt_div_deberta-v3-large-e1_exp032', 'len_tgt_cnt_diff_deberta-v3-large-e1_exp032', 'len_tgt_cnt_div_deberta-v3-large-e1_exp032', 'pred_deberta-v3-large-e1_exp032', 'section_encoded']


In [15]:
# Base LightGBM parameters; used as the starting point for the Optuna tuner.
params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'num_iterations': 1000,
    'learning_rate': 0.01,
    # 'num_leaves': 20,  # LightGBM default: 31
    # Number of physical cores.
    # NOTE(review): psutil's cpu_count(logical=False) can return None on some platforms — confirm for the target runtime.
    'num_threads': cpu_count(logical=False),
    'seed': CFG.seed,

    # 'max_depth': 3,  # LightGBM default: -1
    # 'min_data_in_leaf': 100,  # LightGBM default: 20
    'verbosity': -1,  # LightGBM default: 1

    'metric': 'l2'
}

In [16]:
def tuning_params(df, params, val_prop=0.25, seed=71):
    """Tune LightGBM hyper-parameters with Optuna on one anchor-grouped split.

    Anchors are shuffled with a seeded generator and the first ``val_prop``
    share of them forms the validation set, so no anchor appears on both sides
    of the split. Relies on the notebook-level ``cols`` (feature list) and
    ``cat_cols`` (categorical column names).

    Args:
        df: Training frame containing ``anchor``, ``score`` and all ``cols``.
        params: Base LightGBM parameters handed to the tuner.
        val_prop: Fraction of anchors held out for validation. The default
            0.25 matches one fold of the 4-fold setup.
        seed: Seed for the anchor shuffle.

    Returns:
        The parameter dict (``model.params``) of the booster returned by the tuner.

    Raises:
        ValueError: If ``val_prop`` is not strictly between 0 and 1.
    """
    if not 0 < val_prop < 1:
        raise ValueError(f'val_prop must be in (0, 1), got {val_prop}')

    anchors = df['anchor'].unique()
    rng = np.random.default_rng(seed=seed)
    rng.shuffle(anchors)
    val_size = int(len(anchors)*val_prop)
    val_anchors = anchors[:val_size]

    # Row-level mask: True for rows whose anchor was drawn into validation.
    is_val = np.isin(df['anchor'], val_anchors)
    tr_df = df[~is_val]
    val_df = df[is_val]
    print(f'train: {len(tr_df)}, val: {len(val_df)}')

    cat_cols_ = [f'{c}_encoded' for c in cat_cols]
    tr_ds = lgb.Dataset(tr_df[cols], tr_df['score'], categorical_feature=cat_cols_)
    val_ds = lgb.Dataset(val_df[cols], val_df['score'], categorical_feature=cat_cols_, reference=tr_ds)

    model = lgb_optuna.train(
        params,
        tr_ds,
        valid_sets=[tr_ds, val_ds],
        callbacks=[lgb.early_stopping(stopping_rounds=100),
                    lgb.log_evaluation(period=100)],
        verbosity=-1,
    )

    best_params = model.params

    return best_params

In [None]:
# Run the Optuna search on the full training table and show the selected parameters.
best_params = tuning_params(train, params)
display(best_params)

train: 27261, val: 9212


[32m[I 2022-06-20 09:19:19,531][0m A new study created in memory with name: no-name-334595c9-4581-4b7d-adfa-d12d566d0445[0m
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0662933	valid_1's l2: 0.0643284
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.0653651	valid_1's l2: 0.0634327
[3]	valid_0's l2: 0.0644558	valid_1's l2: 0.0625555
[4]	valid_0's l2: 0.0635645	valid_1's l2: 0.0616964
[5]	valid_0's l2: 0.0626905	valid_1's l2: 0.060852
[6]	valid_0's l2: 0.0618343	valid_1's l2: 0.0600252
[7]	valid_0's l2: 0.0609953	valid_1's l2: 0.0592157
[8]	valid_0's l2: 0.0601721	valid_1's l2: 0.0584213
[9]	valid_0's l2: 0.0593729	valid_1's l2: 0.0576468
[10]	valid_0's l2: 0.0585822	valid_1's l2: 0.0568838
[11]	valid_0's l2: 0.0578418	valid_1's l2: 0.0561734
[12]	valid_0's l2: 0.057081	valid_1's l2: 0.0554418
[13]	valid_0's l2: 0.0563354	valid_1's l2: 0.054724
[14]	valid_0's l2: 0.055604	valid_1's l2: 0.05402
[15]	valid_0's l2: 0.054888	valid_1's l2: 0.053332
[16]	valid_0's l2: 0.0541855	valid_1's l2: 0.0526548
[17]	valid_0's l2: 0.0534972	valid_1's l2: 0.0519929
[18]	valid_0's l2: 0.0528221	valid_1's l2: 0.0513437
[

feature_fraction, val_score: 0.019794:  14%|#4        | 1/7 [00:06<00:40,  6.73s/it]

[1]	valid_0's l2: 0.0663024	valid_1's l2: 0.0643318
[2]	valid_0's l2: 0.0653745	valid_1's l2: 0.0634347
[3]	valid_0's l2: 0.0644738	valid_1's l2: 0.0625619
[4]	valid_0's l2: 0.0635913	valid_1's l2: 0.0617061
[5]	valid_0's l2: 0.0627573	valid_1's l2: 0.060904
[6]	valid_0's l2: 0.0626595	valid_1's l2: 0.0608342
[7]	valid_0's l2: 0.0618438	valid_1's l2: 0.0600505
[8]	valid_0's l2: 0.0610117	valid_1's l2: 0.0592434
[9]	valid_0's l2: 0.0609188	valid_1's l2: 0.0591769
[10]	valid_0's l2: 0.0600966	valid_1's l2: 0.0583841
[11]	valid_0's l2: 0.0593288	valid_1's l2: 0.0576463
[12]	valid_0's l2: 0.0585453	valid_1's l2: 0.0568862
[13]	valid_0's l2: 0.0584515	valid_1's l2: 0.0568157
[14]	valid_0's l2: 0.057715	valid_1's l2: 0.0561074
[15]	valid_0's l2: 0.0569556	valid_1's l2: 0.0553755
[16]	valid_0's l2: 0.0568739	valid_1's l2: 0.0553167
[17]	valid_0's l2: 0.0567942	valid_1's l2: 0.055258
[18]	valid_0's l2: 0.0567141	valid_1's l2: 0.0551997
[19]	valid_0's l2: 0.0566352	valid_1's l2: 0.0551443
[20]	

feature_fraction, val_score: 0.019794:  29%|##8       | 2/7 [00:13<00:35,  7.03s/it]

[1]	valid_0's l2: 0.0663024	valid_1's l2: 0.0643312
[2]	valid_0's l2: 0.0653744	valid_1's l2: 0.0634344
[3]	valid_0's l2: 0.0644736	valid_1's l2: 0.0625606
[4]	valid_0's l2: 0.063591	valid_1's l2: 0.0617044
[5]	valid_0's l2: 0.0627166	valid_1's l2: 0.0608608
[6]	valid_0's l2: 0.0618596	valid_1's l2: 0.0600333
[7]	valid_0's l2: 0.0610579	valid_1's l2: 0.0592632
[8]	valid_0's l2: 0.0602335	valid_1's l2: 0.0584679
[9]	valid_0's l2: 0.0601374	valid_1's l2: 0.0583969
[10]	valid_0's l2: 0.059331	valid_1's l2: 0.0576185
[11]	valid_0's l2: 0.0585769	valid_1's l2: 0.0568944
[12]	valid_0's l2: 0.0578085	valid_1's l2: 0.0561495
[13]	valid_0's l2: 0.0570834	valid_1's l2: 0.0554536
[14]	valid_0's l2: 0.0563727	valid_1's l2: 0.0547725
[15]	valid_0's l2: 0.0556402	valid_1's l2: 0.0540672
[16]	valid_0's l2: 0.0549281	valid_1's l2: 0.0533777
[17]	valid_0's l2: 0.0548464	valid_1's l2: 0.0533188
[18]	valid_0's l2: 0.0547655	valid_1's l2: 0.0532604
[19]	valid_0's l2: 0.0546807	valid_1's l2: 0.0531982
[20]

feature_fraction, val_score: 0.019794:  43%|####2     | 3/7 [00:23<00:32,  8.21s/it]

[1]	valid_0's l2: 0.0662933	valid_1's l2: 0.0643284
[2]	valid_0's l2: 0.0653651	valid_1's l2: 0.0634327
[3]	valid_0's l2: 0.0644558	valid_1's l2: 0.0625555
[4]	valid_0's l2: 0.0635727	valid_1's l2: 0.0616989
[5]	valid_0's l2: 0.0626987	valid_1's l2: 0.0608553
[6]	valid_0's l2: 0.0618419	valid_1's l2: 0.0600277
[7]	valid_0's l2: 0.0610409	valid_1's l2: 0.0592591
[8]	valid_0's l2: 0.0602167	valid_1's l2: 0.0584652
[9]	valid_0's l2: 0.0594164	valid_1's l2: 0.0576897
[10]	valid_0's l2: 0.0586244	valid_1's l2: 0.0569254
[11]	valid_0's l2: 0.0578838	valid_1's l2: 0.0562147
[12]	valid_0's l2: 0.0571219	valid_1's l2: 0.0554812
[13]	valid_0's l2: 0.0564102	valid_1's l2: 0.0547987
[14]	valid_0's l2: 0.0556775	valid_1's l2: 0.0540932
[15]	valid_0's l2: 0.0549596	valid_1's l2: 0.0534031
[16]	valid_0's l2: 0.0542555	valid_1's l2: 0.0527259
[17]	valid_0's l2: 0.054172	valid_1's l2: 0.0526643
[18]	valid_0's l2: 0.0535161	valid_1's l2: 0.0520359
[19]	valid_0's l2: 0.0528732	valid_1's l2: 0.0514187
[20

feature_fraction, val_score: 0.019785:  57%|#####7    | 4/7 [00:30<00:23,  7.76s/it]

[1]	valid_0's l2: 0.0663024	valid_1's l2: 0.0643312
[2]	valid_0's l2: 0.0653744	valid_1's l2: 0.0634344
[3]	valid_0's l2: 0.0644649	valid_1's l2: 0.0625561
[4]	valid_0's l2: 0.0635818	valid_1's l2: 0.0616996
[5]	valid_0's l2: 0.0627078	valid_1's l2: 0.0608555
[6]	valid_0's l2: 0.061851	valid_1's l2: 0.0600291
[7]	valid_0's l2: 0.0610496	valid_1's l2: 0.05926
[8]	valid_0's l2: 0.0602251	valid_1's l2: 0.0584659
[9]	valid_0's l2: 0.0601291	valid_1's l2: 0.0583948
[10]	valid_0's l2: 0.0593225	valid_1's l2: 0.0576172
[11]	valid_0's l2: 0.0585688	valid_1's l2: 0.0568936
[12]	valid_0's l2: 0.0577932	valid_1's l2: 0.0561471
[13]	valid_0's l2: 0.0570689	valid_1's l2: 0.0554522
[14]	valid_0's l2: 0.056359	valid_1's l2: 0.0547707
[15]	valid_0's l2: 0.0556268	valid_1's l2: 0.0540655
[16]	valid_0's l2: 0.0549146	valid_1's l2: 0.0533759
[17]	valid_0's l2: 0.0548315	valid_1's l2: 0.0533169
[18]	valid_0's l2: 0.054164	valid_1's l2: 0.0526758
[19]	valid_0's l2: 0.0535094	valid_1's l2: 0.0520485
[20]	va

feature_fraction, val_score: 0.019775:  71%|#######1  | 5/7 [00:35<00:13,  6.60s/it]

[588]	valid_0's l2: 0.0179794	valid_1's l2: 0.0197855
[589]	valid_0's l2: 0.0179772	valid_1's l2: 0.0197851
[590]	valid_0's l2: 0.017976	valid_1's l2: 0.0197849
[591]	valid_0's l2: 0.0179738	valid_1's l2: 0.0197852
[592]	valid_0's l2: 0.017972	valid_1's l2: 0.0197852
[593]	valid_0's l2: 0.0179701	valid_1's l2: 0.0197858
[594]	valid_0's l2: 0.0179686	valid_1's l2: 0.0197866
[595]	valid_0's l2: 0.0179661	valid_1's l2: 0.0197859
[596]	valid_0's l2: 0.017964	valid_1's l2: 0.0197864
Early stopping, best iteration is:
[496]	valid_0's l2: 0.0181864	valid_1's l2: 0.0197754
[1]	valid_0's l2: 0.0663023	valid_1's l2: 0.0643317
[2]	valid_0's l2: 0.0653743	valid_1's l2: 0.0634349
[3]	valid_0's l2: 0.0644648	valid_1's l2: 0.0625567
[4]	valid_0's l2: 0.0635817	valid_1's l2: 0.0617002
[5]	valid_0's l2: 0.0627077	valid_1's l2: 0.0608561
[6]	valid_0's l2: 0.0618509	valid_1's l2: 0.0600292
[7]	valid_0's l2: 0.0610494	valid_1's l2: 0.0592601
[8]	valid_0's l2: 0.0602249	valid_1's l2: 0.0584662
[9]	valid_0'

feature_fraction, val_score: 0.019773:  86%|########5 | 6/7 [00:39<00:05,  5.74s/it]

[519]	valid_0's l2: 0.01806	valid_1's l2: 0.019781
Early stopping, best iteration is:
[419]	valid_0's l2: 0.018318	valid_1's l2: 0.019773
[1]	valid_0's l2: 0.0662933	valid_1's l2: 0.0643284
[2]	valid_0's l2: 0.0653651	valid_1's l2: 0.0634327
[3]	valid_0's l2: 0.0644558	valid_1's l2: 0.0625555
[4]	valid_0's l2: 0.0635645	valid_1's l2: 0.0616964
[5]	valid_0's l2: 0.0626905	valid_1's l2: 0.060852
[6]	valid_0's l2: 0.0618343	valid_1's l2: 0.0600252
[7]	valid_0's l2: 0.0609953	valid_1's l2: 0.0592157
[8]	valid_0's l2: 0.0601721	valid_1's l2: 0.0584213
[9]	valid_0's l2: 0.0593657	valid_1's l2: 0.0576451
[10]	valid_0's l2: 0.0585754	valid_1's l2: 0.0568825
[11]	valid_0's l2: 0.0578003	valid_1's l2: 0.0561345
[12]	valid_0's l2: 0.0570412	valid_1's l2: 0.0554038
[13]	valid_0's l2: 0.0562961	valid_1's l2: 0.0546852
[14]	valid_0's l2: 0.0555664	valid_1's l2: 0.0539827
[15]	valid_0's l2: 0.0548505	valid_1's l2: 0.0532937
[16]	valid_0's l2: 0.0541496	valid_1's l2: 0.0526193
[17]	valid_0's l2: 0.053

feature_fraction, val_score: 0.019773: 100%|##########| 7/7 [00:43<00:00,  6.22s/it]
num_leaves, val_score: 0.019773:   0%|          | 0/20 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0663053	valid_1's l2: 0.0643344
[2]	valid_0's l2: 0.0653829	valid_1's l2: 0.0634409
[3]	valid_0's l2: 0.0644786	valid_1's l2: 0.0625663
[4]	valid_0's l2: 0.0635985	valid_1's l2: 0.0617119
[5]	valid_0's l2: 0.0627291	valid_1's l2: 0.0608708
[6]	valid_0's l2: 0.0618775	valid_1's l2: 0.0600474
[7]	valid_0's l2: 0.0610787	valid_1's l2: 0.0592789
[8]	valid_0's l2: 0.0602589	valid_1's l2: 0.0584862
[9]	valid_0's l2: 0.0594609	valid_1's l2: 0.0577111
[10]	valid_0's l2: 0.0586733	valid_1's l2: 0.0569492
[11]	valid_0's l2: 0.0579348	valid_1's l2: 0.0562395
[12]	valid_0's l2: 0.057177	valid_1's l2: 0.0555077
[13]	valid_0's l2: 0.0564674	valid_1's l2: 0.054826
[14]	valid_0's l2: 0.0557718	valid_1's l2: 0.0541579
[15]	valid_0's l2: 0.0550565	valid_1's l2: 0.0534666
[16]	valid_0's l2: 0.0543594	valid_1's l2: 0.0527903
[17]	valid_0's l2: 0.0542802	valid_1's l2: 0.0527313
[18]	valid_0's l2: 0.0536257	valid_1's l2: 0.0521033
[19]	valid_0's l2: 0.0529845	valid_1's l2: 0.0514854
[20]

num_leaves, val_score: 0.019773:   5%|5         | 1/20 [00:04<01:21,  4.27s/it]

[1]	valid_0's l2: 0.0663016	valid_1's l2: 0.0643317
[2]	valid_0's l2: 0.0653728	valid_1's l2: 0.0634346
[3]	valid_0's l2: 0.0644623	valid_1's l2: 0.0625555
[4]	valid_0's l2: 0.0635787	valid_1's l2: 0.061699
[5]	valid_0's l2: 0.0627039	valid_1's l2: 0.0608545
[6]	valid_0's l2: 0.0618462	valid_1's l2: 0.0600279
[7]	valid_0's l2: 0.0610443	valid_1's l2: 0.0592592
[8]	valid_0's l2: 0.0602189	valid_1's l2: 0.0584648
[9]	valid_0's l2: 0.0594178	valid_1's l2: 0.0576894
[10]	valid_0's l2: 0.0586251	valid_1's l2: 0.0569246
[11]	valid_0's l2: 0.0578833	valid_1's l2: 0.0562129
[12]	valid_0's l2: 0.0571204	valid_1's l2: 0.0554796
[13]	valid_0's l2: 0.0564075	valid_1's l2: 0.0547969
[14]	valid_0's l2: 0.0557091	valid_1's l2: 0.0541271
[15]	valid_0's l2: 0.0549891	valid_1's l2: 0.0534346
[16]	valid_0's l2: 0.054289	valid_1's l2: 0.0527575
[17]	valid_0's l2: 0.0542047	valid_1's l2: 0.0526961
[18]	valid_0's l2: 0.053548	valid_1's l2: 0.0520667
[19]	valid_0's l2: 0.0529039	valid_1's l2: 0.0514508
[20]	

num_leaves, val_score: 0.019741:  10%|#         | 2/20 [00:09<01:31,  5.06s/it]

[1]	valid_0's l2: 0.0662879	valid_1's l2: 0.0643315
[2]	valid_0's l2: 0.0653424	valid_1's l2: 0.06343
[3]	valid_0's l2: 0.0644156	valid_1's l2: 0.0625477
[4]	valid_0's l2: 0.0635192	valid_1's l2: 0.0616918
[5]	valid_0's l2: 0.062628	valid_1's l2: 0.0608433
[6]	valid_0's l2: 0.0617542	valid_1's l2: 0.0600118
[7]	valid_0's l2: 0.0609404	valid_1's l2: 0.05924
[8]	valid_0's l2: 0.0600997	valid_1's l2: 0.0584414
[9]	valid_0's l2: 0.0592861	valid_1's l2: 0.0576666
[10]	valid_0's l2: 0.0584778	valid_1's l2: 0.0569
[11]	valid_0's l2: 0.0577237	valid_1's l2: 0.0561884
[12]	valid_0's l2: 0.0569466	valid_1's l2: 0.0554513
[13]	valid_0's l2: 0.0562229	valid_1's l2: 0.054767
[14]	valid_0's l2: 0.0555131	valid_1's l2: 0.0540974
[15]	valid_0's l2: 0.0547787	valid_1's l2: 0.0534018
[16]	valid_0's l2: 0.0540676	valid_1's l2: 0.0527265
[17]	valid_0's l2: 0.0539661	valid_1's l2: 0.052661
[18]	valid_0's l2: 0.0532988	valid_1's l2: 0.0520316
[19]	valid_0's l2: 0.0526433	valid_1's l2: 0.0514148
[20]	valid_0

num_leaves, val_score: 0.019741:  15%|#5        | 3/20 [00:19<02:00,  7.07s/it]

[1]	valid_0's l2: 0.0662871	valid_1's l2: 0.0643309
[2]	valid_0's l2: 0.0653408	valid_1's l2: 0.06343
[3]	valid_0's l2: 0.0644133	valid_1's l2: 0.0625469
[4]	valid_0's l2: 0.0635163	valid_1's l2: 0.0616906
[5]	valid_0's l2: 0.0626245	valid_1's l2: 0.0608427
[6]	valid_0's l2: 0.0617499	valid_1's l2: 0.0600116
[7]	valid_0's l2: 0.060935	valid_1's l2: 0.0592392
[8]	valid_0's l2: 0.0600936	valid_1's l2: 0.0584404
[9]	valid_0's l2: 0.0592795	valid_1's l2: 0.0576655
[10]	valid_0's l2: 0.0584701	valid_1's l2: 0.0568994
[11]	valid_0's l2: 0.057715	valid_1's l2: 0.0561883
[12]	valid_0's l2: 0.0569371	valid_1's l2: 0.0554515
[13]	valid_0's l2: 0.0562128	valid_1's l2: 0.0547676
[14]	valid_0's l2: 0.0555024	valid_1's l2: 0.0540981
[15]	valid_0's l2: 0.0547674	valid_1's l2: 0.0534023
[16]	valid_0's l2: 0.0540559	valid_1's l2: 0.0527272
[17]	valid_0's l2: 0.0539539	valid_1's l2: 0.0526614
[18]	valid_0's l2: 0.0532862	valid_1's l2: 0.052032
[19]	valid_0's l2: 0.0526299	valid_1's l2: 0.0514152
[20]	va

num_leaves, val_score: 0.019741:  20%|##        | 4/20 [00:28<02:08,  8.01s/it]

[1]	valid_0's l2: 0.0663134	valid_1's l2: 0.0643427
[2]	valid_0's l2: 0.0654023	valid_1's l2: 0.0634612
[3]	valid_0's l2: 0.0645093	valid_1's l2: 0.0625998
[4]	valid_0's l2: 0.0636365	valid_1's l2: 0.0617529
[5]	valid_0's l2: 0.0627783	valid_1's l2: 0.0609245
[6]	valid_0's l2: 0.0619382	valid_1's l2: 0.0601099
[7]	valid_0's l2: 0.0611475	valid_1's l2: 0.0593483
[8]	valid_0's l2: 0.0603391	valid_1's l2: 0.058565
[9]	valid_0's l2: 0.0595472	valid_1's l2: 0.0577952
[10]	valid_0's l2: 0.0587692	valid_1's l2: 0.057046
[11]	valid_0's l2: 0.0580374	valid_1's l2: 0.0563415
[12]	valid_0's l2: 0.0572883	valid_1's l2: 0.0556154
[13]	valid_0's l2: 0.0565847	valid_1's l2: 0.0549383
[14]	valid_0's l2: 0.0558951	valid_1's l2: 0.0542746
[15]	valid_0's l2: 0.0551884	valid_1's l2: 0.0535886
[16]	valid_0's l2: 0.0544965	valid_1's l2: 0.0529183
[17]	valid_0's l2: 0.0544237	valid_1's l2: 0.0528637
[18]	valid_0's l2: 0.0537733	valid_1's l2: 0.0522398
[19]	valid_0's l2: 0.0531366	valid_1's l2: 0.0516283
[20]

num_leaves, val_score: 0.019721:  25%|##5       | 5/20 [00:32<01:36,  6.45s/it]

[649]	valid_0's l2: 0.0189034	valid_1's l2: 0.0197236
[650]	valid_0's l2: 0.0189028	valid_1's l2: 0.0197237
[651]	valid_0's l2: 0.0189018	valid_1's l2: 0.0197235
[652]	valid_0's l2: 0.0189011	valid_1's l2: 0.0197239
[653]	valid_0's l2: 0.0189005	valid_1's l2: 0.019724
[654]	valid_0's l2: 0.0188996	valid_1's l2: 0.0197238
[655]	valid_0's l2: 0.0188988	valid_1's l2: 0.0197242
[656]	valid_0's l2: 0.0188982	valid_1's l2: 0.0197239
[657]	valid_0's l2: 0.0188974	valid_1's l2: 0.0197239
[658]	valid_0's l2: 0.0188965	valid_1's l2: 0.0197237
[659]	valid_0's l2: 0.0188957	valid_1's l2: 0.0197233
[660]	valid_0's l2: 0.0188948	valid_1's l2: 0.0197235
[661]	valid_0's l2: 0.0188941	valid_1's l2: 0.0197234
[662]	valid_0's l2: 0.0188933	valid_1's l2: 0.0197234
[663]	valid_0's l2: 0.0188925	valid_1's l2: 0.0197238
[664]	valid_0's l2: 0.0188919	valid_1's l2: 0.0197242
[665]	valid_0's l2: 0.0188913	valid_1's l2: 0.019724
[666]	valid_0's l2: 0.0188906	valid_1's l2: 0.0197241
[667]	valid_0's l2: 0.0188898	

num_leaves, val_score: 0.019721:  30%|###       | 6/20 [00:40<01:37,  6.96s/it]

[1]	valid_0's l2: 0.0663032	valid_1's l2: 0.0643322
[2]	valid_0's l2: 0.0653767	valid_1's l2: 0.0634361
[3]	valid_0's l2: 0.0644685	valid_1's l2: 0.0625584
[4]	valid_0's l2: 0.0635865	valid_1's l2: 0.0617022
[5]	valid_0's l2: 0.0627135	valid_1's l2: 0.0608589
[6]	valid_0's l2: 0.0618581	valid_1's l2: 0.0600327
[7]	valid_0's l2: 0.0610576	valid_1's l2: 0.0592635
[8]	valid_0's l2: 0.0602342	valid_1's l2: 0.0584707
[9]	valid_0's l2: 0.0594346	valid_1's l2: 0.0576951
[10]	valid_0's l2: 0.058644	valid_1's l2: 0.0569327
[11]	valid_0's l2: 0.0579037	valid_1's l2: 0.0562218
[12]	valid_0's l2: 0.0571429	valid_1's l2: 0.0554895
[13]	valid_0's l2: 0.0564315	valid_1's l2: 0.0548069
[14]	valid_0's l2: 0.0557342	valid_1's l2: 0.0541381
[15]	valid_0's l2: 0.0550161	valid_1's l2: 0.0534459
[16]	valid_0's l2: 0.0543174	valid_1's l2: 0.0527694
[17]	valid_0's l2: 0.0542352	valid_1's l2: 0.0527095
[18]	valid_0's l2: 0.0535794	valid_1's l2: 0.0520809
[19]	valid_0's l2: 0.0529368	valid_1's l2: 0.0514636
[20

num_leaves, val_score: 0.019721:  35%|###5      | 7/20 [00:45<01:21,  6.30s/it]

[1]	valid_0's l2: 0.066281	valid_1's l2: 0.0643308
[2]	valid_0's l2: 0.0653265	valid_1's l2: 0.0634305
[3]	valid_0's l2: 0.0643921	valid_1's l2: 0.062548
[4]	valid_0's l2: 0.0634884	valid_1's l2: 0.0616909
[5]	valid_0's l2: 0.0625892	valid_1's l2: 0.0608421
[6]	valid_0's l2: 0.0617065	valid_1's l2: 0.0600117
[7]	valid_0's l2: 0.0608851	valid_1's l2: 0.0592391
[8]	valid_0's l2: 0.0600363	valid_1's l2: 0.0584417
[9]	valid_0's l2: 0.0592164	valid_1's l2: 0.0576658
[10]	valid_0's l2: 0.0584003	valid_1's l2: 0.0569014
[11]	valid_0's l2: 0.0576393	valid_1's l2: 0.056191
[12]	valid_0's l2: 0.0568547	valid_1's l2: 0.0554557
[13]	valid_0's l2: 0.0561247	valid_1's l2: 0.054772
[14]	valid_0's l2: 0.0554087	valid_1's l2: 0.0541029
[15]	valid_0's l2: 0.0546667	valid_1's l2: 0.0534065
[16]	valid_0's l2: 0.0539493	valid_1's l2: 0.0527288
[17]	valid_0's l2: 0.0538414	valid_1's l2: 0.0526607
[18]	valid_0's l2: 0.0531676	valid_1's l2: 0.0520309
[19]	valid_0's l2: 0.0525053	valid_1's l2: 0.0514156
[20]	v

num_leaves, val_score: 0.019721:  40%|####      | 8/20 [00:56<01:34,  7.84s/it]

[1]	valid_0's l2: 0.0662894	valid_1's l2: 0.064332
[2]	valid_0's l2: 0.0653458	valid_1's l2: 0.0634307
[3]	valid_0's l2: 0.0644208	valid_1's l2: 0.0625481
[4]	valid_0's l2: 0.0635258	valid_1's l2: 0.0616923
[5]	valid_0's l2: 0.0626363	valid_1's l2: 0.0608443
[6]	valid_0's l2: 0.0617643	valid_1's l2: 0.0600131
[7]	valid_0's l2: 0.0609519	valid_1's l2: 0.0592408
[8]	valid_0's l2: 0.0601127	valid_1's l2: 0.0584431
[9]	valid_0's l2: 0.0593007	valid_1's l2: 0.0576687
[10]	valid_0's l2: 0.0584942	valid_1's l2: 0.0569028
[11]	valid_0's l2: 0.0577417	valid_1's l2: 0.0561911
[12]	valid_0's l2: 0.0569662	valid_1's l2: 0.0554542
[13]	valid_0's l2: 0.0562441	valid_1's l2: 0.0547699
[14]	valid_0's l2: 0.0555355	valid_1's l2: 0.0541
[15]	valid_0's l2: 0.0548026	valid_1's l2: 0.0534054
[16]	valid_0's l2: 0.0540929	valid_1's l2: 0.0527304
[17]	valid_0's l2: 0.0539931	valid_1's l2: 0.0526651
[18]	valid_0's l2: 0.0533269	valid_1's l2: 0.0520349
[19]	valid_0's l2: 0.0526728	valid_1's l2: 0.0514181
[20]	v

num_leaves, val_score: 0.019721:  45%|####5     | 9/20 [01:06<01:32,  8.44s/it]

[1]	valid_0's l2: 0.0663038	valid_1's l2: 0.0643323
[2]	valid_0's l2: 0.0653782	valid_1's l2: 0.0634367
[3]	valid_0's l2: 0.0644709	valid_1's l2: 0.0625592
[4]	valid_0's l2: 0.0635893	valid_1's l2: 0.0617033
[5]	valid_0's l2: 0.0627172	valid_1's l2: 0.0608605
[6]	valid_0's l2: 0.0618627	valid_1's l2: 0.0600345
[7]	valid_0's l2: 0.0610626	valid_1's l2: 0.059266
[8]	valid_0's l2: 0.0602402	valid_1's l2: 0.0584724
[9]	valid_0's l2: 0.0594411	valid_1's l2: 0.0576968
[10]	valid_0's l2: 0.0586508	valid_1's l2: 0.0569345
[11]	valid_0's l2: 0.057911	valid_1's l2: 0.0562238
[12]	valid_0's l2: 0.0571508	valid_1's l2: 0.0554915
[13]	valid_0's l2: 0.0564399	valid_1's l2: 0.054809
[14]	valid_0's l2: 0.0557431	valid_1's l2: 0.0541402
[15]	valid_0's l2: 0.0550257	valid_1's l2: 0.0534489
[16]	valid_0's l2: 0.0543274	valid_1's l2: 0.0527722
[17]	valid_0's l2: 0.0542459	valid_1's l2: 0.0527129
[18]	valid_0's l2: 0.0535905	valid_1's l2: 0.0520846
[19]	valid_0's l2: 0.0529483	valid_1's l2: 0.0514678
[20]	

num_leaves, val_score: 0.019721:  50%|#####     | 10/20 [01:11<01:13,  7.38s/it]

[675]	valid_0's l2: 0.0180615	valid_1's l2: 0.0197786
[676]	valid_0's l2: 0.0180603	valid_1's l2: 0.0197785
[677]	valid_0's l2: 0.0180589	valid_1's l2: 0.0197782
[678]	valid_0's l2: 0.0180572	valid_1's l2: 0.019778
[679]	valid_0's l2: 0.0180554	valid_1's l2: 0.0197781
[680]	valid_0's l2: 0.0180536	valid_1's l2: 0.0197776
[681]	valid_0's l2: 0.0180524	valid_1's l2: 0.0197773
[682]	valid_0's l2: 0.0180509	valid_1's l2: 0.0197774
Early stopping, best iteration is:
[582]	valid_0's l2: 0.018959	valid_1's l2: 0.0197208
[1]	valid_0's l2: 0.0662798	valid_1's l2: 0.0643302
[2]	valid_0's l2: 0.0653242	valid_1's l2: 0.0634293
[3]	valid_0's l2: 0.0643883	valid_1's l2: 0.0625466
[4]	valid_0's l2: 0.0634836	valid_1's l2: 0.0616888
[5]	valid_0's l2: 0.0625833	valid_1's l2: 0.0608412
[6]	valid_0's l2: 0.0616996	valid_1's l2: 0.0600096
[7]	valid_0's l2: 0.0608773	valid_1's l2: 0.0592374
[8]	valid_0's l2: 0.0600274	valid_1's l2: 0.0584399
[9]	valid_0's l2: 0.0592065	valid_1's l2: 0.0576637
[10]	valid_0'

num_leaves, val_score: 0.019721:  55%|#####5    | 11/20 [01:24<01:22,  9.17s/it]

[1]	valid_0's l2: 0.0662946	valid_1's l2: 0.064332
[2]	valid_0's l2: 0.0653565	valid_1's l2: 0.0634318
[3]	valid_0's l2: 0.0644373	valid_1's l2: 0.0625496
[4]	valid_0's l2: 0.0635469	valid_1's l2: 0.0616941
[5]	valid_0's l2: 0.0626632	valid_1's l2: 0.060847
[6]	valid_0's l2: 0.0617968	valid_1's l2: 0.0600159
[7]	valid_0's l2: 0.060989	valid_1's l2: 0.0592452
[8]	valid_0's l2: 0.0601557	valid_1's l2: 0.0584481
[9]	valid_0's l2: 0.0593483	valid_1's l2: 0.0576744
[10]	valid_0's l2: 0.0585473	valid_1's l2: 0.0569082
[11]	valid_0's l2: 0.0577992	valid_1's l2: 0.056196
[12]	valid_0's l2: 0.0570287	valid_1's l2: 0.0554603
[13]	valid_0's l2: 0.0563105	valid_1's l2: 0.0547763
[14]	valid_0's l2: 0.0556062	valid_1's l2: 0.0541068
[15]	valid_0's l2: 0.0548786	valid_1's l2: 0.0534137
[16]	valid_0's l2: 0.0541726	valid_1's l2: 0.0527375
[17]	valid_0's l2: 0.054079	valid_1's l2: 0.0526752
[18]	valid_0's l2: 0.053417	valid_1's l2: 0.0520467
[19]	valid_0's l2: 0.0527671	valid_1's l2: 0.0514286
[20]	val

In [None]:
def prepare_datasets(df, fold):
    """Build the LightGBM training and validation datasets for one fold.

    Rows whose ``fold`` value equals ``fold`` become the validation set and
    all remaining rows become the training set. Features are the notebook-level
    ``cols``, the label is the ``score`` column, and the categorical features
    are the ``<name>_encoded`` counterparts of the notebook-level ``cat_cols``.

    Args:
        df: DataFrame holding ``fold``, ``score`` and every column in ``cols``.
        fold: Identifier of the fold to hold out.

    Returns:
        Tuple ``(train_dataset, valid_dataset)``; the validation dataset is
        created with the training dataset as its ``reference``.
    """
    encoded_categoricals = [f'{name}_encoded' for name in cat_cols]
    shared_kwargs = {'categorical_feature': encoded_categoricals}

    held_in = df[df['fold']!=fold]
    held_out = df[df['fold']==fold]

    train_dataset = lgb.Dataset(held_in[cols], held_in['score'], **shared_kwargs)
    valid_dataset = lgb.Dataset(
        held_out[cols],
        held_out['score'],
        reference=train_dataset,
        **shared_kwargs,
    )
    return train_dataset, valid_dataset


def train_fn(df, params):
    """Train one LightGBM model per fold and collect out-of-fold predictions.

    For every distinct value in ``df['fold']`` a booster is trained on the
    other folds, evaluated on both the training and held-out datasets with
    early stopping (100 rounds) and evaluation logging every 100 iterations,
    then used to predict the held-out rows at its best iteration. Each booster
    is saved to ``output_dir / 'lgb_fold{fold}.txt'``.

    Args:
        df: DataFrame with a ``fold`` column plus everything
            ``prepare_datasets`` needs.
        params: Parameter dict passed to ``lgb.train``.

    Returns:
        DataFrame made of the held-out rows of every fold (original index
        kept) with an added ``pred_lgb`` column.
    """
    oof_frames = []

    for fold in df['fold'].unique():
        train_set, valid_set = prepare_datasets(df, fold)

        callbacks = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ]
        booster = lgb.train(
            params,
            train_set,
            valid_sets=[train_set, valid_set],
            callbacks=callbacks,
        )

        # Predict the held-out rows with the iteration chosen by early stopping.
        held_out = df[df['fold']==fold].copy()
        held_out['pred_lgb'] = booster.predict(
            held_out[cols], num_iteration=booster.best_iteration
        )
        oof_frames.append(held_out)

        # Persist the booster truncated at its best iteration.
        model_path = output_dir / f'lgb_fold{fold}.txt'
        booster.save_model(model_path, num_iteration=booster.best_iteration)

    return pd.concat(oof_frames, axis=0, ignore_index=False)

In [None]:
# Train the per-fold models with the tuned parameters and keep the out-of-fold predictions.
pred = train_fn(df=train, params=best_params)

In [None]:
# Reload every saved fold booster and average their gain-based feature importances.
importances = []
for fold in train['fold'].unique():
    fold_model_path = output_dir / f'lgb_fold{fold}.txt'
    fold_booster = lgb.Booster(model_file=fold_model_path)
    importances.append(fold_booster.feature_importance(importance_type='gain'))

importance_df = pd.DataFrame(
    {'feature': cols, 'importance': np.mean(importances, axis=0)}
)
importance_df.sort_values('importance', ascending=False)

In [None]:
# Out-of-fold score obtained with Optuna's best params
oof_score = get_score(pred['score'], pred['pred_lgb'])
print(oof_score)
pred.head()

In [None]:
import os
import json
from kaggle.api.kaggle_api_extended import KaggleApi

# Kaggle user name; used as the owner part of the dataset id ("<ID>/<DATASET_ID>").
ID = 'hanejiyuto'
# Dataset slug/title: "<dataset_name>-<dataset_version>" from ColabConfig.
DATASET_ID = f'{ColabConfig.dataset_name}-{ColabConfig.dataset_version}'
# Folder to upload and the version message, both taken from ColabConfig.
UPLOAD_DIR, VERSION_NOTES = ColabConfig.dataset_dir, ColabConfig.dataset_note

def dataset_create_new():
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_new(folder=UPLOAD_DIR, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets create -t -p $UPLOAD_DIR -r tar

def dataset_create_version():  # バージョンアップデート
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    if not os.path.exists('dataset-metadata.json'):
        with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
            json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_version(folder=UPLOAD_DIR, version_notes=VERSION_NOTES, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets version -t -p $UPLOAD_DIR -r tar -m $VERSION_NOTES

# Skip the upload entirely in debug runs; otherwise create the dataset or add a version.
if not CFG.debug:
    if ColabConfig.dataset_new:
        dataset_create_new()
    else:
        dataset_create_version()