In [None]:
class CFG:
    """Run-wide settings for the stacking experiment."""

    # Identifiers of the first-stage experiments whose out-of-fold
    # predictions are stacked; each one names a sub-directory of the output folder.
    exps = ['026', '033', '022', 'PatentSBERTa-exp035']
    # Number of cross-validation folds used when splitting anchors.
    n_fold = 4
    # Seed handed to LightGBM through its `seed` parameter.
    seed = 42
    # Debug switch; not read by any of the cells visible in this notebook.
    debug = False

# ====================================================
# Colab settings
# ====================================================
class ColabConfig:
    """Settings for the Google Colab / Kaggle Dataset workflow."""

    # Name of the Kaggle Dataset.
    dataset_name = 'PPPM-stacking'
    # Version label: 'exp' followed by the stacked experiment ids joined with '-'.
    dataset_version = f"exp{'-'.join(CFG.exps)}"
    # Whether this is a new dataset.
    dataset_new = True
    # Directory to upload to the Kaggle Dataset (filled in once output_dir is known).
    dataset_dir = None
    # Changes since the previous version.
    dataset_note = '""'
    # Whether to download the data onto the Colab machine itself.
    in_colab = False

In [None]:
# Report the runtime environment: CUDA compiler, torch and Python versions,
# GPU status, and the amount of RAM on the machine.
!nvcc --version
!python -c 'import torch; print(torch.__version__) '
!python --version
print('')
!nvidia-smi
print('')
from psutil import virtual_memory
# NOTE(review): `.total` is the machine's total memory, although the message
# below words it as "available" — confirm which figure is intended.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
1.11.0+cu113
Python 3.7.13

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.


Your runtime has 13.6 gigabytes of available RAM



In [None]:
%%time
import sys
COLAB = "google.colab" in sys.modules

if COLAB:
    import os
    print('This environment is Google Colab')

    # mount drive
    from google.colab import drive
    if not os.path.isdir('/content/drive'):
        drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/kaggle/PPPM/exps')

    # kaggle api token and update kaggle api
    from google.colab import files
    if not os.path.isfile('~/.kaggle/kaggle.json'):
        # files.upload()
        !mkdir -p ~/.kaggle
        !cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
        !pip install --upgrade --force-reinstall --no-deps -q kaggle

    if ColabConfig.in_colab:
        # make directory in colab
        !mkdir -p /content/input
        !mkdir -p /content/working

        # download dataset in colab
        import zipfile, glob
        os.chdir('/content/input')
        # !kaggle competitions download -qc birdclef-2022 -p birdclef-2022
        
        # !mkdir birdclef-2022
        # !cp /content/drive/MyDrive/kaggle/BirdCLEF2022/input/birdclef-2022/* ./birdclef-2022/

        for p in glob.glob('**/*.zip', recursive=True):
            print(p)
            d, f = os.path.split(p)
            # if f in ['']:
            #     continue
            with zipfile.ZipFile(p, 'r') as zipf:
                print('unzip: ', zipf)
                zipf.extractall(d)
                print('remove: ', f)
                os.remove(p)
        
        os.chdir('/content/working')

This environment is Google Colab
Mounted at /content/drive
[K     |████████████████████████████████| 58 kB 2.7 MB/s 
[?25h  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
CPU times: user 1.94 s, sys: 467 ms, total: 2.41 s
Wall time: 27.8 s


In [None]:
import os
import random
import pickle
from pathlib import Path
from psutil import cpu_count
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 10000)
pd.set_option('display.max_colwidth', None)
from tqdm.auto import tqdm
tqdm.pandas()
os.system('python -m pip install -U lightgbm')
import lightgbm as lgb
print(f"lightgbm.__version__: {lgb.__version__}")
os.system('python -m pip install optuna')
import optuna.integration.lightgbm as lgb_optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
print(f"torch.__version__: {torch.__version__}")

os.system('python -m pip install sentencepiece')
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

lightgbm.__version__: 3.3.2
torch.__version__: 1.11.0+cu113
tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [None]:
# Competition data location and the Drive folder that receives every artifact
# written by this notebook (encoders, feature list, LightGBM models).
input_dir = Path('../input/us-patent-phrase-to-phrase-matching')
output_dir = Path(f"/content/drive/MyDrive/kaggle/PPPM/output/stacking{'_'.join(CFG.exps)}")
# parents=True so that a missing intermediate folder is created instead of
# raising FileNotFoundError on a fresh Drive.
output_dir.mkdir(parents=True, exist_ok=True)

# The same folder is what gets uploaded as a Kaggle Dataset.
ColabConfig.dataset_dir = str(output_dir)

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as `y_true`.

    Returns:
        The correlation coefficient (first element of scipy's `pearsonr` result).
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not
    # guarantee that `sp.stats` has been loaded on every SciPy version.
    from scipy.stats import pearsonr

    return pearsonr(y_true, y_pred)[0]


def get_logger(filename=None):
    """Return a logger that writes bare messages to the console and to a file.

    Args:
        filename: Path of the log file without the ".log" suffix. When omitted,
            `output_dir / 'train'` is used, resolved at call time so that a
            changed `output_dir` is honoured.

    Returns:
        The module-level logger, set to INFO, with exactly one stream handler
        and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    if filename is None:
        filename = output_dir / 'train'
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object for the same name, so handlers from an
    # earlier call are still attached; drop them, otherwise every message would
    # be emitted once per call made so far.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    Also exports the seed as PYTHONHASHSEED and switches cuDNN to
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

In [None]:
# Feature engineering
def tok_len(sentence, tokenizer):
    """Return the number of tokens `tokenizer.tokenize` produces for `sentence`."""
    return len(tokenizer.tokenize(sentence))

def tokenizer_feature_engineering(df, tokenizer):
    """Build token-length features for the anchor, target and context text.

    The input frame is left untouched. Previously the features were written
    into `df` itself, so a second call on the same frame treated them as
    pre-existing columns and returned only `id`.

    Args:
        df: Frame with the columns 'id', 'anchor', 'target' and 'context_text'.
        tokenizer: Object exposing `tokenize(str)`.

    Returns:
        A new frame, indexed like `df`, holding the ten length features
        followed by the 'id' column.
    """
    def _tok_len(sentence):
        return tok_len(sentence, tokenizer)

    features = pd.DataFrame(index=df.index)

    features['anchor_tok_len'] = df['anchor'].map(_tok_len)
    features['target_tok_len'] = df['target'].map(_tok_len)
    features['context_tok_len'] = df['context_text'].map(_tok_len)
    # NOTE(review): the constant 4 presumably accounts for the special tokens
    # of the model input — confirm against the first-stage training code.
    features['input_len'] = (
        features['anchor_tok_len'] + features['target_tok_len'] + features['context_tok_len'] + 4
    )

    # Pairwise difference and ratio of the three lengths.
    features['len_anc_tgt_diff'] = features['anchor_tok_len'] - features['target_tok_len']
    features['len_anc_tgt_div'] = features['anchor_tok_len'] / features['target_tok_len']

    features['len_anc_cnt_diff'] = features['anchor_tok_len'] - features['context_tok_len']
    features['len_anc_cnt_div'] = features['anchor_tok_len'] / features['context_tok_len']

    features['len_tgt_cnt_diff'] = features['target_tok_len'] - features['context_tok_len']
    features['len_tgt_cnt_div'] = features['target_tok_len'] / features['context_tok_len']

    # Return only the created features plus the id column (used as merge key).
    features['id'] = df['id']
    return features

In [None]:
def make_ensemble_datasets(cfg):
    """Join every experiment's out-of-fold predictions and length features onto train.csv.

    For each experiment in `cfg.exps` the OOF frame and tokenizer are loaded
    from that experiment's output directory. Feature columns are suffixed with
    '_<exp>' and the prediction column becomes 'pred_<exp>'. The best and the
    mean per-experiment score (see `get_score`) are printed.

    Args:
        cfg: Object with an `exps` attribute listing experiment identifiers.

    Returns:
        The training frame with all feature and prediction columns merged on 'id'.
    """
    # Experiments whose OOF frame was saved as CSV; all others use pickle.
    # NOTE(review): read_pickle executes arbitrary code on load — only use it
    # on files produced by your own runs.
    csv_exps = ('deberta-v3-large-e1_exp032', 'deberta-v3-large-e1_exp035', 'PatentSBERTa-exp035')

    merged = pd.read_csv(input_dir / 'train.csv')
    scores = []

    for exp_name in cfg.exps:
        exp_dir = Path(f'/content/drive/MyDrive/kaggle/PPPM/output/{exp_name}')
        if exp_name in csv_exps:
            oof = pd.read_csv(exp_dir / 'oof_df.csv')
        else:
            oof = pd.read_pickle(exp_dir / 'oof_df.pkl')
        exp_tokenizer = AutoTokenizer.from_pretrained(exp_dir / 'tokenizer')

        # Features and predictions get column names such as 'pred_000'.
        feats = tokenizer_feature_engineering(oof, exp_tokenizer)
        feats = feats.rename(
            columns={name: name + '_' + exp_name for name in feats.columns if name != 'id'}
        )
        exp_pred = oof[['id', 'pred']].rename(columns={'pred': f'pred_{exp_name}'})

        merged = merged.merge(feats, on='id', how='left')
        merged = merged.merge(exp_pred, on='id', how='left')

        scores.append(get_score(merged['score'], merged[f'pred_{exp_name}']))

    print(f'max score: {max(scores)}')
    print(f'avg score: {sum(scores) / len(scores)}')
    return merged

In [None]:
# Build the stacking table (train.csv plus per-experiment features and
# out-of-fold predictions) and preview its first rows.
train = make_ensemble_datasets(CFG)
display(train.head())

max score: 0.832151085616115
avg score: 0.8176988520774562


Unnamed: 0,id,anchor,target,context,score,anchor_tok_len_026,target_tok_len_026,context_tok_len_026,input_len_026,len_anc_tgt_diff_026,len_anc_tgt_div_026,len_anc_cnt_diff_026,len_anc_cnt_div_026,len_tgt_cnt_diff_026,len_tgt_cnt_div_026,pred_026,anchor_tok_len_033,target_tok_len_033,context_tok_len_033,input_len_033,len_anc_tgt_diff_033,len_anc_tgt_div_033,len_anc_cnt_diff_033,len_anc_cnt_div_033,len_tgt_cnt_diff_033,len_tgt_cnt_div_033,pred_033,anchor_tok_len_022,target_tok_len_022,context_tok_len_022,input_len_022,len_anc_tgt_diff_022,len_anc_tgt_div_022,len_anc_cnt_diff_022,len_anc_cnt_div_022,len_tgt_cnt_diff_022,len_tgt_cnt_div_022,pred_022,anchor_tok_len_PatentSBERTa-exp035,target_tok_len_PatentSBERTa-exp035,context_tok_len_PatentSBERTa-exp035,input_len_PatentSBERTa-exp035,len_anc_tgt_diff_PatentSBERTa-exp035,len_anc_tgt_div_PatentSBERTa-exp035,len_anc_cnt_diff_PatentSBERTa-exp035,len_anc_cnt_div_PatentSBERTa-exp035,len_tgt_cnt_diff_PatentSBERTa-exp035,len_tgt_cnt_div_PatentSBERTa-exp035,pred_PatentSBERTa-exp035
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.440476,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.413867,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.126895,3,5,24,36,-2,0.6,-21,0.125,-19,0.208333,0.414768
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.641294,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.531482,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.632196,3,4,24,35,-1,0.75,-21,0.125,-20,0.166667,0.768174
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.247038,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.237905,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.237127,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.09902
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.494176,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.561054,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.489984,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.425223
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.011766,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.028578,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.013014,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.12524


In [None]:
# foldに分ける
# credits to: https://www.kaggle.com/code/abhishek/creating-folds-properly-hopefully-p

!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train = train.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
print(train.fold.value_counts())

550 183
549 184
550 183
550 183
3    9622
0    9379
1    8860
2    8612
Name: fold, dtype: int64


In [None]:
# Encode categorical variables.
# 'section' is the first character of the context code; each categorical
# column gets an integer-encoded twin and its fitted encoder is pickled into
# output_dir.
train['section'] = train['context'].map(lambda code: code[0])
cat_cols = ['section']
for c in cat_cols:
    le = LabelEncoder()
    train[f'{c}_encoded'] = le.fit(train[c]).transform(train[c])
    encoder_path = output_dir / f'{c}_encoder.pkl'
    with encoder_path.open('wb') as f:
        pickle.dump(le, f)

In [None]:
# List every column now present in the stacking table.
print(train.columns)

Index(['id', 'anchor', 'target', 'context', 'score', 'anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_033', 'target_tok_len_033', 'context_tok_len_033', 'input_len_033', 'len_anc_tgt_diff_033', 'len_anc_tgt_div_033', 'len_anc_cnt_diff_033', 'len_anc_cnt_div_033', 'len_tgt_cnt_diff_033', 'len_tgt_cnt_div_033', 'pred_033', 'anchor_tok_len_022', 'target_tok_len_022', 'context_tok_len_022', 'input_len_022', 'len_anc_tgt_diff_022', 'len_anc_tgt_div_022', 'len_anc_cnt_diff_022', 'len_anc_cnt_div_022', 'len_tgt_cnt_diff_022', 'len_tgt_cnt_div_022', 'pred_022', 'anchor_tok_len_PatentSBERTa-exp035', 'target_tok_len_PatentSBERTa-exp035', 'context_tok_len_PatentSBERTa-exp035', 'input_len_PatentSBERTa-exp035', 'len_anc_tgt_diff_PatentSBERTa-exp035', 'len_anc_tgt_div_PatentSBERTa-exp035', 'len_anc_

In [None]:
# Feature list for LightGBM: every column of `train` except identifiers, raw
# text, the target, the fold id and the un-encoded category. The list is
# pickled to output_dir.
excluded_columns = ['id', 'anchor', 'target', 'context', 'score', 'fold', 'section']
cols = []
for column_name in train.columns:
    if column_name in excluded_columns:
        continue
    cols.append(column_name)
(output_dir / 'cols.pkl').write_bytes(pickle.dumps(cols))
print(cols)

['anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_033', 'target_tok_len_033', 'context_tok_len_033', 'input_len_033', 'len_anc_tgt_diff_033', 'len_anc_tgt_div_033', 'len_anc_cnt_diff_033', 'len_anc_cnt_div_033', 'len_tgt_cnt_diff_033', 'len_tgt_cnt_div_033', 'pred_033', 'anchor_tok_len_022', 'target_tok_len_022', 'context_tok_len_022', 'input_len_022', 'len_anc_tgt_diff_022', 'len_anc_tgt_div_022', 'len_anc_cnt_diff_022', 'len_anc_cnt_div_022', 'len_tgt_cnt_diff_022', 'len_tgt_cnt_div_022', 'pred_022', 'anchor_tok_len_PatentSBERTa-exp035', 'target_tok_len_PatentSBERTa-exp035', 'context_tok_len_PatentSBERTa-exp035', 'input_len_PatentSBERTa-exp035', 'len_anc_tgt_diff_PatentSBERTa-exp035', 'len_anc_tgt_div_PatentSBERTa-exp035', 'len_anc_cnt_diff_PatentSBERTa-exp035', 'len_anc_cnt_div_Pate

In [None]:
# Base LightGBM parameters; used as the starting point for the Optuna tuner.
params = dict(
    objective='regression',
    boosting='gbdt',
    num_iterations=1000,
    learning_rate=0.01,
    # num_leaves=20,  # 31
    num_threads=cpu_count(logical=False),  # physical cores only
    seed=CFG.seed,

    # max_depth=3,  # -1
    # min_data_in_leaf=100,  # 20
    verbosity=-1,  # 1

    metric='l2',
)

In [None]:
def tuning_params(df, params):
    """Tune LightGBM parameters with Optuna's LightGBM integration on one hold-out split.

    Anchors are shuffled with a fixed seed and the first quarter of them forms
    the validation set, so no anchor appears on both sides of the split.

    Args:
        df: Stacking table containing 'anchor', 'score' and the feature columns
            listed in the global `cols`.
        params: Base LightGBM parameters.

    Returns:
        The `params` attribute of the booster returned by the tuner.
    """
    val_prop = 0.25  # Share of validation data; 0.25 because of the 4 folds.

    anchors = df['anchor'].unique()
    np.random.default_rng(seed=71).shuffle(anchors)
    n_val_anchors = int(len(anchors) * val_prop)

    # Positional row indices of both sides of the split.
    is_val = np.isin(df['anchor'], anchors[:n_val_anchors])
    val_idx = np.flatnonzero(is_val)
    tr_idx = np.flatnonzero(~is_val)
    print(f'train: {len(tr_idx)}, val: {len(val_idx)}')

    tr_df, val_df = df.iloc[tr_idx], df.iloc[val_idx]

    categorical = [f'{c}_encoded' for c in cat_cols]
    tr_ds = lgb.Dataset(tr_df[cols], tr_df['score'], categorical_feature=categorical)
    val_ds = lgb.Dataset(
        val_df[cols], val_df['score'], categorical_feature=categorical, reference=tr_ds
    )

    tuned_booster = lgb_optuna.train(
        params,
        tr_ds,
        valid_sets=[tr_ds, val_ds],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
        verbosity=-1,
    )

    return tuned_booster.params

In [None]:
# Run the Optuna search once and show the parameters it settled on.
best_params = tuning_params(train, params)
display(best_params)

train: 27261, val: 9212


[32m[I 2022-06-20 10:47:57,201][0m A new study created in memory with name: no-name-fbd32af8-3555-42c2-b794-e51828dd93e5[0m
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0662874	valid_1's l2: 0.0643224
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.0653452	valid_1's l2: 0.0634143
[3]	valid_0's l2: 0.0644171	valid_1's l2: 0.0625244
[4]	valid_0's l2: 0.0635076	valid_1's l2: 0.0616494
[5]	valid_0's l2: 0.0626195	valid_1's l2: 0.0607975
[6]	valid_0's l2: 0.0617568	valid_1's l2: 0.059966
[7]	valid_0's l2: 0.0609026	valid_1's l2: 0.0591478
[8]	valid_0's l2: 0.0600621	valid_1's l2: 0.0583425
[9]	valid_0's l2: 0.0592392	valid_1's l2: 0.0575544
[10]	valid_0's l2: 0.058433	valid_1's l2: 0.0567799
[11]	valid_0's l2: 0.0576439	valid_1's l2: 0.0560252
[12]	valid_0's l2: 0.0568688	valid_1's l2: 0.0552836
[13]	valid_0's l2: 0.0561104	valid_1's l2: 0.0545551
[14]	valid_0's l2: 0.0553645	valid_1's l2: 0.0538404
[15]	valid_0's l2: 0.0546334	valid_1's l2: 0.0531414
[16]	valid_0's l2: 0.0539171	valid_1's l2: 0.052455
[17]	valid_0's l2: 0.0532146	valid_1's l2: 0.0517807
[18]	valid_0's l2: 0.0525262	valid_1's l2: 0.0511

feature_fraction, val_score: 0.019012:  14%|#4        | 1/7 [00:05<00:34,  5.71s/it]

[490]	valid_0's l2: 0.0164512	valid_1's l2: 0.0190295
[491]	valid_0's l2: 0.0164483	valid_1's l2: 0.0190296
[492]	valid_0's l2: 0.0164449	valid_1's l2: 0.01903
Early stopping, best iteration is:
[392]	valid_0's l2: 0.0167868	valid_1's l2: 0.0190117
[1]	valid_0's l2: 0.0662889	valid_1's l2: 0.0643229
[2]	valid_0's l2: 0.0653741	valid_1's l2: 0.0634429
[3]	valid_0's l2: 0.064472	valid_1's l2: 0.0625693
[4]	valid_0's l2: 0.0643677	valid_1's l2: 0.0624944
[5]	valid_0's l2: 0.063578	valid_1's l2: 0.0617304
[6]	valid_0's l2: 0.0627093	valid_1's l2: 0.0608867
[7]	valid_0's l2: 0.0625871	valid_1's l2: 0.0607926
[8]	valid_0's l2: 0.0617263	valid_1's l2: 0.0599612
[9]	valid_0's l2: 0.0609093	valid_1's l2: 0.0591768
[10]	valid_0's l2: 0.0600677	valid_1's l2: 0.05837
[11]	valid_0's l2: 0.0592474	valid_1's l2: 0.0575842
[12]	valid_0's l2: 0.0585463	valid_1's l2: 0.0569104
[13]	valid_0's l2: 0.0578584	valid_1's l2: 0.0562488
[14]	valid_0's l2: 0.0571848	valid_1's l2: 0.0555981
[15]	valid_0's l2: 0.0

feature_fraction, val_score: 0.018954:  29%|##8       | 2/7 [00:13<00:33,  6.67s/it]

[1]	valid_0's l2: 0.0662889	valid_1's l2: 0.0643229
[2]	valid_0's l2: 0.0653741	valid_1's l2: 0.0634429
[3]	valid_0's l2: 0.064458	valid_1's l2: 0.0625595
[4]	valid_0's l2: 0.0635523	valid_1's l2: 0.0616861
[5]	valid_0's l2: 0.0627755	valid_1's l2: 0.0609355
[6]	valid_0's l2: 0.0619225	valid_1's l2: 0.0601078
[7]	valid_0's l2: 0.0610812	valid_1's l2: 0.0592972
[8]	valid_0's l2: 0.0602499	valid_1's l2: 0.0584952
[9]	valid_0's l2: 0.0594602	valid_1's l2: 0.0577379
[10]	valid_0's l2: 0.0586473	valid_1's l2: 0.0569595
[11]	valid_0's l2: 0.0578547	valid_1's l2: 0.0562
[12]	valid_0's l2: 0.0570768	valid_1's l2: 0.0554518
[13]	valid_0's l2: 0.0564119	valid_1's l2: 0.0548127
[14]	valid_0's l2: 0.0556766	valid_1's l2: 0.0541022
[15]	valid_0's l2: 0.0549492	valid_1's l2: 0.0534028
[16]	valid_0's l2: 0.0543226	valid_1's l2: 0.0527986
[17]	valid_0's l2: 0.0536219	valid_1's l2: 0.0521243
[18]	valid_0's l2: 0.0529468	valid_1's l2: 0.0514779
[19]	valid_0's l2: 0.0522646	valid_1's l2: 0.0508235
[20]	v

feature_fraction, val_score: 0.018946:  43%|####2     | 3/7 [00:22<00:32,  8.07s/it]

[1]	valid_0's l2: 0.0662888	valid_1's l2: 0.0643232
[2]	valid_0's l2: 0.0653464	valid_1's l2: 0.0634147
[3]	valid_0's l2: 0.0644311	valid_1's l2: 0.0625327
[4]	valid_0's l2: 0.0635253	valid_1's l2: 0.0616597
[5]	valid_0's l2: 0.0626364	valid_1's l2: 0.060807
[6]	valid_0's l2: 0.0617738	valid_1's l2: 0.0599765
[7]	valid_0's l2: 0.0609193	valid_1's l2: 0.0591587
[8]	valid_0's l2: 0.0600784	valid_1's l2: 0.0583531
[9]	valid_0's l2: 0.0592539	valid_1's l2: 0.0575634
[10]	valid_0's l2: 0.0584458	valid_1's l2: 0.0567896
[11]	valid_0's l2: 0.0576565	valid_1's l2: 0.0560344
[12]	valid_0's l2: 0.0568802	valid_1's l2: 0.055289
[13]	valid_0's l2: 0.0561215	valid_1's l2: 0.0545606
[14]	valid_0's l2: 0.0553761	valid_1's l2: 0.0538474
[15]	valid_0's l2: 0.054644	valid_1's l2: 0.0531473
[16]	valid_0's l2: 0.0539489	valid_1's l2: 0.0524822
[17]	valid_0's l2: 0.0532455	valid_1's l2: 0.0518081
[18]	valid_0's l2: 0.0525558	valid_1's l2: 0.0511478
[19]	valid_0's l2: 0.0518811	valid_1's l2: 0.0505005
[20]	

feature_fraction, val_score: 0.018946:  57%|#####7    | 4/7 [00:34<00:28,  9.59s/it]

[498]	valid_0's l2: 0.0164605	valid_1's l2: 0.0189881
[499]	valid_0's l2: 0.0164573	valid_1's l2: 0.0189881
[500]	valid_0's l2: 0.0164537	valid_1's l2: 0.0189873
[500]	valid_0's l2: 0.0164537	valid_1's l2: 0.0189873
[501]	valid_0's l2: 0.0164516	valid_1's l2: 0.0189882
[502]	valid_0's l2: 0.0164487	valid_1's l2: 0.018988
[503]	valid_0's l2: 0.0164456	valid_1's l2: 0.0189879
[504]	valid_0's l2: 0.016443	valid_1's l2: 0.0189882
[505]	valid_0's l2: 0.0164406	valid_1's l2: 0.0189877
Early stopping, best iteration is:
[405]	valid_0's l2: 0.0169279	valid_1's l2: 0.0189458
[1]	valid_0's l2: 0.0662889	valid_1's l2: 0.0643229
[2]	valid_0's l2: 0.0653741	valid_1's l2: 0.0634429
[3]	valid_0's l2: 0.064458	valid_1's l2: 0.0625595
[4]	valid_0's l2: 0.0635523	valid_1's l2: 0.0616861
[5]	valid_0's l2: 0.0627754	valid_1's l2: 0.0609356
[6]	valid_0's l2: 0.0619178	valid_1's l2: 0.0601051
[7]	valid_0's l2: 0.0610602	valid_1's l2: 0.0592841
[8]	valid_0's l2: 0.0602161	valid_1's l2: 0.0584752
[9]	valid_0'

feature_fraction, val_score: 0.018946:  71%|#######1  | 5/7 [00:39<00:15,  7.93s/it]

[1]	valid_0's l2: 0.0662889	valid_1's l2: 0.0643229
[2]	valid_0's l2: 0.0653741	valid_1's l2: 0.0634429
[3]	valid_0's l2: 0.064458	valid_1's l2: 0.0625595
[4]	valid_0's l2: 0.0635513	valid_1's l2: 0.0616859
[5]	valid_0's l2: 0.0626785	valid_1's l2: 0.0608428
[6]	valid_0's l2: 0.0618149	valid_1's l2: 0.0600113
[7]	valid_0's l2: 0.0609595	valid_1's l2: 0.0591911
[8]	valid_0's l2: 0.0601177	valid_1's l2: 0.0583845
[9]	valid_0's l2: 0.0592964	valid_1's l2: 0.0575965
[10]	valid_0's l2: 0.0584882	valid_1's l2: 0.0568228
[11]	valid_0's l2: 0.0576993	valid_1's l2: 0.0560675
[12]	valid_0's l2: 0.0569245	valid_1's l2: 0.0553226
[13]	valid_0's l2: 0.0561648	valid_1's l2: 0.0545932
[14]	valid_0's l2: 0.0554276	valid_1's l2: 0.053885
[15]	valid_0's l2: 0.0546946	valid_1's l2: 0.0531837
[16]	valid_0's l2: 0.0539986	valid_1's l2: 0.0525168
[17]	valid_0's l2: 0.0532937	valid_1's l2: 0.0518411
[18]	valid_0's l2: 0.0526045	valid_1's l2: 0.0511841
[19]	valid_0's l2: 0.0519295	valid_1's l2: 0.0505377
[20]

feature_fraction, val_score: 0.018946:  86%|########5 | 6/7 [00:45<00:07,  7.06s/it]

[497]	valid_0's l2: 0.0164974	valid_1's l2: 0.0190052
[498]	valid_0's l2: 0.0164946	valid_1's l2: 0.0190041
[499]	valid_0's l2: 0.016491	valid_1's l2: 0.0190036
[500]	valid_0's l2: 0.016488	valid_1's l2: 0.0190035
[500]	valid_0's l2: 0.016488	valid_1's l2: 0.0190035
[501]	valid_0's l2: 0.016485	valid_1's l2: 0.0190042
[502]	valid_0's l2: 0.0164824	valid_1's l2: 0.0190045
[503]	valid_0's l2: 0.01648	valid_1's l2: 0.0190048
[504]	valid_0's l2: 0.0164774	valid_1's l2: 0.0190051
[505]	valid_0's l2: 0.0164739	valid_1's l2: 0.0190046
Early stopping, best iteration is:
[405]	valid_0's l2: 0.0169279	valid_1's l2: 0.0189458
[1]	valid_0's l2: 0.0662746	valid_1's l2: 0.0643153
[2]	valid_0's l2: 0.0653285	valid_1's l2: 0.0634081
[3]	valid_0's l2: 0.0644013	valid_1's l2: 0.0625159
[4]	valid_0's l2: 0.0634923	valid_1's l2: 0.0616446
[5]	valid_0's l2: 0.0626017	valid_1's l2: 0.0607871
[6]	valid_0's l2: 0.0617295	valid_1's l2: 0.0599499
[7]	valid_0's l2: 0.0608745	valid_1's l2: 0.05913
[8]	valid_0's l

feature_fraction, val_score: 0.018946: 100%|##########| 7/7 [00:50<00:00,  7.24s/it]


[500]	valid_0's l2: 0.0164182	valid_1's l2: 0.0190884
[500]	valid_0's l2: 0.0164182	valid_1's l2: 0.0190884
[501]	valid_0's l2: 0.0164155	valid_1's l2: 0.0190886
[502]	valid_0's l2: 0.0164129	valid_1's l2: 0.0190897
[503]	valid_0's l2: 0.0164106	valid_1's l2: 0.0190903
[504]	valid_0's l2: 0.0164079	valid_1's l2: 0.0190904
[505]	valid_0's l2: 0.0164044	valid_1's l2: 0.0190897
Early stopping, best iteration is:
[405]	valid_0's l2: 0.0169279	valid_1's l2: 0.0189458


num_leaves, val_score: 0.018946:   0%|          | 0/20 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0662801	valid_1's l2: 0.0643183
[2]	valid_0's l2: 0.065354	valid_1's l2: 0.0634322
[3]	valid_0's l2: 0.0644277	valid_1's l2: 0.0625449
[4]	valid_0's l2: 0.0635119	valid_1's l2: 0.0616691
[5]	valid_0's l2: 0.0627269	valid_1's l2: 0.060918
[6]	valid_0's l2: 0.0618675	valid_1's l2: 0.0600909
[7]	valid_0's l2: 0.0610168	valid_1's l2: 0.0592783
[8]	valid_0's l2: 0.0601772	valid_1's l2: 0.0584734
[9]	valid_0's l2: 0.0593806	valid_1's l2: 0.0577159
[10]	valid_0's l2: 0.0585568	valid_1's l2: 0.0569288
[11]	valid_0's l2: 0.0577547	valid_1's l2: 0.0561646
[12]	valid_0's l2: 0.0569672	valid_1's l2: 0.0554134
[13]	valid_0's l2: 0.0562954	valid_1's l2: 0.0547732
[14]	valid_0's l2: 0.0555519	valid_1's l2: 0.0540602
[15]	valid_0's l2: 0.0548154	valid_1's l2: 0.0533589
[16]	valid_0's l2: 0.0541822	valid_1's l2: 0.0527578
[17]	valid_0's l2: 0.0534736	valid_1's l2: 0.0520822
[18]	valid_0's l2: 0.0527906	valid_1's l2: 0.0514321
[19]	valid_0's l2: 0.0521003	valid_1's l2: 0.0507735
[20]

num_leaves, val_score: 0.018946:   5%|5         | 1/20 [00:06<01:54,  6.04s/it]

[1]	valid_0's l2: 0.0662681	valid_1's l2: 0.0643169
[2]	valid_0's l2: 0.0653283	valid_1's l2: 0.0634257
[3]	valid_0's l2: 0.0643894	valid_1's l2: 0.062536
[4]	valid_0's l2: 0.0634615	valid_1's l2: 0.0616569
[5]	valid_0's l2: 0.0626641	valid_1's l2: 0.0609064
[6]	valid_0's l2: 0.0617955	valid_1's l2: 0.0600803
[7]	valid_0's l2: 0.0609328	valid_1's l2: 0.059269
[8]	valid_0's l2: 0.0600815	valid_1's l2: 0.0584618
[9]	valid_0's l2: 0.0592752	valid_1's l2: 0.0577012
[10]	valid_0's l2: 0.0584392	valid_1's l2: 0.0569124
[11]	valid_0's l2: 0.0576252	valid_1's l2: 0.0561476
[12]	valid_0's l2: 0.0568265	valid_1's l2: 0.0553947
[13]	valid_0's l2: 0.0561431	valid_1's l2: 0.0547568
[14]	valid_0's l2: 0.0553898	valid_1's l2: 0.0540439
[15]	valid_0's l2: 0.054642	valid_1's l2: 0.053342
[16]	valid_0's l2: 0.0539971	valid_1's l2: 0.0527372
[17]	valid_0's l2: 0.0532776	valid_1's l2: 0.0520602
[18]	valid_0's l2: 0.0525845	valid_1's l2: 0.0514079
[19]	valid_0's l2: 0.0518828	valid_1's l2: 0.0507482
[20]	v

num_leaves, val_score: 0.018946:  10%|#         | 2/20 [00:14<02:12,  7.34s/it]

[1]	valid_0's l2: 0.0662746	valid_1's l2: 0.064318
[2]	valid_0's l2: 0.0653416	valid_1's l2: 0.0634289
[3]	valid_0's l2: 0.064409	valid_1's l2: 0.0625397
[4]	valid_0's l2: 0.0634874	valid_1's l2: 0.0616626
[5]	valid_0's l2: 0.0626963	valid_1's l2: 0.060914
[6]	valid_0's l2: 0.0618323	valid_1's l2: 0.0600871
[7]	valid_0's l2: 0.0609757	valid_1's l2: 0.0592766
[8]	valid_0's l2: 0.0601303	valid_1's l2: 0.0584698
[9]	valid_0's l2: 0.0593291	valid_1's l2: 0.0577112
[10]	valid_0's l2: 0.0584992	valid_1's l2: 0.0569227
[11]	valid_0's l2: 0.0576912	valid_1's l2: 0.0561586
[12]	valid_0's l2: 0.0568981	valid_1's l2: 0.0554055
[13]	valid_0's l2: 0.0562202	valid_1's l2: 0.0547671
[14]	valid_0's l2: 0.0554719	valid_1's l2: 0.0540547
[15]	valid_0's l2: 0.0547299	valid_1's l2: 0.0533533
[16]	valid_0's l2: 0.054091	valid_1's l2: 0.0527486
[17]	valid_0's l2: 0.053377	valid_1's l2: 0.0520721
[18]	valid_0's l2: 0.0526886	valid_1's l2: 0.0514209
[19]	valid_0's l2: 0.0519929	valid_1's l2: 0.0507619
[20]	va

num_leaves, val_score: 0.018946:  15%|#5        | 3/20 [00:21<02:02,  7.22s/it]

[1]	valid_0's l2: 0.0662695	valid_1's l2: 0.064317
[2]	valid_0's l2: 0.0653311	valid_1's l2: 0.0634268
[3]	valid_0's l2: 0.064394	valid_1's l2: 0.0625376
[4]	valid_0's l2: 0.0634675	valid_1's l2: 0.0616585
[5]	valid_0's l2: 0.0626716	valid_1's l2: 0.0609083
[6]	valid_0's l2: 0.061804	valid_1's l2: 0.0600821
[7]	valid_0's l2: 0.0609425	valid_1's l2: 0.0592708
[8]	valid_0's l2: 0.0600927	valid_1's l2: 0.0584641
[9]	valid_0's l2: 0.0592879	valid_1's l2: 0.0577033
[10]	valid_0's l2: 0.0584533	valid_1's l2: 0.0569146
[11]	valid_0's l2: 0.0576405	valid_1's l2: 0.05615
[12]	valid_0's l2: 0.0568429	valid_1's l2: 0.0553963
[13]	valid_0's l2: 0.0561607	valid_1's l2: 0.0547575
[14]	valid_0's l2: 0.0554084	valid_1's l2: 0.0540452
[15]	valid_0's l2: 0.0546619	valid_1's l2: 0.053343
[16]	valid_0's l2: 0.0540182	valid_1's l2: 0.0527384
[17]	valid_0's l2: 0.0532999	valid_1's l2: 0.0520631
[18]	valid_0's l2: 0.0526078	valid_1's l2: 0.0514114
[19]	valid_0's l2: 0.0519078	valid_1's l2: 0.0507519
[20]	val

num_leaves, val_score: 0.018946:  20%|##        | 4/20 [00:29<02:00,  7.53s/it]

[1]	valid_0's l2: 0.0662584	valid_1's l2: 0.0643156
[2]	valid_0's l2: 0.0653084	valid_1's l2: 0.0634222
[3]	valid_0's l2: 0.0643601	valid_1's l2: 0.062532
[4]	valid_0's l2: 0.0634239	valid_1's l2: 0.0616518
[5]	valid_0's l2: 0.0626166	valid_1's l2: 0.0609016
[6]	valid_0's l2: 0.0617403	valid_1's l2: 0.0600765
[7]	valid_0's l2: 0.0608675	valid_1's l2: 0.0592642
[8]	valid_0's l2: 0.0600075	valid_1's l2: 0.0584559
[9]	valid_0's l2: 0.0591933	valid_1's l2: 0.0576952
[10]	valid_0's l2: 0.0583478	valid_1's l2: 0.0569053
[11]	valid_0's l2: 0.0575248	valid_1's l2: 0.0561415
[12]	valid_0's l2: 0.0567172	valid_1's l2: 0.0553873
[13]	valid_0's l2: 0.0560239	valid_1's l2: 0.0547517
[14]	valid_0's l2: 0.0552625	valid_1's l2: 0.0540387
[15]	valid_0's l2: 0.0545055	valid_1's l2: 0.0533361
[16]	valid_0's l2: 0.0538524	valid_1's l2: 0.0527323
[17]	valid_0's l2: 0.0531241	valid_1's l2: 0.0520552
[18]	valid_0's l2: 0.0524229	valid_1's l2: 0.0514023
[19]	valid_0's l2: 0.0517132	valid_1's l2: 0.0507434
[20

num_leaves, val_score: 0.018946:  25%|##5       | 5/20 [00:41<02:15,  9.05s/it]

[1]	valid_0's l2: 0.0662617	valid_1's l2: 0.064316
[2]	valid_0's l2: 0.065315	valid_1's l2: 0.0634221
[3]	valid_0's l2: 0.0643695	valid_1's l2: 0.0625329
[4]	valid_0's l2: 0.0634359	valid_1's l2: 0.0616519
[5]	valid_0's l2: 0.0626317	valid_1's l2: 0.0609014
[6]	valid_0's l2: 0.0617579	valid_1's l2: 0.0600763
[7]	valid_0's l2: 0.0608889	valid_1's l2: 0.0592633
[8]	valid_0's l2: 0.0600315	valid_1's l2: 0.0584554
[9]	valid_0's l2: 0.0592201	valid_1's l2: 0.057694
[10]	valid_0's l2: 0.0583774	valid_1's l2: 0.0569046
[11]	valid_0's l2: 0.0575573	valid_1's l2: 0.0561415
[12]	valid_0's l2: 0.0567526	valid_1's l2: 0.0553874
[13]	valid_0's l2: 0.0560625	valid_1's l2: 0.0547522
[14]	valid_0's l2: 0.0553038	valid_1's l2: 0.0540389
[15]	valid_0's l2: 0.0545498	valid_1's l2: 0.0533369
[16]	valid_0's l2: 0.0538994	valid_1's l2: 0.0527324
[17]	valid_0's l2: 0.0531739	valid_1's l2: 0.0520561
[18]	valid_0's l2: 0.0524751	valid_1's l2: 0.0514037
[19]	valid_0's l2: 0.0517686	valid_1's l2: 0.0507439
[20]	

num_leaves, val_score: 0.018946:  30%|###       | 6/20 [00:50<02:09,  9.27s/it]

[1]	valid_0's l2: 0.0662729	valid_1's l2: 0.0643178
[2]	valid_0's l2: 0.065338	valid_1's l2: 0.0634284
[3]	valid_0's l2: 0.0644037	valid_1's l2: 0.0625393
[4]	valid_0's l2: 0.0634803	valid_1's l2: 0.0616612
[5]	valid_0's l2: 0.0626874	valid_1's l2: 0.0609122
[6]	valid_0's l2: 0.0618221	valid_1's l2: 0.060086
[7]	valid_0's l2: 0.0609636	valid_1's l2: 0.0592755
[8]	valid_0's l2: 0.0601164	valid_1's l2: 0.0584688
[9]	valid_0's l2: 0.0593138	valid_1's l2: 0.0577089
[10]	valid_0's l2: 0.0584821	valid_1's l2: 0.0569202
[11]	valid_0's l2: 0.0576725	valid_1's l2: 0.056156
[12]	valid_0's l2: 0.0568776	valid_1's l2: 0.0554016
[13]	valid_0's l2: 0.0561981	valid_1's l2: 0.0547636
[14]	valid_0's l2: 0.0554483	valid_1's l2: 0.0540512
[15]	valid_0's l2: 0.0547047	valid_1's l2: 0.0533498
[16]	valid_0's l2: 0.0540642	valid_1's l2: 0.0527453
[17]	valid_0's l2: 0.0533485	valid_1's l2: 0.0520693
[18]	valid_0's l2: 0.0526587	valid_1's l2: 0.0514173
[19]	valid_0's l2: 0.0519615	valid_1's l2: 0.0507587
[20]	

num_leaves, val_score: 0.018946:  35%|###5      | 7/20 [00:58<01:52,  8.69s/it]

[1]	valid_0's l2: 0.0662756	valid_1's l2: 0.0643177
[2]	valid_0's l2: 0.0653441	valid_1's l2: 0.0634287
[3]	valid_0's l2: 0.0644127	valid_1's l2: 0.0625397
[4]	valid_0's l2: 0.0634923	valid_1's l2: 0.0616623
[5]	valid_0's l2: 0.0627025	valid_1's l2: 0.0609138
[6]	valid_0's l2: 0.0618394	valid_1's l2: 0.0600871
[7]	valid_0's l2: 0.0609839	valid_1's l2: 0.0592767
[8]	valid_0's l2: 0.0601396	valid_1's l2: 0.0584705
[9]	valid_0's l2: 0.0593392	valid_1's l2: 0.0577121
[10]	valid_0's l2: 0.0585106	valid_1's l2: 0.0569241
[11]	valid_0's l2: 0.0577039	valid_1's l2: 0.0561609
[12]	valid_0's l2: 0.0569119	valid_1's l2: 0.0554087
[13]	valid_0's l2: 0.0562352	valid_1's l2: 0.0547692
[14]	valid_0's l2: 0.055488	valid_1's l2: 0.0540563
[15]	valid_0's l2: 0.0547471	valid_1's l2: 0.0533549
[16]	valid_0's l2: 0.0541096	valid_1's l2: 0.0527521
[17]	valid_0's l2: 0.0533967	valid_1's l2: 0.0520758
[18]	valid_0's l2: 0.0527095	valid_1's l2: 0.0514247
[19]	valid_0's l2: 0.0520151	valid_1's l2: 0.0507658
[20

num_leaves, val_score: 0.018946:  40%|####      | 8/20 [01:05<01:37,  8.12s/it]

[1]	valid_0's l2: 0.0662715	valid_1's l2: 0.0643167
[2]	valid_0's l2: 0.0653356	valid_1's l2: 0.0634271
[3]	valid_0's l2: 0.0644003	valid_1's l2: 0.0625383
[4]	valid_0's l2: 0.0634757	valid_1's l2: 0.0616602
[5]	valid_0's l2: 0.0626817	valid_1's l2: 0.0609097
[6]	valid_0's l2: 0.0618156	valid_1's l2: 0.060083
[7]	valid_0's l2: 0.0609561	valid_1's l2: 0.0592718
[8]	valid_0's l2: 0.0601081	valid_1's l2: 0.0584668
[9]	valid_0's l2: 0.0593047	valid_1's l2: 0.0577066
[10]	valid_0's l2: 0.0584719	valid_1's l2: 0.0569178
[11]	valid_0's l2: 0.0576613	valid_1's l2: 0.0561532
[12]	valid_0's l2: 0.0568654	valid_1's l2: 0.0553994
[13]	valid_0's l2: 0.0561849	valid_1's l2: 0.0547609
[14]	valid_0's l2: 0.0554344	valid_1's l2: 0.0540487
[15]	valid_0's l2: 0.0546898	valid_1's l2: 0.0533467
[16]	valid_0's l2: 0.0540481	valid_1's l2: 0.0527418
[17]	valid_0's l2: 0.0533314	valid_1's l2: 0.0520655
[18]	valid_0's l2: 0.0526408	valid_1's l2: 0.0514137
[19]	valid_0's l2: 0.0519425	valid_1's l2: 0.050755
[20]

num_leaves, val_score: 0.018946:  45%|####5     | 9/20 [01:12<01:27,  7.96s/it]

[1]	valid_0's l2: 0.066313	valid_1's l2: 0.064348
[2]	valid_0's l2: 0.0654255	valid_1's l2: 0.0634938
[3]	valid_0's l2: 0.0645332	valid_1's l2: 0.0626331
[4]	valid_0's l2: 0.0636591	valid_1's l2: 0.0617845
[5]	valid_0's l2: 0.0628988	valid_1's l2: 0.061049
[6]	valid_0's l2: 0.0620618	valid_1's l2: 0.0602358
[7]	valid_0's l2: 0.0612396	valid_1's l2: 0.0594446
[8]	valid_0's l2: 0.0604282	valid_1's l2: 0.058664
[9]	valid_0's l2: 0.0596503	valid_1's l2: 0.0579157
[10]	valid_0's l2: 0.0588638	valid_1's l2: 0.0571595
[11]	valid_0's l2: 0.0580934	valid_1's l2: 0.0564196
[12]	valid_0's l2: 0.0573394	valid_1's l2: 0.0556845
[13]	valid_0's l2: 0.0566859	valid_1's l2: 0.0550507
[14]	valid_0's l2: 0.0559667	valid_1's l2: 0.0543527
[15]	valid_0's l2: 0.0552561	valid_1's l2: 0.053663
[16]	valid_0's l2: 0.0546387	valid_1's l2: 0.053065
[17]	valid_0's l2: 0.0539534	valid_1's l2: 0.0524066
[18]	valid_0's l2: 0.0532926	valid_1's l2: 0.0517736
[19]	valid_0's l2: 0.052629	valid_1's l2: 0.0511338
[20]	vali

In [None]:
def prepare_datasets(df, fold):
    """Build the LightGBM training and validation datasets for one CV fold.

    Rows whose ``fold`` value equals ``fold`` become the validation set; all
    remaining rows become the training set. Features are taken from the
    module-level ``cols`` list and the target from the ``score`` column.
    Each name in the module-level ``cat_cols`` is declared categorical
    through its ``<name>_encoded`` column.

    Args:
        df: DataFrame containing ``fold``, ``score`` and every column in ``cols``.
        fold: Identifier of the fold to hold out.

    Returns:
        ``(tr_ds, val_ds)``: two ``lgb.Dataset`` objects, the validation one
        created with ``reference=tr_ds``.
    """
    train_part = df.loc[df['fold'] != fold]
    holdout_part = df.loc[df['fold'] == fold]

    # Categorical features are referred to by their label-encoded column names.
    categorical = [f'{name}_encoded' for name in cat_cols]

    tr_ds = lgb.Dataset(
        train_part[cols],
        train_part['score'],
        categorical_feature=categorical,
    )
    val_ds = lgb.Dataset(
        holdout_part[cols],
        holdout_part['score'],
        categorical_feature=categorical,
        reference=tr_ds,
    )
    return tr_ds, val_ds


def train_fn(df, params):
    """Train one LightGBM booster per fold and gather the held-out predictions.

    For every distinct value of ``df['fold']`` a booster is trained on the
    other folds, with both the training and the held-out dataset passed as
    validation sets, early stopping after 100 rounds without improvement and
    evaluation logging every 100 rounds. The held-out rows are then
    predicted at the booster's best iteration, and the booster is saved to
    ``output_dir / 'lgb_fold{fold}.txt'`` truncated at that iteration.

    Args:
        df: DataFrame containing ``fold``, ``score`` and every column in ``cols``.
        params: Parameter dict forwarded to ``lgb.train``.

    Returns:
        DataFrame with the held-out rows of every fold (original index kept,
        concatenated in fold-iteration order) plus a ``pred_lgb`` column.
    """
    oof_parts = []

    for fold in df['fold'].unique():
        tr_ds, val_ds = prepare_datasets(df, fold)

        callbacks = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ]
        booster = lgb.train(params, tr_ds, valid_sets=[tr_ds, val_ds], callbacks=callbacks)
        best_iter = booster.best_iteration

        # Predict only the rows that were held out for this fold.
        holdout = df.loc[df['fold'] == fold].copy()
        holdout['pred_lgb'] = booster.predict(holdout[cols], num_iteration=best_iter)
        oof_parts.append(holdout)

        booster.save_model(output_dir / f'lgb_fold{fold}.txt', num_iteration=best_iter)

    return pd.concat(oof_parts, axis=0, ignore_index=False)

In [None]:
# Train one LightGBM model per fold on `train` with `best_params` and keep the
# frame returned by `train_fn` (held-out rows with a `pred_lgb` column).
pred = train_fn(train, best_params)

In [22]:
# Gain-based feature importance, averaged over the boosters saved for each fold.
importances = []
for fold in train['fold'].unique():
    fold_booster = lgb.Booster(model_file=output_dir / f'lgb_fold{fold}.txt')
    importances.append(fold_booster.feature_importance(importance_type='gain'))

mean_gain_importance = np.mean(importances, axis=0)
importance_df = pd.DataFrame({'feature': cols, 'importance': mean_gain_importance})

# Most important features first.
importance_df.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
10,pred_026,14147.423238
32,pred_022,9508.143789
21,pred_033,8278.9596
43,pred_PatentSBERTa-exp035,2919.82001
44,section_encoded,60.221818
38,len_anc_tgt_div_PatentSBERTa-exp035,50.380996
15,len_anc_tgt_diff_033,19.533977
40,len_anc_cnt_div_PatentSBERTa-exp035,17.892692
16,len_anc_tgt_div_033,16.351428
33,anchor_tok_len_PatentSBERTa-exp035,15.667941


In [23]:
# Score obtained with Optuna's best params: `get_score` compares the true
# `score` column with the `pred_lgb` predictions collected by `train_fn`.
print(get_score(pred['score'], pred['pred_lgb']))
pred.head()

0.848795045785566


Unnamed: 0,id,anchor,target,context,score,anchor_tok_len_026,target_tok_len_026,context_tok_len_026,input_len_026,len_anc_tgt_diff_026,len_anc_tgt_div_026,len_anc_cnt_diff_026,len_anc_cnt_div_026,len_tgt_cnt_diff_026,len_tgt_cnt_div_026,pred_026,anchor_tok_len_033,target_tok_len_033,context_tok_len_033,input_len_033,len_anc_tgt_diff_033,len_anc_tgt_div_033,len_anc_cnt_diff_033,len_anc_cnt_div_033,len_tgt_cnt_diff_033,len_tgt_cnt_div_033,pred_033,anchor_tok_len_022,target_tok_len_022,context_tok_len_022,input_len_022,len_anc_tgt_diff_022,len_anc_tgt_div_022,len_anc_cnt_diff_022,len_anc_cnt_div_022,len_tgt_cnt_diff_022,len_tgt_cnt_div_022,pred_022,anchor_tok_len_PatentSBERTa-exp035,target_tok_len_PatentSBERTa-exp035,context_tok_len_PatentSBERTa-exp035,input_len_PatentSBERTa-exp035,len_anc_tgt_diff_PatentSBERTa-exp035,len_anc_tgt_div_PatentSBERTa-exp035,len_anc_cnt_diff_PatentSBERTa-exp035,len_anc_cnt_div_PatentSBERTa-exp035,len_tgt_cnt_diff_PatentSBERTa-exp035,len_tgt_cnt_div_PatentSBERTa-exp035,pred_PatentSBERTa-exp035,fold,section,section_encoded,pred_lgb
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.440476,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.413867,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.126895,3,5,24,36,-2,0.6,-21,0.125,-19,0.208333,0.414768,0,A,0,0.386777
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.641294,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.531482,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.632196,3,4,24,35,-1,0.75,-21,0.125,-20,0.166667,0.768174,0,A,0,0.628578
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.247038,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.237905,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.237127,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.09902,0,A,0,0.228067
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.494176,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.561054,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.489984,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.425223,0,A,0,0.504004
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.011766,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.028578,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.013014,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.12524,0,A,0,0.069495


In [None]:
import os
import json
from kaggle.api.kaggle_api_extended import KaggleApi

# Owner part of the Kaggle dataset id ("<ID>/<DATASET_ID>").
ID = 'hanejiyuto'
# Dataset slug and title: "<dataset_name>-<dataset_version>", with underscores
# in the version replaced by hyphens.
DATASET_ID = ColabConfig.dataset_name + '-' + ColabConfig.dataset_version.replace('_', '-')
# Directory handed to the kaggle CLI via `-p`; dataset-metadata.json is written into it.
UPLOAD_DIR = ColabConfig.dataset_dir
# Text handed to `kaggle datasets version` via `-m`.
VERSION_NOTES = ColabConfig.dataset_note

def dataset_create_new():
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_new(folder=UPLOAD_DIR, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets create -t -p $UPLOAD_DIR -r tar

def dataset_create_version():  # バージョンアップデート
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    if not os.path.exists('dataset-metadata.json'):
        with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
            json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_version(folder=UPLOAD_DIR, version_notes=VERSION_NOTES, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets version -t -p $UPLOAD_DIR -r tar -m $VERSION_NOTES

# Nothing is uploaded in debug runs; otherwise either create the dataset or
# push a new version of it, depending on ColabConfig.dataset_new.
if not CFG.debug:
    if ColabConfig.dataset_new:
        dataset_create_new()
    else:
        dataset_create_version()

Starting upload for file section_encoder.pkl
100% 323/323 [00:04<00:00, 66.8B/s]
Upload successful: section_encoder.pkl (323B)
Starting upload for file cols.pkl
100% 1.26k/1.26k [00:06<00:00, 208B/s]
Upload successful: cols.pkl (1KB)
Starting upload for file lgb_fold0.txt
100% 496k/496k [00:05<00:00, 85.4kB/s]
Upload successful: lgb_fold0.txt (496KB)
Starting upload for file lgb_fold1.txt
100% 601k/601k [00:04<00:00, 152kB/s] 
Upload successful: lgb_fold1.txt (601KB)
Starting upload for file lgb_fold3.txt
100% 403k/403k [00:04<00:00, 85.8kB/s]
Upload successful: lgb_fold3.txt (403KB)
Starting upload for file lgb_fold2.txt
100% 457k/457k [00:04<00:00, 96.2kB/s]
Upload successful: lgb_fold2.txt (457KB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/hanejiyuto/PPPM-stacking-exp026-033-022-PatentSBERTa-exp035
