In [1]:
class CFG:
    """Experiment-wide settings for the stacking run."""
    # IDs of the experiments whose outputs are stacked.
    exps = ['026', '033', '022']
    # Number of cross-validation folds.
    n_fold = 4
    # When True, the final Kaggle dataset upload is skipped.
    debug = False
    # Seed passed to LightGBM through the `params` dict.
    seed = 42

# ====================================================
# Colab settings
# ====================================================
class ColabConfig:
    """Settings for the Colab session and the Kaggle dataset upload."""
    dataset_name = 'PPPM-stacking'
    # Version tag: 'exp' followed by the experiment IDs joined with '-'.
    dataset_version = f"exp{'-'.join(CFG.exps)}"
    # Whether this is a new dataset (True) or a new version of an existing one.
    dataset_new = True
    # Directory to upload to Kaggle Datasets; filled in once the output directory is known.
    dataset_dir = None
    # Changes since the previous version (passed as the version note).
    dataset_note = '""'
    # Whether to download the data onto the Colab machine.
    in_colab = False

In [2]:
# Report the CUDA compiler, PyTorch and Python versions of this runtime.
!nvcc --version
!python -c 'import torch; print(torch.__version__) '
!python --version
print('')
# Show the GPU status as reported by the NVIDIA driver.
!nvidia-smi
print('')
# NOTE: the value printed below is `virtual_memory().total` (total RAM),
# although the message wording says "available".
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
1.11.0+cu113
Python 3.7.13

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.


Your runtime has 13.6 gigabytes of available RAM



In [3]:
%%time
import sys
COLAB = "google.colab" in sys.modules

if COLAB:
    import os
    print('This environment is Google Colab')

    # mount drive
    from google.colab import drive
    if not os.path.isdir('/content/drive'):
        drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/kaggle/PPPM/exps')

    # kaggle api token and update kaggle api
    from google.colab import files
    if not os.path.isfile('~/.kaggle/kaggle.json'):
        # files.upload()
        !mkdir -p ~/.kaggle
        !cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
        !pip install --upgrade --force-reinstall --no-deps -q kaggle

    if ColabConfig.in_colab:
        # make directory in colab
        !mkdir -p /content/input
        !mkdir -p /content/working

        # download dataset in colab
        import zipfile, glob
        os.chdir('/content/input')
        # !kaggle competitions download -qc birdclef-2022 -p birdclef-2022
        
        # !mkdir birdclef-2022
        # !cp /content/drive/MyDrive/kaggle/BirdCLEF2022/input/birdclef-2022/* ./birdclef-2022/

        for p in glob.glob('**/*.zip', recursive=True):
            print(p)
            d, f = os.path.split(p)
            # if f in ['']:
            #     continue
            with zipfile.ZipFile(p, 'r') as zipf:
                print('unzip: ', zipf)
                zipf.extractall(d)
                print('remove: ', f)
                os.remove(p)
        
        os.chdir('/content/working')

This environment is Google Colab
Mounted at /content/drive
[K     |████████████████████████████████| 58 kB 5.1 MB/s 
[?25h  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
CPU times: user 1.13 s, sys: 263 ms, total: 1.39 s
Wall time: 22.4 s


In [4]:
import os
import random
import pickle
from pathlib import Path
from psutil import cpu_count
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
# Display DataFrames with all columns and without truncating cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 10000)
pd.set_option('display.max_colwidth', None)
from tqdm.auto import tqdm
tqdm.pandas()
# NOTE(review): the exit status of each `os.system` install in this cell is
# ignored, so a failed install only surfaces at the import that follows it.
os.system('python -m pip install -U lightgbm')
import lightgbm as lgb
print(f"lightgbm.__version__: {lgb.__version__}")
os.system('python -m pip install optuna')
import optuna.integration.lightgbm as lgb_optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
print(f"torch.__version__: {torch.__version__}")

os.system('python -m pip install sentencepiece')
# Replace the preinstalled transformers/tokenizers with the builds found in the
# local wheel directory (`--no-index` keeps pip from contacting the package index).
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

lightgbm.__version__: 3.3.2
torch.__version__: 1.11.0+cu113
tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [5]:
# Competition data directory and the directory that collects this run's
# stacking artifacts (encoders, feature list, models).
input_dir = Path('../input/us-patent-phrase-to-phrase-matching')
output_dir = Path(f"/content/drive/MyDrive/kaggle/PPPM/output/stacking{'_'.join(CFG.exps)}")
# parents=True: also create missing parent directories instead of raising
# FileNotFoundError when the output tree does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

# The upload cell publishes this directory as a Kaggle dataset.
ColabConfig.dataset_dir = str(output_dir)

In [6]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like of the same length).

    Returns:
        The correlation coefficient, i.e. element 0 of ``pearsonr``'s result.
    """
    # Import the function explicitly: `import scipy as sp` alone does not
    # guarantee that the `sp.stats` submodule has been loaded, so
    # `sp.stats.pearsonr` only worked when another import pulled it in first.
    from scipy.stats import pearsonr
    return pearsonr(y_true, y_pred)[0]


def get_logger(filename=output_dir / 'train'):
    """Return this module's logger, writing bare messages to the console and a file.

    Handlers attached by an earlier call are removed and closed first, so
    calling this again (for example when the cell is re-run) neither duplicates
    every log line nor leaves the previous log file open.

    Args:
        filename: Path of the log file without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` named after ``__name__``, set to INFO level.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call; start from a clean slate.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(42)

In [7]:
# Feature engineering
def tok_len(sentence, tokenizer):
    """Return the number of tokens ``tokenizer.tokenize`` yields for ``sentence``."""
    return len(tokenizer.tokenize(sentence))


def tokenizer_feature_engineering(df, tokenizer):
    """Build token-length features for the anchor, target and context text.

    The features are assembled in a new frame, so ``df`` is left untouched and
    calling this function repeatedly on the same frame gives the same result
    (previously the columns were added to ``df`` in place, and a second call
    returned only ``id``).

    Args:
        df: Frame with the columns ``id``, ``anchor``, ``target`` and
            ``context_text``.
        tokenizer: Object exposing ``tokenize(str)`` returning a sized sequence.

    Returns:
        A DataFrame indexed like ``df`` holding the created features followed
        by the ``id`` column.
    """
    def _tok_len(sentence):
        return tok_len(sentence, tokenizer)

    anchor_len = df['anchor'].map(_tok_len)
    target_len = df['target'].map(_tok_len)
    context_len = df['context_text'].map(_tok_len)

    # NOTE(review): the ratio features divide by a token count; a text that
    # tokenizes to zero tokens would presumably produce inf/NaN here.
    features = pd.DataFrame(
        {
            'anchor_tok_len': anchor_len,
            'target_tok_len': target_len,
            'context_tok_len': context_len,
            # +4 on top of the three lengths, as in the original formula
            # (presumably the special tokens of the model input; confirm).
            'input_len': anchor_len + target_len + context_len + 4,
            'len_anc_tgt_diff': anchor_len - target_len,
            'len_anc_tgt_div': anchor_len / target_len,
            'len_anc_cnt_diff': anchor_len - context_len,
            'len_anc_cnt_div': anchor_len / context_len,
            'len_tgt_cnt_diff': target_len - context_len,
            'len_tgt_cnt_div': target_len / context_len,
            # Return only the created features plus the id column.
            'id': df['id'],
        },
        index=df.index,
    )
    return features

In [8]:
def make_ensemble_datasets(cfg):
    """Join every experiment's token features and predictions onto the train set.

    For each ID in ``cfg.exps`` the experiment's ``oof_df.pkl`` and tokenizer
    are loaded from its output directory, token-length features are computed,
    and both the features (suffixed ``_<exp>``) and the prediction column
    (``pred_<exp>``) are left-merged onto the competition ``train.csv`` by
    ``id``. The max and mean of the per-experiment scores are printed.

    Returns:
        The merged training DataFrame.
    """
    stacked = pd.read_csv(input_dir / 'train.csv')
    cv_scores = []

    for exp in cfg.exps:
        exp_dir = Path(f'/content/drive/MyDrive/kaggle/PPPM/output/{exp}')
        # NOTE(review): unpickling runs arbitrary code; only load files this
        # project produced itself.
        oof_df = pd.read_pickle(exp_dir / 'oof_df.pkl')
        tokenizer = AutoTokenizer.from_pretrained(exp_dir / 'tokenizer')

        # Features and prediction scores get per-experiment names such as 'pred_000'.
        pred_col = f'pred_{exp}'
        feature_df = tokenizer_feature_engineering(oof_df, tokenizer)
        feature_df = feature_df.rename(
            columns={name: name + '_' + exp for name in feature_df.columns if name != 'id'}
        )
        pred_df = oof_df[['id', 'pred']].rename(columns={'pred': pred_col})

        stacked = stacked.merge(feature_df, on='id', how='left')
        stacked = stacked.merge(pred_df, on='id', how='left')

        cv_scores.append(get_score(stacked['score'], stacked[pred_col]))

    print(f'max score: {max(cv_scores)}')
    print(f'avg score: {sum(cv_scores) / len(cv_scores)}')
    return stacked

In [9]:
# Build the stacking table: the competition train rows joined with each
# experiment's token features and prediction column.
train = make_ensemble_datasets(CFG)
display(train.head())

max score: 0.832151085616115
avg score: 0.8283517295638948


Unnamed: 0,id,anchor,target,context,score,anchor_tok_len_026,target_tok_len_026,context_tok_len_026,input_len_026,len_anc_tgt_diff_026,len_anc_tgt_div_026,len_anc_cnt_diff_026,len_anc_cnt_div_026,len_tgt_cnt_diff_026,len_tgt_cnt_div_026,pred_026,anchor_tok_len_033,target_tok_len_033,context_tok_len_033,input_len_033,len_anc_tgt_diff_033,len_anc_tgt_div_033,len_anc_cnt_diff_033,len_anc_cnt_div_033,len_tgt_cnt_diff_033,len_tgt_cnt_div_033,pred_033,anchor_tok_len_022,target_tok_len_022,context_tok_len_022,input_len_022,len_anc_tgt_diff_022,len_anc_tgt_div_022,len_anc_cnt_diff_022,len_anc_cnt_div_022,len_tgt_cnt_diff_022,len_tgt_cnt_div_022,pred_022
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.440476,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.413867,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.126895
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.641294,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.531482,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.632196
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.247038,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.237905,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.237127
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.494176,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.561054,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.489984
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.011766,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.028578,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.013014


In [10]:
# foldに分ける
# credits to: https://www.kaggle.com/code/abhishek/creating-folds-properly-hopefully-p

!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train = train.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
print(train.fold.value_counts())

550 183
549 184
550 183
550 183
3    9622
0    9379
1    8860
2    8612
Name: fold, dtype: int64


In [11]:
# Encode categorical variables
# `section` is the first character of the context code.
train['section'] = train['context'].map(lambda code: code[0])
cat_cols = ['section']
for c in cat_cols:
    le = LabelEncoder()
    train[f'{c}_encoded'] = le.fit_transform(train[c])
    # Persist the fitted encoder next to the other run artifacts.
    with (output_dir / f'{c}_encoder.pkl').open('wb') as f:
        pickle.dump(le, f)

In [12]:
# Inspect the full column list before selecting the model features.
print(train.columns)

Index(['id', 'anchor', 'target', 'context', 'score', 'anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_033', 'target_tok_len_033', 'context_tok_len_033', 'input_len_033', 'len_anc_tgt_diff_033', 'len_anc_tgt_div_033', 'len_anc_cnt_diff_033', 'len_anc_cnt_div_033', 'len_tgt_cnt_diff_033', 'len_tgt_cnt_div_033', 'pred_033', 'anchor_tok_len_022', 'target_tok_len_022', 'context_tok_len_022', 'input_len_022', 'len_anc_tgt_diff_022', 'len_anc_tgt_div_022', 'len_anc_cnt_diff_022', 'len_anc_cnt_div_022', 'len_tgt_cnt_diff_022', 'len_tgt_cnt_div_022', 'pred_022', 'fold', 'section', 'section_encoded'], dtype='object')


In [13]:
# Every column not listed here is used as a model feature.
excluded_columns = ['id', 'anchor', 'target', 'context', 'score', 'fold', 'section']
cols = list(filter(lambda name: name not in excluded_columns, train.columns))
# Save the feature list alongside the other run artifacts.
with (output_dir / 'cols.pkl').open('wb') as f:
    pickle.dump(cols, f)
print(cols)

['anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_033', 'target_tok_len_033', 'context_tok_len_033', 'input_len_033', 'len_anc_tgt_diff_033', 'len_anc_tgt_div_033', 'len_anc_cnt_diff_033', 'len_anc_cnt_div_033', 'len_tgt_cnt_diff_033', 'len_tgt_cnt_div_033', 'pred_033', 'anchor_tok_len_022', 'target_tok_len_022', 'context_tok_len_022', 'input_len_022', 'len_anc_tgt_diff_022', 'len_anc_tgt_div_022', 'len_anc_cnt_diff_022', 'len_anc_cnt_div_022', 'len_tgt_cnt_diff_022', 'len_tgt_cnt_div_022', 'pred_022', 'section_encoded']


In [14]:
# Base LightGBM parameters handed to the tuner. Commented-out keys are
# alternatives that were tried; the trailing numbers are presumably the
# library defaults.
params = dict(
    objective='regression',
    boosting='gbdt',
    num_iterations=1000,
    learning_rate=0.01,
    # num_leaves=20,  # 31
    num_threads=cpu_count(logical=False),
    seed=CFG.seed,

    # max_depth=3,  # -1
    # min_data_in_leaf=100,  # 20
    verbosity=-1,  # 1

    metric='l2',
)

In [15]:
def tuning_params(df, params, val_prop=0.25, seed=71):
    """Tune LightGBM parameters with Optuna's LightGBM integration on one hold-out split.

    The split is grouped by anchor: the unique anchors are shuffled with a
    seeded generator and the first ``val_prop`` share of them forms the
    validation set, so no anchor appears on both sides. The feature and
    categorical column lists are read from the module-level ``cols`` and
    ``cat_cols``.

    Args:
        df: Training frame containing ``anchor``, ``score`` and the feature columns.
        params: Base LightGBM parameters passed to the tuner.
        val_prop: Share of anchors used for validation. The default 0.25
            mirrors the 4-fold setup.
        seed: Seed of the generator that shuffles the anchors.

    Returns:
        ``params`` of the booster returned by ``lgb_optuna.train``.

    Raises:
        ValueError: If ``val_prop`` is not strictly between 0 and 1.
    """
    if not 0 < val_prop < 1:
        raise ValueError(f'val_prop must be in (0, 1), got {val_prop}')

    anchors = df['anchor'].unique()
    rng = np.random.default_rng(seed=seed)
    rng.shuffle(anchors)
    val_size = int(len(anchors) * val_prop)
    is_val = np.isin(df['anchor'], anchors[:val_size])

    tr_df = df[~is_val]
    val_df = df[is_val]
    print(f'train: {len(tr_df)}, val: {len(val_df)}')

    cat_cols_ = [f'{c}_encoded' for c in cat_cols]
    tr_ds = lgb.Dataset(tr_df[cols], tr_df['score'], categorical_feature=cat_cols_)
    val_ds = lgb.Dataset(val_df[cols], val_df['score'], categorical_feature=cat_cols_, reference=tr_ds)

    model = lgb_optuna.train(
        params,
        tr_ds,
        valid_sets=[tr_ds, val_ds],
        callbacks=[lgb.early_stopping(stopping_rounds=100),
                   lgb.log_evaluation(period=100)],
        verbosity=-1,
    )

    return model.params

In [None]:
# Run the Optuna-backed LightGBM search and show the parameters of the
# returned booster.
best_params = tuning_params(train, params)
display(best_params)

train: 27261, val: 9212


[32m[I 2022-06-20 09:59:22,726][0m A new study created in memory with name: no-name-3866a11a-43bf-4933-82fd-9eb2a2c73af0[0m
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0662747	valid_1's l2: 0.0643164
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.0653287	valid_1's l2: 0.0634089
[3]	valid_0's l2: 0.0644016	valid_1's l2: 0.0625168
[4]	valid_0's l2: 0.0634929	valid_1's l2: 0.061646
[5]	valid_0's l2: 0.062602	valid_1's l2: 0.0607887
[6]	valid_0's l2: 0.06173	valid_1's l2: 0.0599506
[7]	valid_0's l2: 0.0608753	valid_1's l2: 0.0591298
[8]	valid_0's l2: 0.060036	valid_1's l2: 0.058326
[9]	valid_0's l2: 0.0592148	valid_1's l2: 0.0575339
[10]	valid_0's l2: 0.0584117	valid_1's l2: 0.0567639
[11]	valid_0's l2: 0.0576226	valid_1's l2: 0.0560046
[12]	valid_0's l2: 0.0568471	valid_1's l2: 0.0552605
[13]	valid_0's l2: 0.0560897	valid_1's l2: 0.054531
[14]	valid_0's l2: 0.0553458	valid_1's l2: 0.0538144
[15]	valid_0's l2: 0.0546154	valid_1's l2: 0.0531154
[16]	valid_0's l2: 0.0539006	valid_1's l2: 0.0524279
[17]	valid_0's l2: 0.0531987	valid_1's l2: 0.0517561
[18]	valid_0's l2: 0.052512	valid_1's l2: 0.0510965
[

feature_fraction, val_score: 0.019123:  14%|#4        | 1/7 [00:04<00:26,  4.37s/it]

[463]	valid_0's l2: 0.0168611	valid_1's l2: 0.0191418
[464]	valid_0's l2: 0.0168585	valid_1's l2: 0.0191414
[465]	valid_0's l2: 0.0168557	valid_1's l2: 0.0191418
[466]	valid_0's l2: 0.0168534	valid_1's l2: 0.0191414
[467]	valid_0's l2: 0.0168502	valid_1's l2: 0.0191421
[468]	valid_0's l2: 0.0168481	valid_1's l2: 0.0191434
[469]	valid_0's l2: 0.0168457	valid_1's l2: 0.0191436
[470]	valid_0's l2: 0.0168427	valid_1's l2: 0.0191444
[471]	valid_0's l2: 0.0168394	valid_1's l2: 0.0191448
[472]	valid_0's l2: 0.0168374	valid_1's l2: 0.019145
[473]	valid_0's l2: 0.0168347	valid_1's l2: 0.0191449
[474]	valid_0's l2: 0.0168316	valid_1's l2: 0.0191453
[475]	valid_0's l2: 0.0168289	valid_1's l2: 0.0191455
Early stopping, best iteration is:
[375]	valid_0's l2: 0.0171308	valid_1's l2: 0.0191235
[1]	valid_0's l2: 0.0663215	valid_1's l2: 0.0643599
[2]	valid_0's l2: 0.0654216	valid_1's l2: 0.063495
[3]	valid_0's l2: 0.0645182	valid_1's l2: 0.0626207
[4]	valid_0's l2: 0.0644121	valid_1's l2: 0.0625413
[5]

feature_fraction, val_score: 0.019123:  29%|##8       | 2/7 [00:10<00:26,  5.32s/it]

[1]	valid_0's l2: 0.06628	valid_1's l2: 0.0643201
[2]	valid_0's l2: 0.0653797	valid_1's l2: 0.0634554
[3]	valid_0's l2: 0.0644778	valid_1's l2: 0.0625819
[4]	valid_0's l2: 0.0636118	valid_1's l2: 0.0617501
[5]	valid_0's l2: 0.062723	valid_1's l2: 0.0608976
[6]	valid_0's l2: 0.0618465	valid_1's l2: 0.0600564
[7]	valid_0's l2: 0.0610104	valid_1's l2: 0.0592461
[8]	valid_0's l2: 0.0601719	valid_1's l2: 0.0584373
[9]	valid_0's l2: 0.0593458	valid_1's l2: 0.0576425
[10]	valid_0's l2: 0.0585588	valid_1's l2: 0.0568819
[11]	valid_0's l2: 0.0577646	valid_1's l2: 0.05612
[12]	valid_0's l2: 0.0569969	valid_1's l2: 0.0553803
[13]	valid_0's l2: 0.0562699	valid_1's l2: 0.0546845
[14]	valid_0's l2: 0.0555573	valid_1's l2: 0.0540014
[15]	valid_0's l2: 0.0548317	valid_1's l2: 0.053303
[16]	valid_0's l2: 0.0541143	valid_1's l2: 0.0526166
[17]	valid_0's l2: 0.0534273	valid_1's l2: 0.0519544
[18]	valid_0's l2: 0.0527342	valid_1's l2: 0.0512897
[19]	valid_0's l2: 0.0520639	valid_1's l2: 0.0506448
[20]	val

feature_fraction, val_score: 0.019070:  43%|####2     | 3/7 [00:18<00:26,  6.64s/it]

[1]	valid_0's l2: 0.0662747	valid_1's l2: 0.0643164
[2]	valid_0's l2: 0.0653287	valid_1's l2: 0.0634089
[3]	valid_0's l2: 0.064428	valid_1's l2: 0.062537
[4]	valid_0's l2: 0.0635231	valid_1's l2: 0.0616687
[5]	valid_0's l2: 0.0626329	valid_1's l2: 0.0608115
[6]	valid_0's l2: 0.0617588	valid_1's l2: 0.0599712
[7]	valid_0's l2: 0.0609033	valid_1's l2: 0.0591496
[8]	valid_0's l2: 0.0600647	valid_1's l2: 0.0583442
[9]	valid_0's l2: 0.0592412	valid_1's l2: 0.0575549
[10]	valid_0's l2: 0.0584378	valid_1's l2: 0.0567836
[11]	valid_0's l2: 0.0576477	valid_1's l2: 0.0560256
[12]	valid_0's l2: 0.0568736	valid_1's l2: 0.0552796
[13]	valid_0's l2: 0.0561485	valid_1's l2: 0.0545853
[14]	valid_0's l2: 0.0554021	valid_1's l2: 0.0538675
[15]	valid_0's l2: 0.0546802	valid_1's l2: 0.053173
[16]	valid_0's l2: 0.0539638	valid_1's l2: 0.0524836
[17]	valid_0's l2: 0.0532604	valid_1's l2: 0.051811
[18]	valid_0's l2: 0.0525721	valid_1's l2: 0.0511492
[19]	valid_0's l2: 0.051896	valid_1's l2: 0.0505025
[20]	va

feature_fraction, val_score: 0.019070:  57%|#####7    | 4/7 [00:29<00:25,  8.38s/it]

[1]	valid_0's l2: 0.0662799	valid_1's l2: 0.0643203
[2]	valid_0's l2: 0.0653796	valid_1's l2: 0.063456
[3]	valid_0's l2: 0.0644776	valid_1's l2: 0.0625819
[4]	valid_0's l2: 0.0636114	valid_1's l2: 0.0617505
[5]	valid_0's l2: 0.0627227	valid_1's l2: 0.0608982
[6]	valid_0's l2: 0.0618462	valid_1's l2: 0.060057
[7]	valid_0's l2: 0.0609989	valid_1's l2: 0.0592392
[8]	valid_0's l2: 0.0601604	valid_1's l2: 0.0584304
[9]	valid_0's l2: 0.0593345	valid_1's l2: 0.0576358
[10]	valid_0's l2: 0.0585293	valid_1's l2: 0.0568634
[11]	valid_0's l2: 0.0577369	valid_1's l2: 0.0561009
[12]	valid_0's l2: 0.0569587	valid_1's l2: 0.0553551
[13]	valid_0's l2: 0.0562332	valid_1's l2: 0.0546592
[14]	valid_0's l2: 0.0554869	valid_1's l2: 0.0539426
[15]	valid_0's l2: 0.0547625	valid_1's l2: 0.0532453
[16]	valid_0's l2: 0.0540431	valid_1's l2: 0.0525546
[17]	valid_0's l2: 0.0533571	valid_1's l2: 0.0518933
[18]	valid_0's l2: 0.0526662	valid_1's l2: 0.0512313
[19]	valid_0's l2: 0.0519972	valid_1's l2: 0.0505885
[20]

feature_fraction, val_score: 0.019070:  71%|#######1  | 5/7 [00:33<00:13,  6.86s/it]

[523]	valid_0's l2: 0.0168039	valid_1's l2: 0.019116
[524]	valid_0's l2: 0.0168022	valid_1's l2: 0.0191159
[525]	valid_0's l2: 0.0167998	valid_1's l2: 0.0191158
Early stopping, best iteration is:
[425]	valid_0's l2: 0.0171293	valid_1's l2: 0.0190696
[1]	valid_0's l2: 0.0662799	valid_1's l2: 0.0643203
[2]	valid_0's l2: 0.0653795	valid_1's l2: 0.063456
[3]	valid_0's l2: 0.0644775	valid_1's l2: 0.0625817
[4]	valid_0's l2: 0.0636113	valid_1's l2: 0.0617503
[5]	valid_0's l2: 0.0627173	valid_1's l2: 0.0608921
[6]	valid_0's l2: 0.061841	valid_1's l2: 0.0600506
[7]	valid_0's l2: 0.0609823	valid_1's l2: 0.059224
[8]	valid_0's l2: 0.0601418	valid_1's l2: 0.0584169
[9]	valid_0's l2: 0.0593164	valid_1's l2: 0.0576238
[10]	valid_0's l2: 0.0585114	valid_1's l2: 0.0568516
[11]	valid_0's l2: 0.0577196	valid_1's l2: 0.0560896
[12]	valid_0's l2: 0.0569435	valid_1's l2: 0.0553461
[13]	valid_0's l2: 0.0562187	valid_1's l2: 0.0546523
[14]	valid_0's l2: 0.0554728	valid_1's l2: 0.0539355
[15]	valid_0's l2: 0

feature_fraction, val_score: 0.019070:  86%|########5 | 6/7 [00:38<00:06,  6.00s/it]

[523]	valid_0's l2: 0.0167468	valid_1's l2: 0.0191332
[524]	valid_0's l2: 0.0167449	valid_1's l2: 0.0191332
[525]	valid_0's l2: 0.0167422	valid_1's l2: 0.0191338
Early stopping, best iteration is:
[425]	valid_0's l2: 0.0171293	valid_1's l2: 0.0190696
[1]	valid_0's l2: 0.0662747	valid_1's l2: 0.0643164
[2]	valid_0's l2: 0.0653287	valid_1's l2: 0.0634089
[3]	valid_0's l2: 0.0644014	valid_1's l2: 0.0625168
[4]	valid_0's l2: 0.0634927	valid_1's l2: 0.0616461
[5]	valid_0's l2: 0.0626018	valid_1's l2: 0.0607887
[6]	valid_0's l2: 0.0617297	valid_1's l2: 0.0599507
[7]	valid_0's l2: 0.0608751	valid_1's l2: 0.0591299
[8]	valid_0's l2: 0.0600358	valid_1's l2: 0.058326
[9]	valid_0's l2: 0.0592146	valid_1's l2: 0.0575339
[10]	valid_0's l2: 0.0584081	valid_1's l2: 0.0567617
[11]	valid_0's l2: 0.0576188	valid_1's l2: 0.0560004
[12]	valid_0's l2: 0.0568435	valid_1's l2: 0.0552576
[13]	valid_0's l2: 0.0560849	valid_1's l2: 0.054527
[14]	valid_0's l2: 0.0553408	valid_1's l2: 0.0538124
[15]	valid_0's l2:

feature_fraction, val_score: 0.019070: 100%|##########| 7/7 [00:42<00:00,  6.09s/it]
num_leaves, val_score: 0.019070:   0%|          | 0/20 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0662486	valid_1's l2: 0.0643086
[2]	valid_0's l2: 0.0653234	valid_1's l2: 0.063438
[3]	valid_0's l2: 0.0643997	valid_1's l2: 0.0625642
[4]	valid_0's l2: 0.0635105	valid_1's l2: 0.0617292
[5]	valid_0's l2: 0.0625938	valid_1's l2: 0.060869
[6]	valid_0's l2: 0.0616872	valid_1's l2: 0.0600149
[7]	valid_0's l2: 0.0608294	valid_1's l2: 0.0592066
[8]	valid_0's l2: 0.0599613	valid_1's l2: 0.0583923
[9]	valid_0's l2: 0.0591056	valid_1's l2: 0.05759
[10]	valid_0's l2: 0.0582983	valid_1's l2: 0.0568286
[11]	valid_0's l2: 0.0574746	valid_1's l2: 0.0560584
[12]	valid_0's l2: 0.0566829	valid_1's l2: 0.0553166
[13]	valid_0's l2: 0.0559348	valid_1's l2: 0.0546178
[14]	valid_0's l2: 0.0552005	valid_1's l2: 0.0539368
[15]	valid_0's l2: 0.0544501	valid_1's l2: 0.0532356
[16]	valid_0's l2: 0.0537077	valid_1's l2: 0.0525455
[17]	valid_0's l2: 0.0530018	valid_1's l2: 0.0518833
[18]	valid_0's l2: 0.052281	valid_1's l2: 0.0512118
[19]	valid_0's l2: 0.0515889	valid_1's l2: 0.0505664
[20]	va

num_leaves, val_score: 0.019070:   5%|5         | 1/20 [00:08<02:49,  8.94s/it]

[1]	valid_0's l2: 0.0662808	valid_1's l2: 0.0643206
[2]	valid_0's l2: 0.0653812	valid_1's l2: 0.063455
[3]	valid_0's l2: 0.0644797	valid_1's l2: 0.0625813
[4]	valid_0's l2: 0.0636142	valid_1's l2: 0.0617495
[5]	valid_0's l2: 0.0627262	valid_1's l2: 0.0608974
[6]	valid_0's l2: 0.0618506	valid_1's l2: 0.0600575
[7]	valid_0's l2: 0.0610151	valid_1's l2: 0.0592474
[8]	valid_0's l2: 0.0601773	valid_1's l2: 0.0584389
[9]	valid_0's l2: 0.0593522	valid_1's l2: 0.0576451
[10]	valid_0's l2: 0.0585656	valid_1's l2: 0.0568849
[11]	valid_0's l2: 0.0577725	valid_1's l2: 0.056123
[12]	valid_0's l2: 0.0570053	valid_1's l2: 0.0553837
[13]	valid_0's l2: 0.0562786	valid_1's l2: 0.0546864
[14]	valid_0's l2: 0.0555664	valid_1's l2: 0.0540045
[15]	valid_0's l2: 0.0548414	valid_1's l2: 0.0533069
[16]	valid_0's l2: 0.0541246	valid_1's l2: 0.0526212
[17]	valid_0's l2: 0.0534381	valid_1's l2: 0.0519588
[18]	valid_0's l2: 0.0527459	valid_1's l2: 0.0512949
[19]	valid_0's l2: 0.0520762	valid_1's l2: 0.0506506
[20]

num_leaves, val_score: 0.019067:  10%|#         | 2/20 [00:12<01:46,  5.89s/it]

[504]	valid_0's l2: 0.0170122	valid_1's l2: 0.0190893
[505]	valid_0's l2: 0.0170087	valid_1's l2: 0.0190889
[506]	valid_0's l2: 0.0170065	valid_1's l2: 0.019089
[507]	valid_0's l2: 0.0170033	valid_1's l2: 0.01909
[508]	valid_0's l2: 0.0170004	valid_1's l2: 0.0190908
[509]	valid_0's l2: 0.0169978	valid_1's l2: 0.0190908
[510]	valid_0's l2: 0.0169958	valid_1's l2: 0.0190918
[511]	valid_0's l2: 0.0169936	valid_1's l2: 0.0190917
[512]	valid_0's l2: 0.0169908	valid_1's l2: 0.0190918
Early stopping, best iteration is:
[412]	valid_0's l2: 0.0172497	valid_1's l2: 0.019067
[1]	valid_0's l2: 0.0662684	valid_1's l2: 0.0643119
[2]	valid_0's l2: 0.0653594	valid_1's l2: 0.063442
[3]	valid_0's l2: 0.0644496	valid_1's l2: 0.0625681
[4]	valid_0's l2: 0.0635758	valid_1's l2: 0.0617356
[5]	valid_0's l2: 0.0626762	valid_1's l2: 0.060875
[6]	valid_0's l2: 0.0617874	valid_1's l2: 0.0600251
[7]	valid_0's l2: 0.0609438	valid_1's l2: 0.0592163
[8]	valid_0's l2: 0.0600941	valid_1's l2: 0.0584043
[9]	valid_0's l

num_leaves, val_score: 0.019067:  15%|#5        | 3/20 [00:17<01:35,  5.61s/it]

[1]	valid_0's l2: 0.0662516	valid_1's l2: 0.0643083
[2]	valid_0's l2: 0.0653285	valid_1's l2: 0.0634382
[3]	valid_0's l2: 0.0644068	valid_1's l2: 0.0625649
[4]	valid_0's l2: 0.0635197	valid_1's l2: 0.0617296
[5]	valid_0's l2: 0.0626054	valid_1's l2: 0.0608699
[6]	valid_0's l2: 0.0617017	valid_1's l2: 0.0600165
[7]	valid_0's l2: 0.0608458	valid_1's l2: 0.0592079
[8]	valid_0's l2: 0.05998	valid_1's l2: 0.0583955
[9]	valid_0's l2: 0.0591271	valid_1's l2: 0.0575934
[10]	valid_0's l2: 0.0583222	valid_1's l2: 0.0568327
[11]	valid_0's l2: 0.0575016	valid_1's l2: 0.0560618
[12]	valid_0's l2: 0.0567122	valid_1's l2: 0.0553202
[13]	valid_0's l2: 0.0559662	valid_1's l2: 0.0546211
[14]	valid_0's l2: 0.0552341	valid_1's l2: 0.0539385
[15]	valid_0's l2: 0.0544863	valid_1's l2: 0.053237
[16]	valid_0's l2: 0.053746	valid_1's l2: 0.0525474
[17]	valid_0's l2: 0.0530417	valid_1's l2: 0.0518852
[18]	valid_0's l2: 0.0523236	valid_1's l2: 0.0512129
[19]	valid_0's l2: 0.0516337	valid_1's l2: 0.0505657
[20]	v

num_leaves, val_score: 0.019067:  20%|##        | 4/20 [00:26<01:45,  6.61s/it]

[1]	valid_0's l2: 0.0662479	valid_1's l2: 0.0643089
[2]	valid_0's l2: 0.0653222	valid_1's l2: 0.0634383
[3]	valid_0's l2: 0.0643979	valid_1's l2: 0.0625645
[4]	valid_0's l2: 0.0635083	valid_1's l2: 0.0617296
[5]	valid_0's l2: 0.062591	valid_1's l2: 0.0608694
[6]	valid_0's l2: 0.0616839	valid_1's l2: 0.0600161
[7]	valid_0's l2: 0.0608257	valid_1's l2: 0.0592077
[8]	valid_0's l2: 0.0599571	valid_1's l2: 0.0583939
[9]	valid_0's l2: 0.0591008	valid_1's l2: 0.0575907
[10]	valid_0's l2: 0.0582931	valid_1's l2: 0.0568291
[11]	valid_0's l2: 0.0574689	valid_1's l2: 0.0560588
[12]	valid_0's l2: 0.0566767	valid_1's l2: 0.0553176
[13]	valid_0's l2: 0.0559281	valid_1's l2: 0.0546186
[14]	valid_0's l2: 0.0551933	valid_1's l2: 0.0539373
[15]	valid_0's l2: 0.0544423	valid_1's l2: 0.0532359
[16]	valid_0's l2: 0.0536995	valid_1's l2: 0.0525457
[17]	valid_0's l2: 0.0529931	valid_1's l2: 0.0518834
[18]	valid_0's l2: 0.0522717	valid_1's l2: 0.0512116
[19]	valid_0's l2: 0.0515788	valid_1's l2: 0.0505668
[20

num_leaves, val_score: 0.019067:  25%|##5       | 5/20 [00:35<01:51,  7.44s/it]

[1]	valid_0's l2: 0.0662718	valid_1's l2: 0.0643146
[2]	valid_0's l2: 0.0653654	valid_1's l2: 0.0634468
[3]	valid_0's l2: 0.0644581	valid_1's l2: 0.0625723
[4]	valid_0's l2: 0.0635868	valid_1's l2: 0.0617398
[5]	valid_0's l2: 0.0626901	valid_1's l2: 0.0608806
[6]	valid_0's l2: 0.0618047	valid_1's l2: 0.0600316
[7]	valid_0's l2: 0.0609636	valid_1's l2: 0.0592224
[8]	valid_0's l2: 0.0601173	valid_1's l2: 0.0584111
[9]	valid_0's l2: 0.059283	valid_1's l2: 0.0576128
[10]	valid_0's l2: 0.0584921	valid_1's l2: 0.0568507
[11]	valid_0's l2: 0.0576899	valid_1's l2: 0.0560824
[12]	valid_0's l2: 0.0569163	valid_1's l2: 0.05534
[13]	valid_0's l2: 0.0561849	valid_1's l2: 0.0546435
[14]	valid_0's l2: 0.0554679	valid_1's l2: 0.0539609
[15]	valid_0's l2: 0.0547365	valid_1's l2: 0.0532601
[16]	valid_0's l2: 0.0540132	valid_1's l2: 0.05257
[17]	valid_0's l2: 0.0533219	valid_1's l2: 0.0519079
[18]	valid_0's l2: 0.0526208	valid_1's l2: 0.051238
[19]	valid_0's l2: 0.0519454	valid_1's l2: 0.0505915
[20]	val

num_leaves, val_score: 0.019067:  30%|###       | 6/20 [00:39<01:31,  6.57s/it]

[1]	valid_0's l2: 0.0662497	valid_1's l2: 0.0643085
[2]	valid_0's l2: 0.065325	valid_1's l2: 0.0634377
[3]	valid_0's l2: 0.0644019	valid_1's l2: 0.062564
[4]	valid_0's l2: 0.0635133	valid_1's l2: 0.061729
[5]	valid_0's l2: 0.0625974	valid_1's l2: 0.0608694
[6]	valid_0's l2: 0.0616919	valid_1's l2: 0.0600159
[7]	valid_0's l2: 0.0608346	valid_1's l2: 0.0592073
[8]	valid_0's l2: 0.0599673	valid_1's l2: 0.0583934
[9]	valid_0's l2: 0.0591126	valid_1's l2: 0.0575911
[10]	valid_0's l2: 0.0583059	valid_1's l2: 0.0568301
[11]	valid_0's l2: 0.0574822	valid_1's l2: 0.0560588
[12]	valid_0's l2: 0.0566913	valid_1's l2: 0.0553176
[13]	valid_0's l2: 0.0559437	valid_1's l2: 0.05462
[14]	valid_0's l2: 0.0552105	valid_1's l2: 0.0539387
[15]	valid_0's l2: 0.0544609	valid_1's l2: 0.0532375
[16]	valid_0's l2: 0.0537191	valid_1's l2: 0.0525473
[17]	valid_0's l2: 0.0530136	valid_1's l2: 0.0518849
[18]	valid_0's l2: 0.0522936	valid_1's l2: 0.0512133
[19]	valid_0's l2: 0.0516021	valid_1's l2: 0.0505688
[20]	va

num_leaves, val_score: 0.019067:  35%|###5      | 7/20 [00:48<01:33,  7.21s/it]

[1]	valid_0's l2: 0.0662784	valid_1's l2: 0.0643177
[2]	valid_0's l2: 0.0653773	valid_1's l2: 0.0634529
[3]	valid_0's l2: 0.0644746	valid_1's l2: 0.0625793
[4]	valid_0's l2: 0.0636077	valid_1's l2: 0.0617476
[5]	valid_0's l2: 0.0627175	valid_1's l2: 0.0608927
[6]	valid_0's l2: 0.0618393	valid_1's l2: 0.0600503
[7]	valid_0's l2: 0.0610024	valid_1's l2: 0.0592396
[8]	valid_0's l2: 0.0601625	valid_1's l2: 0.0584308
[9]	valid_0's l2: 0.0593348	valid_1's l2: 0.0576348
[10]	valid_0's l2: 0.058547	valid_1's l2: 0.0568736
[11]	valid_0's l2: 0.0577512	valid_1's l2: 0.0561131
[12]	valid_0's l2: 0.0569824	valid_1's l2: 0.0553733
[13]	valid_0's l2: 0.0562548	valid_1's l2: 0.0546771
[14]	valid_0's l2: 0.055542	valid_1's l2: 0.0539945
[15]	valid_0's l2: 0.0548152	valid_1's l2: 0.0532957
[16]	valid_0's l2: 0.0540969	valid_1's l2: 0.0526087
[17]	valid_0's l2: 0.0534091	valid_1's l2: 0.0519464
[18]	valid_0's l2: 0.0527146	valid_1's l2: 0.0512808
[19]	valid_0's l2: 0.0520434	valid_1's l2: 0.0506361
[20]

num_leaves, val_score: 0.019067:  40%|####      | 8/20 [00:52<01:14,  6.19s/it]

[499]	valid_0's l2: 0.0167708	valid_1's l2: 0.0190969
[500]	valid_0's l2: 0.0167682	valid_1's l2: 0.0190972
[500]	valid_0's l2: 0.0167682	valid_1's l2: 0.0190972
[501]	valid_0's l2: 0.0167653	valid_1's l2: 0.0190982
[502]	valid_0's l2: 0.016762	valid_1's l2: 0.019099
[503]	valid_0's l2: 0.016758	valid_1's l2: 0.0190998
[504]	valid_0's l2: 0.0167557	valid_1's l2: 0.0191006
[505]	valid_0's l2: 0.0167532	valid_1's l2: 0.0191011
[506]	valid_0's l2: 0.0167511	valid_1's l2: 0.0191017
[507]	valid_0's l2: 0.0167478	valid_1's l2: 0.0191023
[508]	valid_0's l2: 0.0167442	valid_1's l2: 0.0191023
[509]	valid_0's l2: 0.0167417	valid_1's l2: 0.0191028
[510]	valid_0's l2: 0.0167386	valid_1's l2: 0.0191032
[511]	valid_0's l2: 0.0167359	valid_1's l2: 0.019103
[512]	valid_0's l2: 0.0167332	valid_1's l2: 0.0191028
Early stopping, best iteration is:
[412]	valid_0's l2: 0.0172497	valid_1's l2: 0.019067
[1]	valid_0's l2: 0.066257	valid_1's l2: 0.0643089
[2]	valid_0's l2: 0.065339	valid_1's l2: 0.0634408
[3]	

num_leaves, val_score: 0.019067:  45%|####5     | 9/20 [00:59<01:11,  6.46s/it]

In [None]:
def prepare_datasets(df, fold):
    """Build the LightGBM train/validation datasets for one fold.

    Rows whose ``fold`` equals ``fold`` form the validation set and all other
    rows the training set. Features come from the module-level ``cols``, and
    the ``<name>_encoded`` columns of ``cat_cols`` are marked categorical.

    Returns:
        Tuple ``(tr_ds, val_ds)``; ``val_ds`` references ``tr_ds``.
    """
    held_out = df['fold'] == fold
    categorical = [f'{name}_encoded' for name in cat_cols]

    def _as_dataset(part, reference=None):
        return lgb.Dataset(part[cols], part['score'],
                           categorical_feature=categorical, reference=reference)

    tr_ds = _as_dataset(df[~held_out])
    val_ds = _as_dataset(df[held_out], reference=tr_ds)
    return tr_ds, val_ds


def train_fn(df, params):
    """Train one LightGBM model per fold and collect the held-out predictions.

    Each fold's model is trained with early stopping (100 rounds), used at its
    best iteration to predict the held-out rows into ``pred_lgb``, and saved
    to ``output_dir / 'lgb_fold<fold>.txt'``.

    Returns:
        The held-out rows of all folds concatenated in fold-iteration order,
        keeping their original index.
    """
    fold_frames = []

    for fold in df['fold'].unique():
        tr_ds, val_ds = prepare_datasets(df, fold)

        # Fresh callback objects for every fold.
        callbacks = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ]
        booster = lgb.train(params, tr_ds, valid_sets=[tr_ds, val_ds], callbacks=callbacks)
        best_iter = booster.best_iteration

        fold_pred = df[df['fold'] == fold].copy()
        fold_pred['pred_lgb'] = booster.predict(fold_pred[cols], num_iteration=best_iter)
        fold_frames.append(fold_pred)

        booster.save_model(output_dir / f'lgb_fold{fold}.txt', num_iteration=best_iter)

    return pd.concat(fold_frames, axis=0, ignore_index=False)

In [None]:
# Train the per-fold models with the tuned parameters; `pred` holds the
# held-out rows with their `pred_lgb` predictions.
pred = train_fn(train, best_params)

In [None]:
# Gain-based feature importance of each saved fold model, averaged over folds.
importances = [
    lgb.Booster(model_file=output_dir / f'lgb_fold{fold}.txt').feature_importance(importance_type='gain')
    for fold in train['fold'].unique()
]
importance_df = pd.DataFrame({'feature': cols, 'importance': np.mean(importances, axis=0)})
importance_df.sort_values('importance', ascending=False)

In [None]:
# Result with Optuna's best params: Pearson correlation between the target and
# the held-out LightGBM predictions.
print(get_score(pred['score'], pred['pred_lgb']))
pred.head()

In [None]:
import os
import json
from kaggle.api.kaggle_api_extended import KaggleApi

# Owner part of the dataset id (`<ID>/<DATASET_ID>`).
ID = 'hanejiyuto'
# Dataset slug: dataset name and version tag joined with '-'.
DATASET_ID = ColabConfig.dataset_name + '-' + ColabConfig.dataset_version
# Directory passed to the kaggle CLI via -p.
UPLOAD_DIR = ColabConfig.dataset_dir
# Message passed to `kaggle datasets version -m`.
VERSION_NOTES = ColabConfig.dataset_note

def dataset_create_new():
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_new(folder=UPLOAD_DIR, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets create -t -p $UPLOAD_DIR -r tar

def dataset_create_version():  # バージョンアップデート
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    if not os.path.exists('dataset-metadata.json'):
        with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
            json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_version(folder=UPLOAD_DIR, version_notes=VERSION_NOTES, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets version -t -p $UPLOAD_DIR -r tar -m $VERSION_NOTES

# Nothing is uploaded in debug mode; otherwise either create the dataset or
# push a new version of it.
if not CFG.debug:
    if ColabConfig.dataset_new:
        dataset_create_new()
    else:
        dataset_create_version()