In [1]:
class CFG:
    """Experiment-wide settings for the stacking run."""
    # Experiment ids whose out-of-fold predictions are stacked.
    exps = ['026', '033', 'PatentSBERTa-exp035']
    # Number of cross-validation folds.
    n_fold = 4
    # When True, the Kaggle dataset upload at the end of the notebook is skipped.
    debug = False
    # Seed forwarded to LightGBM through the training params.
    seed = 42

# ====================================================
# Colab settings
# ====================================================
class ColabConfig:
    """Settings for the Colab runtime and the Kaggle dataset upload."""
    dataset_name = 'PPPM-stacking'
    dataset_version = f"exp{'-'.join(CFG.exps)}"
    dataset_new = True  # whether this is a brand-new dataset (otherwise a new version is pushed)
    dataset_dir = None  # directory to upload as a Kaggle Dataset (filled in later)
    dataset_note = '""'  # changes since the previous version
    in_colab = False  # download/unpack the data onto the Colab machine itself

In [2]:
# Report the CUDA toolkit, PyTorch and Python versions of this runtime.
!nvcc --version
!python -c 'import torch; print(torch.__version__) '
!python --version
print('')
# Show GPU status; this reports an error when no NVIDIA driver is reachable.
!nvidia-smi
print('')
# Report system RAM in decimal gigabytes (the value printed is the total, not the free amount).
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
1.11.0+cu113
Python 3.7.13

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.


Your runtime has 13.6 gigabytes of available RAM



In [3]:
%%time
import sys
COLAB = "google.colab" in sys.modules

if COLAB:
    import os
    print('This environment is Google Colab')

    # mount drive
    from google.colab import drive
    if not os.path.isdir('/content/drive'):
        drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/kaggle/PPPM/exps')

    # kaggle api token and update kaggle api
    from google.colab import files
    if not os.path.isfile('~/.kaggle/kaggle.json'):
        # files.upload()
        !mkdir -p ~/.kaggle
        !cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
        !pip install --upgrade --force-reinstall --no-deps -q kaggle

    if ColabConfig.in_colab:
        # make directory in colab
        !mkdir -p /content/input
        !mkdir -p /content/working

        # download dataset in colab
        import zipfile, glob
        os.chdir('/content/input')
        # !kaggle competitions download -qc birdclef-2022 -p birdclef-2022
        
        # !mkdir birdclef-2022
        # !cp /content/drive/MyDrive/kaggle/BirdCLEF2022/input/birdclef-2022/* ./birdclef-2022/

        for p in glob.glob('**/*.zip', recursive=True):
            print(p)
            d, f = os.path.split(p)
            # if f in ['']:
            #     continue
            with zipfile.ZipFile(p, 'r') as zipf:
                print('unzip: ', zipf)
                zipf.extractall(d)
                print('remove: ', f)
                os.remove(p)
        
        os.chdir('/content/working')

This environment is Google Colab
Mounted at /content/drive
[K     |████████████████████████████████| 58 kB 3.6 MB/s 
[?25h  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
CPU times: user 2.36 s, sys: 439 ms, total: 2.8 s
Wall time: 1min 47s


In [4]:
import os
import random
import pickle
from pathlib import Path
from psutil import cpu_count
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 10000)
pd.set_option('display.max_colwidth', None)
from tqdm.auto import tqdm
tqdm.pandas()
os.system('python -m pip install -U lightgbm')
import lightgbm as lgb
print(f"lightgbm.__version__: {lgb.__version__}")
os.system('python -m pip install optuna')
import optuna.integration.lightgbm as lgb_optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import torch
print(f"torch.__version__: {torch.__version__}")

os.system('python -m pip install sentencepiece')
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

lightgbm.__version__: 3.3.2
torch.__version__: 1.11.0+cu113
tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [5]:
# Competition data directory and the Drive folder that collects this run's artifacts.
input_dir = Path('../input/us-patent-phrase-to-phrase-matching')
output_dir = Path(f"/content/drive/MyDrive/kaggle/PPPM/output/stacking{'_'.join(CFG.exps)}")
# Only the last path component is created; the parent directory must already exist.
output_dir.mkdir(exist_ok=True)

# The same folder is what gets uploaded as a Kaggle dataset at the end of the notebook.
ColabConfig.dataset_dir = str(output_dir)

In [6]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions."""
    correlation, _p_value = sp.stats.pearsonr(y_true, y_pred)
    return correlation


def get_logger(filename=output_dir / 'train'):
    """Create a logger that writes bare messages to the console and to a file.

    Args:
        filename: Path of the log file without the ``.log`` suffix, which is
            appended here.

    Returns:
        The logger named after this module, set to INFO level, with exactly
        one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger returns the same object on every call, so calling this again
    # (e.g. re-running the cell) used to stack handlers and emit every message
    # several times. Drop handlers left by previous calls before adding new ones.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

# LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

In [7]:
# Feature engineering
def tok_len(sentence, tokenizer):
    """Return how many tokens `tokenizer.tokenize` produces for `sentence`."""
    tokens = tokenizer.tokenize(sentence)
    return len(tokens)

def tokenizer_feature_engineering(df, tokenizer):
    """Build token-length features for every row of `df`.

    The features are assembled in a fresh frame, so `df` itself is left
    untouched. Previously the columns were written into `df`, which meant a
    second call on the same frame found them among the "base" columns and
    returned nothing but `id`.

    Args:
        df: Frame with the columns 'id', 'anchor', 'target' and 'context_text'.
        tokenizer: Object exposing `tokenize(text)`; token counts are the
            lengths of what it returns.

    Returns:
        A new DataFrame indexed like `df` holding the length, difference and
        ratio features followed by the 'id' column. A ratio whose denominator
        token count is 0 is non-finite.
    """
    def _tok_len(sentence):
        return tok_len(sentence, tokenizer)

    features = pd.DataFrame(index=df.index)

    features['anchor_tok_len'] = df['anchor'].map(_tok_len)
    features['target_tok_len'] = df['target'].map(_tok_len)
    features['context_tok_len'] = df['context_text'].map(_tok_len)
    # The constant 4 presumably accounts for special tokens in the model input -- TODO confirm.
    features['input_len'] = (
        features['anchor_tok_len'] + features['target_tok_len'] + features['context_tok_len'] + 4
    )

    # Pairwise difference and ratio of the three token counts.
    pairs = [
        ('anc_tgt', 'anchor_tok_len', 'target_tok_len'),
        ('anc_cnt', 'anchor_tok_len', 'context_tok_len'),
        ('tgt_cnt', 'target_tok_len', 'context_tok_len'),
    ]
    for tag, left, right in pairs:
        features[f'len_{tag}_diff'] = features[left] - features[right]
        features[f'len_{tag}_div'] = features[left] / features[right]

    # Return only the engineered features plus the id column used for merging.
    features['id'] = df['id']
    return features

In [8]:
def make_ensemble_datasets(cfg):
    """Assemble the stacking table from the competition train set and each experiment's OOF output.

    For every experiment in `cfg.exps` the out-of-fold frame and tokenizer are
    loaded from that experiment's output directory, token-length features are
    computed, and both the features and the OOF prediction are left-joined
    onto the train set by 'id' under experiment-suffixed column names.
    The best and mean per-experiment Pearson scores are printed.

    Returns:
        The train DataFrame extended with the per-experiment columns.
    """
    # Experiments whose OOF frame was stored as CSV; all others are pickles.
    csv_exps = ('deberta-v3-large-e1_exp032', 'PatentSBERTa-exp035')

    merged = pd.read_csv(input_dir / 'train.csv')
    cv_scores = []

    for exp in cfg.exps:
        exp_dir = Path(f'/content/drive/MyDrive/kaggle/PPPM/output/{exp}')
        oof_df = (
            pd.read_csv(exp_dir / 'oof_df.csv')
            if exp in csv_exps
            else pd.read_pickle(exp_dir / 'oof_df.pkl')
        )
        tokenizer = AutoTokenizer.from_pretrained(exp_dir / 'tokenizer')

        # Feature and prediction columns get per-experiment names such as 'pred_000'.
        pred_col = f'pred_{exp}'
        feature_df = tokenizer_feature_engineering(oof_df, tokenizer)
        feature_df = feature_df.rename(
            columns={col: col + '_' + exp for col in feature_df.columns if col != 'id'}
        )
        exp_pred = oof_df[['id', 'pred']].rename(columns={'pred': pred_col})

        merged = merged.merge(feature_df, on='id', how='left')
        merged = merged.merge(exp_pred, on='id', how='left')

        cv_scores.append(get_score(merged['score'], merged[pred_col]))

    print(f'max score: {max(cv_scores)}')
    print(f'avg score: {sum(cv_scores) / len(cv_scores)}')
    return merged

In [9]:
# Build the stacking table (this also prints the max/avg per-experiment CV scores) and preview it.
train = make_ensemble_datasets(CFG)
display(train.head())

max score: 0.832151085616115
avg score: 0.813036860175736


Unnamed: 0,id,anchor,target,context,score,anchor_tok_len_026,target_tok_len_026,context_tok_len_026,input_len_026,len_anc_tgt_diff_026,len_anc_tgt_div_026,len_anc_cnt_diff_026,len_anc_cnt_div_026,len_tgt_cnt_diff_026,len_tgt_cnt_div_026,pred_026,anchor_tok_len_033,target_tok_len_033,context_tok_len_033,input_len_033,len_anc_tgt_diff_033,len_anc_tgt_div_033,len_anc_cnt_diff_033,len_anc_cnt_div_033,len_tgt_cnt_diff_033,len_tgt_cnt_div_033,pred_033,anchor_tok_len_PatentSBERTa-exp035,target_tok_len_PatentSBERTa-exp035,context_tok_len_PatentSBERTa-exp035,input_len_PatentSBERTa-exp035,len_anc_tgt_diff_PatentSBERTa-exp035,len_anc_tgt_div_PatentSBERTa-exp035,len_anc_cnt_diff_PatentSBERTa-exp035,len_anc_cnt_div_PatentSBERTa-exp035,len_tgt_cnt_diff_PatentSBERTa-exp035,len_tgt_cnt_div_PatentSBERTa-exp035,pred_PatentSBERTa-exp035
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,1,3,20,28,-2,0.333333,-19,0.05,-17,0.15,0.440476,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.413867,3,5,24,36,-2,0.6,-21,0.125,-19,0.208333,0.414768
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,1,5,20,30,-4,0.2,-19,0.05,-15,0.25,0.641294,1,3,22,30,-2,0.333333,-21,0.045455,-19,0.136364,0.531482,3,4,24,35,-1,0.75,-21,0.125,-20,0.166667,0.768174
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.247038,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.237905,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.09902
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.494176,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.561054,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.425223
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,1,2,20,27,-1,0.5,-19,0.05,-18,0.1,0.011766,1,2,22,29,-1,0.5,-21,0.045455,-20,0.090909,0.028578,3,2,24,33,1,1.5,-21,0.125,-22,0.083333,0.12524


In [10]:
# foldに分ける
# credits to: https://www.kaggle.com/code/abhishek/creating-folds-properly-hopefully-p

!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train = train.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
print(train.fold.value_counts())

550 183
549 184
550 183
550 183
3    9622
0    9379
1    8860
2    8612
Name: fold, dtype: int64


In [11]:
# Encode categorical variables
train['section'] = train['context'].map(lambda s: s[0])  # first character of the context code
cat_cols = ['section']
for c in cat_cols:
    le = LabelEncoder()
    train[f'{c}_encoded'] = le.fit_transform(train[c])
    # Persist the fitted encoder alongside the other run artifacts.
    (output_dir / f'{c}_encoder.pkl').write_bytes(pickle.dumps(le))

In [12]:
# List every column now present in the stacking table.
print(train.columns)

Index(['id', 'anchor', 'target', 'context', 'score', 'anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_033', 'target_tok_len_033', 'context_tok_len_033', 'input_len_033', 'len_anc_tgt_diff_033', 'len_anc_tgt_div_033', 'len_anc_cnt_diff_033', 'len_anc_cnt_div_033', 'len_tgt_cnt_diff_033', 'len_tgt_cnt_div_033', 'pred_033', 'anchor_tok_len_PatentSBERTa-exp035', 'target_tok_len_PatentSBERTa-exp035', 'context_tok_len_PatentSBERTa-exp035', 'input_len_PatentSBERTa-exp035', 'len_anc_tgt_diff_PatentSBERTa-exp035', 'len_anc_tgt_div_PatentSBERTa-exp035', 'len_anc_cnt_diff_PatentSBERTa-exp035', 'len_anc_cnt_div_PatentSBERTa-exp035', 'len_tgt_cnt_diff_PatentSBERTa-exp035', 'len_tgt_cnt_div_PatentSBERTa-exp035', 'pred_PatentSBERTa-exp035', 'fold', 'section', 'section_encoded'], dtype='object')


In [13]:
# Every column that is not listed here is used as a model feature.
excluded_columns = ['id', 'anchor', 'target', 'context', 'score', 'fold', 'section']
cols = list(filter(lambda name: name not in excluded_columns, train.columns))
# Store the feature list in the output directory.
(output_dir / 'cols.pkl').write_bytes(pickle.dumps(cols))
print(cols)

['anchor_tok_len_026', 'target_tok_len_026', 'context_tok_len_026', 'input_len_026', 'len_anc_tgt_diff_026', 'len_anc_tgt_div_026', 'len_anc_cnt_diff_026', 'len_anc_cnt_div_026', 'len_tgt_cnt_diff_026', 'len_tgt_cnt_div_026', 'pred_026', 'anchor_tok_len_033', 'target_tok_len_033', 'context_tok_len_033', 'input_len_033', 'len_anc_tgt_diff_033', 'len_anc_tgt_div_033', 'len_anc_cnt_diff_033', 'len_anc_cnt_div_033', 'len_tgt_cnt_diff_033', 'len_tgt_cnt_div_033', 'pred_033', 'anchor_tok_len_PatentSBERTa-exp035', 'target_tok_len_PatentSBERTa-exp035', 'context_tok_len_PatentSBERTa-exp035', 'input_len_PatentSBERTa-exp035', 'len_anc_tgt_diff_PatentSBERTa-exp035', 'len_anc_tgt_div_PatentSBERTa-exp035', 'len_anc_cnt_diff_PatentSBERTa-exp035', 'len_anc_cnt_div_PatentSBERTa-exp035', 'len_tgt_cnt_diff_PatentSBERTa-exp035', 'len_tgt_cnt_div_PatentSBERTa-exp035', 'pred_PatentSBERTa-exp035', 'section_encoded']


In [14]:
# Baseline LightGBM settings handed to the tuner.
# The number after each trailing '#' appears to be the LightGBM default -- TODO confirm.
params = dict(
    objective='regression',
    boosting='gbdt',
    num_iterations=1000,
    learning_rate=0.01,
    # num_leaves=20,  # 31
    num_threads=cpu_count(logical=False),
    seed=CFG.seed,

    # max_depth=3,  # -1
    # min_data_in_leaf=100,  # 20
    verbosity=-1,  # 1

    metric='l2',
)

In [15]:
def tuning_params(df, params):
    """Tune LightGBM parameters with `lgb_optuna.train` on a single anchor-grouped holdout.

    Args:
        df: Stacking table containing 'anchor', 'score' and the feature
            columns listed in the notebook-level `cols`.
        params: Base LightGBM parameters passed to the tuner.

    Returns:
        The `params` attribute of the model returned by the tuner.
    """
    # Validation share; 0.25 to match one fold of 4-fold CV.
    val_prop = 0.25

    # Hold out a fixed-seed random subset of the unique anchors so that no
    # anchor appears on both sides of the split.
    anchors = df['anchor'].unique()
    np.random.default_rng(seed=71).shuffle(anchors)
    val_anchors = anchors[:int(len(anchors) * val_prop)]

    is_val = np.isin(df['anchor'], val_anchors)
    val_df = df[is_val]
    tr_df = df[~is_val]
    print(f'train: {len(tr_df)}, val: {len(val_df)}')

    categorical = [f'{c}_encoded' for c in cat_cols]
    tr_ds = lgb.Dataset(tr_df[cols], tr_df['score'], categorical_feature=categorical)
    val_ds = lgb.Dataset(val_df[cols], val_df['score'], categorical_feature=categorical, reference=tr_ds)

    callbacks = [
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ]
    model = lgb_optuna.train(
        params,
        tr_ds,
        valid_sets=[tr_ds, val_ds],
        callbacks=callbacks,
        verbosity=-1,
    )
    return model.params

In [None]:
# Run the Optuna-driven search on the stacking table and show the parameters it settled on.
best_params = tuning_params(train, params)
display(best_params)

[32m[I 2022-06-20 10:08:56,258][0m A new study created in memory with name: no-name-6a25ff3c-2efe-42b6-8e50-8ebb2b710e33[0m


train: 27261, val: 9212


feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0662791	valid_1's l2: 0.0643159
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.0653375	valid_1's l2: 0.0634083
[3]	valid_0's l2: 0.0644145	valid_1's l2: 0.0625182
[4]	valid_0's l2: 0.0635099	valid_1's l2: 0.061647
[5]	valid_0's l2: 0.0626226	valid_1's l2: 0.0607927
[6]	valid_0's l2: 0.061752	valid_1's l2: 0.059957
[7]	valid_0's l2: 0.0609001	valid_1's l2: 0.0591368
[8]	valid_0's l2: 0.0600636	valid_1's l2: 0.0583344
[9]	valid_0's l2: 0.0592443	valid_1's l2: 0.0575454
[10]	valid_0's l2: 0.0584636	valid_1's l2: 0.0567967
[11]	valid_0's l2: 0.0576752	valid_1's l2: 0.0560403
[12]	valid_0's l2: 0.056902	valid_1's l2: 0.0552974
[13]	valid_0's l2: 0.0561455	valid_1's l2: 0.0545704
[14]	valid_0's l2: 0.0554028	valid_1's l2: 0.0538575
[15]	valid_0's l2: 0.0546749	valid_1's l2: 0.0531589
[16]	valid_0's l2: 0.053961	valid_1's l2: 0.0524735
[17]	valid_0's l2: 0.0532618	valid_1's l2: 0.0518029
[18]	valid_0's l2: 0.052576	valid_1's l2: 0.0511455

feature_fraction, val_score: 0.019247:  14%|#4        | 1/7 [00:04<00:27,  4.58s/it]

[1]	valid_0's l2: 0.0663215	valid_1's l2: 0.0643598
[2]	valid_0's l2: 0.0654215	valid_1's l2: 0.0634949
[3]	valid_0's l2: 0.0646133	valid_1's l2: 0.0627144
[4]	valid_0's l2: 0.0644908	valid_1's l2: 0.062618
[5]	valid_0's l2: 0.0636123	valid_1's l2: 0.0617729
[6]	valid_0's l2: 0.062721	valid_1's l2: 0.0609145
[7]	valid_0's l2: 0.0618673	valid_1's l2: 0.0600869
[8]	valid_0's l2: 0.0610109	valid_1's l2: 0.0592632
[9]	valid_0's l2: 0.0601713	valid_1's l2: 0.0584561
[10]	valid_0's l2: 0.059454	valid_1's l2: 0.0577656
[11]	valid_0's l2: 0.0586712	valid_1's l2: 0.0570147
[12]	valid_0's l2: 0.0578915	valid_1's l2: 0.0562605
[13]	valid_0's l2: 0.0571496	valid_1's l2: 0.0555496
[14]	valid_0's l2: 0.0564225	valid_1's l2: 0.0548542
[15]	valid_0's l2: 0.0556857	valid_1's l2: 0.0541424
[16]	valid_0's l2: 0.0549756	valid_1's l2: 0.0534614
[17]	valid_0's l2: 0.0548762	valid_1's l2: 0.0533846
[18]	valid_0's l2: 0.0541821	valid_1's l2: 0.0527201
[19]	valid_0's l2: 0.0535702	valid_1's l2: 0.0521319
[20]	

feature_fraction, val_score: 0.019247:  29%|##8       | 2/7 [00:12<00:33,  6.79s/it]

[1]	valid_0's l2: 0.0663076	valid_1's l2: 0.0643446
[2]	valid_0's l2: 0.0654074	valid_1's l2: 0.0634785
[3]	valid_0's l2: 0.0645999	valid_1's l2: 0.0627001
[4]	valid_0's l2: 0.0637327	valid_1's l2: 0.061867
[5]	valid_0's l2: 0.0628693	valid_1's l2: 0.061037
[6]	valid_0's l2: 0.0619923	valid_1's l2: 0.060193
[7]	valid_0's l2: 0.0611522	valid_1's l2: 0.0593789
[8]	valid_0's l2: 0.0603098	valid_1's l2: 0.058568
[9]	valid_0's l2: 0.0594841	valid_1's l2: 0.0577747
[10]	valid_0's l2: 0.0587789	valid_1's l2: 0.0570958
[11]	valid_0's l2: 0.0579824	valid_1's l2: 0.0563307
[12]	valid_0's l2: 0.0572162	valid_1's l2: 0.0555903
[13]	valid_0's l2: 0.0564869	valid_1's l2: 0.0548926
[14]	valid_0's l2: 0.0557719	valid_1's l2: 0.0542088
[15]	valid_0's l2: 0.0550477	valid_1's l2: 0.0535093
[16]	valid_0's l2: 0.0543498	valid_1's l2: 0.0528415
[17]	valid_0's l2: 0.0537337	valid_1's l2: 0.052248
[18]	valid_0's l2: 0.0530366	valid_1's l2: 0.0515811
[19]	valid_0's l2: 0.0523655	valid_1's l2: 0.050934
[20]	val

feature_fraction, val_score: 0.019195:  43%|####2     | 3/7 [00:22<00:32,  8.03s/it]

[1]	valid_0's l2: 0.0662791	valid_1's l2: 0.0643159
[2]	valid_0's l2: 0.0653375	valid_1's l2: 0.0634083
[3]	valid_0's l2: 0.0645302	valid_1's l2: 0.0626283
[4]	valid_0's l2: 0.0636495	valid_1's l2: 0.0617816
[5]	valid_0's l2: 0.0627593	valid_1's l2: 0.0609232
[6]	valid_0's l2: 0.0618865	valid_1's l2: 0.0600818
[7]	valid_0's l2: 0.0610306	valid_1's l2: 0.0592584
[8]	valid_0's l2: 0.0601909	valid_1's l2: 0.0584507
[9]	valid_0's l2: 0.0593685	valid_1's l2: 0.0576605
[10]	valid_0's l2: 0.0585863	valid_1's l2: 0.0569105
[11]	valid_0's l2: 0.0577949	valid_1's l2: 0.0561508
[12]	valid_0's l2: 0.0570192	valid_1's l2: 0.0554045
[13]	valid_0's l2: 0.0562929	valid_1's l2: 0.0547093
[14]	valid_0's l2: 0.0555462	valid_1's l2: 0.053992
[15]	valid_0's l2: 0.0548269	valid_1's l2: 0.0532974
[16]	valid_0's l2: 0.0541095	valid_1's l2: 0.0526092
[17]	valid_0's l2: 0.0534063	valid_1's l2: 0.0519357
[18]	valid_0's l2: 0.0527169	valid_1's l2: 0.0512749
[19]	valid_0's l2: 0.0520415	valid_1's l2: 0.050627
[20]

feature_fraction, val_score: 0.019195:  57%|#####7    | 4/7 [00:31<00:25,  8.48s/it]

[522]	valid_0's l2: 0.0167391	valid_1's l2: 0.0192656
Early stopping, best iteration is:
[422]	valid_0's l2: 0.0172221	valid_1's l2: 0.0191952
[1]	valid_0's l2: 0.0663076	valid_1's l2: 0.0643446
[2]	valid_0's l2: 0.0654072	valid_1's l2: 0.0634793
[3]	valid_0's l2: 0.0645998	valid_1's l2: 0.0627008
[4]	valid_0's l2: 0.0637322	valid_1's l2: 0.0618672
[5]	valid_0's l2: 0.0628688	valid_1's l2: 0.0610372
[6]	valid_0's l2: 0.0619918	valid_1's l2: 0.060193
[7]	valid_0's l2: 0.0611473	valid_1's l2: 0.0593753
[8]	valid_0's l2: 0.0603049	valid_1's l2: 0.0585644
[9]	valid_0's l2: 0.0594794	valid_1's l2: 0.0577712
[10]	valid_0's l2: 0.0586968	valid_1's l2: 0.0570216
[11]	valid_0's l2: 0.0579021	valid_1's l2: 0.0562575
[12]	valid_0's l2: 0.0571231	valid_1's l2: 0.0555097
[13]	valid_0's l2: 0.0563967	valid_1's l2: 0.0548143
[14]	valid_0's l2: 0.0556483	valid_1's l2: 0.0540949
[15]	valid_0's l2: 0.0549261	valid_1's l2: 0.0533974
[16]	valid_0's l2: 0.0542062	valid_1's l2: 0.0527066
[17]	valid_0's l2: 

feature_fraction, val_score: 0.019182:  71%|#######1  | 5/7 [00:35<00:13,  6.96s/it]

[1]	valid_0's l2: 0.0663076	valid_1's l2: 0.0643446
[2]	valid_0's l2: 0.0654072	valid_1's l2: 0.0634791
[3]	valid_0's l2: 0.0645998	valid_1's l2: 0.0627013
[4]	valid_0's l2: 0.0637323	valid_1's l2: 0.0618677
[5]	valid_0's l2: 0.062839	valid_1's l2: 0.0610072
[6]	valid_0's l2: 0.0619629	valid_1's l2: 0.0601652
[7]	valid_0's l2: 0.0611041	valid_1's l2: 0.0593388
[8]	valid_0's l2: 0.0602622	valid_1's l2: 0.0585294
[9]	valid_0's l2: 0.0594371	valid_1's l2: 0.0577359
[10]	valid_0's l2: 0.0586549	valid_1's l2: 0.0569854
[11]	valid_0's l2: 0.0578615	valid_1's l2: 0.0562233
[12]	valid_0's l2: 0.0570838	valid_1's l2: 0.0554774
[13]	valid_0's l2: 0.0563578	valid_1's l2: 0.054782
[14]	valid_0's l2: 0.0556104	valid_1's l2: 0.0540648
[15]	valid_0's l2: 0.0548889	valid_1's l2: 0.0533683
[16]	valid_0's l2: 0.05417	valid_1's l2: 0.052678
[17]	valid_0's l2: 0.0534768	valid_1's l2: 0.0520088
[18]	valid_0's l2: 0.0527862	valid_1's l2: 0.0513467
[19]	valid_0's l2: 0.0521083	valid_1's l2: 0.050697
[20]	val

feature_fraction, val_score: 0.019182:  86%|########5 | 6/7 [00:40<00:06,  6.05s/it]

[1]	valid_0's l2: 0.0662791	valid_1's l2: 0.0643159
[2]	valid_0's l2: 0.0653375	valid_1's l2: 0.0634083
[3]	valid_0's l2: 0.0644145	valid_1's l2: 0.0625182
[4]	valid_0's l2: 0.0635099	valid_1's l2: 0.061647
[5]	valid_0's l2: 0.0626226	valid_1's l2: 0.0607927
[6]	valid_0's l2: 0.061752	valid_1's l2: 0.059957
[7]	valid_0's l2: 0.0609001	valid_1's l2: 0.0591368
[8]	valid_0's l2: 0.0600636	valid_1's l2: 0.0583344
[9]	valid_0's l2: 0.0592443	valid_1's l2: 0.0575451
[10]	valid_0's l2: 0.0584406	valid_1's l2: 0.0567746
[11]	valid_0's l2: 0.0576525	valid_1's l2: 0.056017
[12]	valid_0's l2: 0.0568808	valid_1's l2: 0.0552754
[13]	valid_0's l2: 0.0561235	valid_1's l2: 0.0545481
[14]	valid_0's l2: 0.0553815	valid_1's l2: 0.053836
[15]	valid_0's l2: 0.0546544	valid_1's l2: 0.0531385
[16]	valid_0's l2: 0.0539412	valid_1's l2: 0.0524555
[17]	valid_0's l2: 0.0532418	valid_1's l2: 0.0517854
[18]	valid_0's l2: 0.0525564	valid_1's l2: 0.0511281
[19]	valid_0's l2: 0.0518842	valid_1's l2: 0.0504837
[20]	va

feature_fraction, val_score: 0.019182: 100%|##########| 7/7 [00:44<00:00,  6.39s/it]
num_leaves, val_score: 0.019182:   0%|          | 0/20 [00:00<?, ?it/s]

[1]	valid_0's l2: 0.0663234	valid_1's l2: 0.0643602
[2]	valid_0's l2: 0.0654308	valid_1's l2: 0.0634999
[3]	valid_0's l2: 0.0646312	valid_1's l2: 0.0627232
[4]	valid_0's l2: 0.0637714	valid_1's l2: 0.0618951
[5]	valid_0's l2: 0.0629224	valid_1's l2: 0.0610787
[6]	valid_0's l2: 0.0620618	valid_1's l2: 0.0602464
[7]	valid_0's l2: 0.0612276	valid_1's l2: 0.0594372
[8]	valid_0's l2: 0.0603998	valid_1's l2: 0.0586361
[9]	valid_0's l2: 0.0595883	valid_1's l2: 0.0578536
[10]	valid_0's l2: 0.0588182	valid_1's l2: 0.0571164
[11]	valid_0's l2: 0.0580382	valid_1's l2: 0.0563625
[12]	valid_0's l2: 0.0572732	valid_1's l2: 0.0556249
[13]	valid_0's l2: 0.056552	valid_1's l2: 0.054932
[14]	valid_0's l2: 0.0558159	valid_1's l2: 0.0542219
[15]	valid_0's l2: 0.0551025	valid_1's l2: 0.0535306
[16]	valid_0's l2: 0.0543939	valid_1's l2: 0.0528442
[17]	valid_0's l2: 0.053782	valid_1's l2: 0.0522508
[18]	valid_0's l2: 0.0530998	valid_1's l2: 0.0515952
[19]	valid_0's l2: 0.0524382	valid_1's l2: 0.0509546
[20]	

num_leaves, val_score: 0.019152:   5%|5         | 1/20 [00:03<01:06,  3.52s/it]

[507]	valid_0's l2: 0.0179102	valid_1's l2: 0.0191577
[508]	valid_0's l2: 0.0179083	valid_1's l2: 0.0191581
[509]	valid_0's l2: 0.0179065	valid_1's l2: 0.0191584
[510]	valid_0's l2: 0.0179052	valid_1's l2: 0.0191582
[511]	valid_0's l2: 0.0179037	valid_1's l2: 0.0191582
[512]	valid_0's l2: 0.0179023	valid_1's l2: 0.0191585
[513]	valid_0's l2: 0.0179007	valid_1's l2: 0.0191584
[514]	valid_0's l2: 0.0178993	valid_1's l2: 0.0191578
[515]	valid_0's l2: 0.017898	valid_1's l2: 0.0191575
[516]	valid_0's l2: 0.0178964	valid_1's l2: 0.0191574
[517]	valid_0's l2: 0.0178948	valid_1's l2: 0.0191571
[518]	valid_0's l2: 0.0178931	valid_1's l2: 0.019158
[519]	valid_0's l2: 0.017892	valid_1's l2: 0.019158
[520]	valid_0's l2: 0.0178903	valid_1's l2: 0.0191583
[521]	valid_0's l2: 0.0178889	valid_1's l2: 0.0191587
[522]	valid_0's l2: 0.0178872	valid_1's l2: 0.0191582
Early stopping, best iteration is:
[422]	valid_0's l2: 0.018056	valid_1's l2: 0.0191518
[1]	valid_0's l2: 0.066349	valid_1's l2: 0.0643859
[

num_leaves, val_score: 0.019125:  10%|#         | 2/20 [00:06<01:01,  3.43s/it]

[554]	valid_0's l2: 0.0182962	valid_1's l2: 0.0191267
[555]	valid_0's l2: 0.0182955	valid_1's l2: 0.0191266
[556]	valid_0's l2: 0.0182945	valid_1's l2: 0.019127
[557]	valid_0's l2: 0.0182935	valid_1's l2: 0.0191272
[558]	valid_0's l2: 0.0182928	valid_1's l2: 0.019127
[559]	valid_0's l2: 0.0182921	valid_1's l2: 0.0191268
[560]	valid_0's l2: 0.0182911	valid_1's l2: 0.0191269
[561]	valid_0's l2: 0.0182903	valid_1's l2: 0.0191271
[562]	valid_0's l2: 0.0182896	valid_1's l2: 0.0191268
[563]	valid_0's l2: 0.0182888	valid_1's l2: 0.0191268
[564]	valid_0's l2: 0.0182882	valid_1's l2: 0.0191267
[565]	valid_0's l2: 0.0182874	valid_1's l2: 0.0191265
[566]	valid_0's l2: 0.0182865	valid_1's l2: 0.0191261
[567]	valid_0's l2: 0.0182859	valid_1's l2: 0.019126
[568]	valid_0's l2: 0.0182849	valid_1's l2: 0.0191258
[569]	valid_0's l2: 0.0182842	valid_1's l2: 0.0191254
[570]	valid_0's l2: 0.0182831	valid_1's l2: 0.0191255
[571]	valid_0's l2: 0.0182824	valid_1's l2: 0.0191255
[572]	valid_0's l2: 0.0182816	v

num_leaves, val_score: 0.019125:  15%|#5        | 3/20 [00:18<01:59,  7.05s/it]

[1]	valid_0's l2: 0.0663049	valid_1's l2: 0.0643436
[2]	valid_0's l2: 0.0654031	valid_1's l2: 0.0634781
[3]	valid_0's l2: 0.064594	valid_1's l2: 0.0626985
[4]	valid_0's l2: 0.0637248	valid_1's l2: 0.0618647
[5]	valid_0's l2: 0.0628588	valid_1's l2: 0.0610337
[6]	valid_0's l2: 0.0619796	valid_1's l2: 0.0601888
[7]	valid_0's l2: 0.0611333	valid_1's l2: 0.0593702
[8]	valid_0's l2: 0.060289	valid_1's l2: 0.0585593
[9]	valid_0's l2: 0.0594609	valid_1's l2: 0.0577637
[10]	valid_0's l2: 0.0586762	valid_1's l2: 0.0570123
[11]	valid_0's l2: 0.0578796	valid_1's l2: 0.0562487
[12]	valid_0's l2: 0.0570986	valid_1's l2: 0.0555006
[13]	valid_0's l2: 0.0563711	valid_1's l2: 0.0548056
[14]	valid_0's l2: 0.055621	valid_1's l2: 0.0540872
[15]	valid_0's l2: 0.0548971	valid_1's l2: 0.0533891
[16]	valid_0's l2: 0.0541752	valid_1's l2: 0.0526996
[17]	valid_0's l2: 0.0535584	valid_1's l2: 0.0521096
[18]	valid_0's l2: 0.0528628	valid_1's l2: 0.0514436
[19]	valid_0's l2: 0.0521929	valid_1's l2: 0.0507986
[20]	

num_leaves, val_score: 0.019125:  20%|##        | 4/20 [00:23<01:40,  6.29s/it]

[1]	valid_0's l2: 0.066271	valid_1's l2: 0.0643333
[2]	valid_0's l2: 0.0653419	valid_1's l2: 0.0634632
[3]	valid_0's l2: 0.0645003	valid_1's l2: 0.062679
[4]	valid_0's l2: 0.0636049	valid_1's l2: 0.0618423
[5]	valid_0's l2: 0.0627079	valid_1's l2: 0.0610019
[6]	valid_0's l2: 0.0617981	valid_1's l2: 0.0601522
[7]	valid_0's l2: 0.0609247	valid_1's l2: 0.0593322
[8]	valid_0's l2: 0.0600514	valid_1's l2: 0.0585119
[9]	valid_0's l2: 0.0591922	valid_1's l2: 0.0577099
[10]	valid_0's l2: 0.058379	valid_1's l2: 0.0569513
[11]	valid_0's l2: 0.0575542	valid_1's l2: 0.0561816
[12]	valid_0's l2: 0.0567453	valid_1's l2: 0.055429
[13]	valid_0's l2: 0.0559957	valid_1's l2: 0.0547326
[14]	valid_0's l2: 0.0552179	valid_1's l2: 0.0540114
[15]	valid_0's l2: 0.0544691	valid_1's l2: 0.0533131
[16]	valid_0's l2: 0.0537188	valid_1's l2: 0.0526181
[17]	valid_0's l2: 0.0530752	valid_1's l2: 0.0520272
[18]	valid_0's l2: 0.052352	valid_1's l2: 0.0513557
[19]	valid_0's l2: 0.051659	valid_1's l2: 0.0507114
[20]	val

num_leaves, val_score: 0.019125:  25%|##5       | 5/20 [00:35<02:03,  8.24s/it]

[1]	valid_0's l2: 0.0662807	valid_1's l2: 0.0643335
[2]	valid_0's l2: 0.0653602	valid_1's l2: 0.0634631
[3]	valid_0's l2: 0.0645281	valid_1's l2: 0.0626785
[4]	valid_0's l2: 0.063641	valid_1's l2: 0.0618425
[5]	valid_0's l2: 0.0627526	valid_1's l2: 0.061003
[6]	valid_0's l2: 0.0618511	valid_1's l2: 0.0601511
[7]	valid_0's l2: 0.0609857	valid_1's l2: 0.059332
[8]	valid_0's l2: 0.0601205	valid_1's l2: 0.0585134
[9]	valid_0's l2: 0.0592701	valid_1's l2: 0.0577125
[10]	valid_0's l2: 0.0584646	valid_1's l2: 0.056955
[11]	valid_0's l2: 0.0576479	valid_1's l2: 0.0561848
[12]	valid_0's l2: 0.0568469	valid_1's l2: 0.0554313
[13]	valid_0's l2: 0.0561042	valid_1's l2: 0.054734
[14]	valid_0's l2: 0.0553342	valid_1's l2: 0.0540111
[15]	valid_0's l2: 0.0545928	valid_1's l2: 0.0533126
[16]	valid_0's l2: 0.0538508	valid_1's l2: 0.052619
[17]	valid_0's l2: 0.0532165	valid_1's l2: 0.0520283
[18]	valid_0's l2: 0.052501	valid_1's l2: 0.0513579
[19]	valid_0's l2: 0.0518148	valid_1's l2: 0.0507136
[20]	vali

num_leaves, val_score: 0.019125:  30%|###       | 6/20 [00:44<01:59,  8.57s/it]

[1]	valid_0's l2: 0.0663021	valid_1's l2: 0.0643428
[2]	valid_0's l2: 0.0653986	valid_1's l2: 0.0634776
[3]	valid_0's l2: 0.0645875	valid_1's l2: 0.0626959
[4]	valid_0's l2: 0.0637161	valid_1's l2: 0.0618621
[5]	valid_0's l2: 0.0628476	valid_1's l2: 0.0610301
[6]	valid_0's l2: 0.0619659	valid_1's l2: 0.0601841
[7]	valid_0's l2: 0.0611176	valid_1's l2: 0.0593651
[8]	valid_0's l2: 0.060271	valid_1's l2: 0.0585529
[9]	valid_0's l2: 0.0594408	valid_1's l2: 0.057758
[10]	valid_0's l2: 0.0586539	valid_1's l2: 0.0570061
[11]	valid_0's l2: 0.0578552	valid_1's l2: 0.0562416
[12]	valid_0's l2: 0.0570723	valid_1's l2: 0.0554928
[13]	valid_0's l2: 0.0563434	valid_1's l2: 0.0547977
[14]	valid_0's l2: 0.0555911	valid_1's l2: 0.0540763
[15]	valid_0's l2: 0.0548655	valid_1's l2: 0.0533775
[16]	valid_0's l2: 0.0541416	valid_1's l2: 0.0526877
[17]	valid_0's l2: 0.0535233	valid_1's l2: 0.0520973
[18]	valid_0's l2: 0.0528252	valid_1's l2: 0.0514322
[19]	valid_0's l2: 0.0521539	valid_1's l2: 0.0507862
[20]

In [None]:
def prepare_datasets(df, fold):
    """Build the LightGBM training and validation datasets for one fold.

    Rows whose 'fold' equals `fold` form the validation set; all other rows
    form the training set. Features come from the notebook-level `cols` and
    the encoded `cat_cols` are declared categorical.

    Returns:
        Tuple `(tr_ds, val_ds)`; `val_ds` references `tr_ds`.
    """
    in_fold = df['fold'] == fold
    val_df = df[in_fold]
    tr_df = df[~in_fold]

    categorical = [f'{c}_encoded' for c in cat_cols]
    tr_ds = lgb.Dataset(tr_df[cols], tr_df['score'], categorical_feature=categorical)
    val_ds = lgb.Dataset(
        val_df[cols], val_df['score'], categorical_feature=categorical, reference=tr_ds
    )
    return tr_ds, val_ds


def train_fn(df, params):
    """Train one LightGBM model per fold and collect the out-of-fold predictions.

    Each fold's model is trained with early stopping, used to predict that
    fold's held-out rows, and saved to `output_dir` as 'lgb_fold{fold}.txt'
    truncated at its best iteration.

    Args:
        df: Stacking table containing 'fold', 'score' and the feature columns.
        params: LightGBM parameters.

    Returns:
        The held-out rows of all folds concatenated (original index kept),
        with the prediction in a new 'pred_lgb' column.
    """
    fold_preds = []

    for fold in df['fold'].unique():
        tr_ds, val_ds = prepare_datasets(df, fold)

        # Fresh callbacks for every fold.
        callbacks = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ]
        model = lgb.train(params, tr_ds, valid_sets=[tr_ds, val_ds], callbacks=callbacks)
        best_iter = model.best_iteration

        held_out = df[df['fold'] == fold].copy()
        held_out['pred_lgb'] = model.predict(held_out[cols], num_iteration=best_iter)
        fold_preds.append(held_out)

        model.save_model(output_dir / f'lgb_fold{fold}.txt', num_iteration=best_iter)

    return pd.concat(fold_preds, axis=0, ignore_index=False)

In [None]:
# Train the per-fold stackers with the tuned parameters; `pred` holds the out-of-fold predictions.
pred = train_fn(train, best_params)

In [None]:
# Mean gain-based feature importance across the per-fold models saved in output_dir.
importances = [
    lgb.Booster(model_file=output_dir / f'lgb_fold{fold}.txt').feature_importance(importance_type='gain')
    for fold in train['fold'].unique()
]
importance_df = pd.DataFrame({
    'feature': cols,
    'importance': np.mean(importances, axis=0),
})
importance_df.sort_values('importance', ascending=False)

In [None]:
# Out-of-fold Pearson score of the stacker trained with Optuna's best params
print(get_score(pred['score'], pred['pred_lgb']))
pred.head()

In [None]:
import os
import json
from kaggle.api.kaggle_api_extended import KaggleApi

# Owner part of the dataset id '<ID>/<DATASET_ID>' (presumably the Kaggle username).
ID = 'hanejiyuto'
# Dataset slug and title: dataset name plus version, with underscores replaced by hyphens.
DATASET_ID = ColabConfig.dataset_name + '-' + ColabConfig.dataset_version.replace('_', '-')
# Directory handed to the kaggle CLI via -p.
UPLOAD_DIR = ColabConfig.dataset_dir
# Message handed to the kaggle CLI via -m when a new version is created.
VERSION_NOTES = ColabConfig.dataset_note

def dataset_create_new():
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_new(folder=UPLOAD_DIR, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets create -t -p $UPLOAD_DIR -r tar

def dataset_create_version():  # バージョンアップデート
    dataset_metadata = {}
    dataset_metadata['id'] = f'{ID}/{DATASET_ID}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = DATASET_ID
    if not os.path.exists('dataset-metadata.json'):
        with open(os.path.join(UPLOAD_DIR, 'dataset-metadata.json'), 'w') as f:
            json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # api.dataset_create_version(folder=UPLOAD_DIR, version_notes=VERSION_NOTES, convert_to_csv=False, dir_mode='tar')
    !kaggle datasets version -t -p $UPLOAD_DIR -r tar -m $VERSION_NOTES

# Debug runs upload nothing; otherwise create the dataset or push a new version of it.
if not CFG.debug:
    if ColabConfig.dataset_new:
        dataset_create_new()
    else:
        dataset_create_version()