In [10]:
# ==================
# Library
# ==================
import warnings
warnings.simplefilter('ignore')
import math
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
import datetime
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from contextlib import contextmanager
import logging
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import log_loss
import sys
import time
import feather
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import pickle
%matplotlib inline
from sklearn.metrics import average_precision_score

sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm
from util_tool import reduce_mem_usage
pd.set_option('display.max_columns', 300)

In [11]:
# ==================
# Constant
# ==================
# Experiment identifier; it is interpolated into every output path below.
ex = "021"
# Input files, given relative to the notebook's working directory.
TRAIN_PATH = "../input/train.csv"
TEST_PATH = "../input/test.csv"
USER_PATH = "../input/user_x_anime.csv"
SUB_PATH = "../input/sample_submission.csv"
# Output artefacts of this experiment: out-of-fold predictions (.npy),
# test-set submission (.csv) and the log file handed to setup_logger.
# NOTE(review): LOGGER_PATH uses the prefix "ex_{ex}" whereas the other two
# outputs use "ex{ex}_" — confirm whether the different naming is intended.
SAVE_OOF_PATH = f"../output/exp/ex{ex}_oof.npy"
SAVE_TEST_SUB_PATH = f"../output/exp/ex{ex}_test_sub.csv"
LOGGER_PATH = f"../output/exp/ex_{ex}.txt"

In [22]:

# ===============
# Settings
# ===============

# Cross-validation set-up: seed, number of folds and whether rows are
# shuffled before being split into folds.
SEED = 0
N_SPLITS = 5
SHUFFLE = True

# Model hyper-parameters; handed to train_lgbm as `lgb_params`.
LGBM_PARAMS = {
    "num_leaves": 32,
    "min_data_in_leaf": 64,
    "objective": "regression",
    "max_depth": -1,
    "learning_rate": 0.05,
    "boosting": "gbdt",
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "verbosity": -1,
    "reg_alpha": 0.1,
    "reg_lambda": 0.3,
    "colsample_bytree": 0.7,
    "metric": "rmse",
    "num_threads": 6,
}

# Training-loop controls; handed to train_lgbm as `fit_params`.
LGBM_FIT_PARAMS = {
    "num_boost_round": 5000,
    "early_stopping_rounds": 200,
    "verbose_eval": 1000,
}

# Pre-computed feature files that are joined column-wise into the model input.
load_feature = [
    "../output/fe/fe001.feather",
    "../output/fe/fe002.feather",
    "../output/fe/fe003.feather",
    "../output/fe/fe004.feather",
    "../output/fe/fe005.feather",
    "../output/fe/fe006.feather",
    "../output/fe/fe007.feather",
    "../output/fe/fe009.feather",
    "../output/fe/fe011.feather",
    "../output/fe/fe012.feather",
    "../output/fe/fe020.feather",
    "../output/fe/fe021.feather",
]

In [13]:
# ====================
# Function
# ====================

def calc_loss(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


@contextmanager
def timer(name):
    """Context manager that logs how long the wrapped block took.

    Args:
        name: Label placed in square brackets at the start of the log line.

    The elapsed wall-clock time is written to ``LOGGER`` at INFO level,
    rounded to whole seconds, once the block finishes normally. If the block
    raises, nothing is logged.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')

In [14]:
# Bind LOGGER to the root logger (logging.getLogger() called without a name).
# This rebinds the LOGGER name that was imported from the project's `logger`
# module in the imports cell.
LOGGER = logging.getLogger()
# Log line layout: timestamp - level - message.
# NOTE(review): FORMATTER is not passed to setup_logger in this cell —
# presumably the helper builds its own formatter; confirm whether this
# constant is still needed.
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Configure logging through the project helper, passing LOGGER_PATH as the
# output file. Being the last expression of the cell, the helper's return
# value is what the notebook displays below the cell.
setup_logger(out_file=LOGGER_PATH)

2021-08-23 06:32:35,936 - INFO - logger set up


<RootLogger root (DEBUG)>

In [15]:
# ====================
# Main
# ====================
# The regression target comes from the raw training CSV; the model inputs come
# from the pre-computed feature files listed in `load_feature`.
train_raw = pd.read_csv(TRAIN_PATH)
y = train_raw["Score"]

# Read each feature file once and join them column-wise with a single concat,
# rather than re-copying an ever-growing frame on every loop iteration.
feature_frames = []
for file_no, feature_path in enumerate(load_feature):
    feature_frame = pd.read_feather(feature_path)
    # Diagnostic: report which of the files after the first one carries a
    # "Source" column (the first file is not inspected).
    if file_no > 0 and "Source" in feature_frame.columns:
        print(feature_path)
    feature_frames.append(feature_frame)
df = pd.concat(feature_frames, axis=1)

# The split below is purely positional: the first len(train_raw) rows are
# taken as training rows and everything after them as test rows.
# NOTE(review): this assumes the feature files stack train rows before test
# rows — confirm against the feature-engineering scripts.
n_train = len(train_raw)
assert len(df) >= n_train, (
    f"feature matrix has {len(df)} rows but train.csv has {n_train}"
)
train = df.iloc[:n_train]
test = df.iloc[n_train:].reset_index(drop=True)

../output/fe/fe009.feather


In [16]:
# Print the positional index of every training column named "Source".
for position, column_name in enumerate(train.columns):
    if column_name != "Source":
        continue
    print(position)

82


In [23]:
# K-fold cross-validation: one model is fitted per fold through the project
# helper `train_lgbm`; out-of-fold predictions, per-fold test predictions and
# per-fold feature importances are collected along the way.
with timer("lightgbm"):
    kf = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=SHUFFLE)

    # Out-of-fold predictions. Start from NaN rather than uninitialised memory
    # so that a row no fold has filled in cannot pass for a real prediction.
    y_oof = np.full(len(train), np.nan)
    y_test = []  # one entry per fold: whatever train_lgbm returns for `test`

    # Every column of `train` is used as a feature unless listed in drop_cols.
    drop_cols = []
    features = [col for col in train.columns if col not in drop_cols]
    categorical_features = []

    # Per-fold importance frames; concatenated once after the loop instead of
    # re-copying a growing DataFrame on every fold.
    fold_importances = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train, y)):
        print('Fold {}'.format(fold + 1))
        with timer(f"fold {fold}"):
            # kf.split yields positional indices, hence .iloc on both objects.
            x_train, y_train = train.iloc[train_idx][features], y.iloc[train_idx]
            x_val, y_val = train.iloc[valid_idx][features], y.iloc[valid_idx]
            print("train:", len(x_train))

            # NOTE(review): the meaning of each returned value is inferred from
            # the names used here; confirm against src/trainer.py.
            y_pred_valid, y_pred_test, valid_loss, importances, best_iter, _ = train_lgbm(
                x_train, y_train, x_val, y_val, test[features],
                categorical_features=categorical_features,
                feature_name=features,
                fold_id=fold,
                lgb_params=LGBM_PARAMS,
                fit_params=LGBM_FIT_PARAMS,
                loss_func=calc_loss,
                calc_importances=True
            )

            y_oof[valid_idx] = y_pred_valid
            # Score against the positionally selected targets (y_val) instead
            # of y[valid_idx], which is a label lookup and only matches while
            # y keeps a default RangeIndex.
            score = calc_loss(y_val, y_pred_valid)
            LOGGER.info(f'Fold{fold}:CV={score}')
            y_test.append(y_pred_test)
            fold_importances.append(importances)

    feature_importances = pd.concat(fold_importances, axis=0, sort=False)

    # Overall CV score over all out-of-fold predictions; the predictions are
    # also saved to SAVE_OOF_PATH.
    score = calc_loss(y, y_oof)
    np.save(SAVE_OOF_PATH, y_oof)
    LOGGER.info(f'CV={score}')

Fold 1
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0460744	valid_1's rmse: 0.320517
Early stopping, best iteration is:
[1174]	training's rmse: 0.0386037	valid_1's rmse: 0.320318


2021-08-23 06:36:09,354 - INFO - Fold0:CV=0.32031825611961684
2021-08-23 06:36:09,360 - INFO - [fold 0] done in 6 s


Fold 2
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0421439	valid_1's rmse: 0.310091
Early stopping, best iteration is:
[1347]	training's rmse: 0.02944	valid_1's rmse: 0.309474


2021-08-23 06:36:17,247 - INFO - Fold1:CV=0.3094743857436537
2021-08-23 06:36:17,250 - INFO - [fold 1] done in 8 s


Fold 3
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0478318	valid_1's rmse: 0.350704
Early stopping, best iteration is:
[939]	training's rmse: 0.0511125	valid_1's rmse: 0.350471


2021-08-23 06:36:23,482 - INFO - Fold2:CV=0.350471238227671
2021-08-23 06:36:23,487 - INFO - [fold 2] done in 6 s


Fold 4
train: 4000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[774]	training's rmse: 0.0629252	valid_1's rmse: 0.328938


2021-08-23 06:36:29,207 - INFO - Fold3:CV=0.32893784337597265
2021-08-23 06:36:29,212 - INFO - [fold 3] done in 6 s


Fold 5
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0404266	valid_1's rmse: 0.3394
Early stopping, best iteration is:
[1430]	training's rmse: 0.026709	valid_1's rmse: 0.338878


2021-08-23 06:36:39,408 - INFO - Fold4:CV=0.3388775357755466
2021-08-23 06:36:39,414 - INFO - [fold 4] done in 10 s
2021-08-23 06:36:39,429 - INFO - CV=0.32992312999179185
2021-08-23 06:36:39,434 - INFO - [lightgbm] done in 36 s


In [9]:
# NOTE(review): this cell repeats the cross-validation run of the previous
# cell (same settings, same logged scores) and carries an earlier execution
# count — consider deleting one of the two so the notebook runs top to bottom.
#
# K-fold cross-validation: one model is fitted per fold through the project
# helper `train_lgbm`; out-of-fold predictions, per-fold test predictions and
# per-fold feature importances are collected along the way.
with timer("lightgbm"):
    kf = KFold(n_splits=N_SPLITS, random_state=SEED, shuffle=SHUFFLE)

    # Out-of-fold predictions. Start from NaN rather than uninitialised memory
    # so that a row no fold has filled in cannot pass for a real prediction.
    y_oof = np.full(len(train), np.nan)
    y_test = []  # one entry per fold: whatever train_lgbm returns for `test`

    # Every column of `train` is used as a feature unless listed in drop_cols.
    drop_cols = []
    features = [col for col in train.columns if col not in drop_cols]
    categorical_features = []

    # Per-fold importance frames; concatenated once after the loop instead of
    # re-copying a growing DataFrame on every fold.
    fold_importances = []

    for fold, (train_idx, valid_idx) in enumerate(kf.split(train, y)):
        print('Fold {}'.format(fold + 1))
        with timer(f"fold {fold}"):
            # kf.split yields positional indices, hence .iloc on both objects.
            x_train, y_train = train.iloc[train_idx][features], y.iloc[train_idx]
            x_val, y_val = train.iloc[valid_idx][features], y.iloc[valid_idx]
            print("train:", len(x_train))

            # NOTE(review): the meaning of each returned value is inferred from
            # the names used here; confirm against src/trainer.py.
            y_pred_valid, y_pred_test, valid_loss, importances, best_iter, _ = train_lgbm(
                x_train, y_train, x_val, y_val, test[features],
                categorical_features=categorical_features,
                feature_name=features,
                fold_id=fold,
                lgb_params=LGBM_PARAMS,
                fit_params=LGBM_FIT_PARAMS,
                loss_func=calc_loss,
                calc_importances=True
            )

            y_oof[valid_idx] = y_pred_valid
            # Score against the positionally selected targets (y_val) instead
            # of y[valid_idx], which is a label lookup and only matches while
            # y keeps a default RangeIndex.
            score = calc_loss(y_val, y_pred_valid)
            LOGGER.info(f'Fold{fold}:CV={score}')
            y_test.append(y_pred_test)
            fold_importances.append(importances)

    feature_importances = pd.concat(fold_importances, axis=0, sort=False)

    # Overall CV score over all out-of-fold predictions; the predictions are
    # also saved to SAVE_OOF_PATH.
    score = calc_loss(y, y_oof)
    np.save(SAVE_OOF_PATH, y_oof)
    LOGGER.info(f'CV={score}')

Fold 1
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0460744	valid_1's rmse: 0.320517
Early stopping, best iteration is:
[1174]	training's rmse: 0.0386037	valid_1's rmse: 0.320318


2021-08-23 02:41:57,201 - INFO - Fold0:CV=0.32031825611961684
2021-08-23 02:41:57,206 - INFO - [fold 0] done in 7 s


Fold 2
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0421439	valid_1's rmse: 0.310091
Early stopping, best iteration is:
[1347]	training's rmse: 0.02944	valid_1's rmse: 0.309474


2021-08-23 02:42:04,983 - INFO - Fold1:CV=0.3094743857436537
2021-08-23 02:42:04,988 - INFO - [fold 1] done in 8 s


Fold 3
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0478318	valid_1's rmse: 0.350704
Early stopping, best iteration is:
[939]	training's rmse: 0.0511125	valid_1's rmse: 0.350471


2021-08-23 02:42:11,089 - INFO - Fold2:CV=0.350471238227671
2021-08-23 02:42:11,105 - INFO - [fold 2] done in 6 s


Fold 4
train: 4000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[774]	training's rmse: 0.0629252	valid_1's rmse: 0.328938


2021-08-23 02:42:16,495 - INFO - Fold3:CV=0.32893784337597265
2021-08-23 02:42:16,498 - INFO - [fold 3] done in 5 s


Fold 5
train: 4000
Training until validation scores don't improve for 200 rounds
[1000]	training's rmse: 0.0404266	valid_1's rmse: 0.3394
Early stopping, best iteration is:
[1430]	training's rmse: 0.026709	valid_1's rmse: 0.338878


2021-08-23 02:42:25,489 - INFO - Fold4:CV=0.3388775357755466
2021-08-23 02:42:25,494 - INFO - [fold 4] done in 9 s
2021-08-23 02:42:25,507 - INFO - CV=0.32992312999179185
2021-08-23 02:42:25,513 - INFO - [lightgbm] done in 35 s
