In [1]:
# ==================
# Library
# ==================
import warnings
warnings.simplefilter('ignore')
import math
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
import datetime
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from contextlib import contextmanager
import logging
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import log_loss
import sys
import time
import feather
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import pickle
%matplotlib inline
from sklearn.metrics import average_precision_score

sys.path.append("../src/")
from logger import setup_logger, LOGGER
from trainer import train_lgbm,train_cat_regressor
from util_tool import reduce_mem_usage
pd.set_option('display.max_columns', 300)

In [2]:
# ==================
# Constant
# ==================
# Experiment id; it is interpolated into every output path of this experiment below.
ex = "020"
# Input locations. Only TRAIN_PATH and SUB_PATH are read in the cells shown here;
# TEST_PATH and USER_PATH are defined but not referenced in the visible cells.
TRAIN_PATH = "../input/train.csv"
TEST_PATH = "../input/test.csv"
USER_PATH = "../input/user_x_anime.csv"
SUB_PATH = "../input/sample_submission.csv"
# Output locations: out-of-fold predictions (.npy), the test submission (.csv)
# and the log file handed to setup_logger.
SAVE_OOF_PATH = f"../output/exp/ex{ex}_oof.npy"
SAVE_TEST_SUB_PATH = f"../output/exp/ex{ex}_test_sub.csv"
LOGGER_PATH = f"../output/exp/ex_{ex}.txt"

In [3]:

# ===============
# Settings
# ===============

# Cross-validation configuration (consumed by KFold in the training cell).
SEED = 0
N_SPLITS = 5
SHUFFLE = True

# Parameters forwarded unchanged to train_cat_regressor as `cat_params`.
CAT_PARAMS = dict(
    iterations=20000,
    learning_rate=0.05,
    loss_function='RMSE',
    random_seed=SEED,
    metric_period=200,
    od_wait=200,
    depth=8,
)

# Feature files to load, in this order; the ids are zero-padded to three
# digits. Note that ids 8, 10 and 13-19 are intentionally not in the list.
load_feature = [
    f"../output/fe/fe{fe_id:03d}.feather"
    for fe_id in (1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 20)
]

In [4]:
# ====================
# Function
# ====================

def calc_loss(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Both arguments are passed straight to ``mean_squared_error``; the square
    root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


@contextmanager
def timer(name):
    """Log the wall-clock duration of the wrapped ``with`` block.

    ``name`` is the label placed in square brackets in the log line. The
    message is emitted through the module-level ``LOGGER`` at INFO level once
    the block finishes; if the block raises, nothing is logged because the
    code after ``yield`` is not reached.
    """
    started = time.time()
    yield
    elapsed = time.time() - started
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')

In [5]:
# Rebinds LOGGER (previously imported from the local `logger` module) to the
# root logger, so `timer` and the training cell log through the root logger.
LOGGER = logging.getLogger()
# NOTE(review): FORMATTER is not referenced in any cell shown here — presumably
# setup_logger applies its own formatting; confirm before removing.
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Kept as the last expression of the cell, so its return value is displayed.
setup_logger(out_file=LOGGER_PATH)

2021-08-20 13:17:49,936 - INFO - logger set up


<RootLogger root (DEBUG)>

In [7]:
# ====================
# Main
# ====================
# Target: the "Score" column of the raw training table.
train_raw = pd.read_csv(TRAIN_PATH)
y = train_raw["Score"]

# Read every feature file and join them column-wise with a single concat.
# Concatenating inside the loop re-copies the growing frame on every
# iteration; one call produces the same frame without the repeated copies.
df = pd.concat([pd.read_feather(path) for path in load_feature], axis=1)

# The feature table is sliced positionally: the first len(train_raw) rows are
# used as train features and the remainder as test features.
n_train = len(train_raw)
assert len(df) >= n_train, (
    f"feature table has {len(df)} rows, fewer than the {n_train} training rows"
)
train = df.iloc[:n_train]
test = df.iloc[n_train:].reset_index(drop=True)

In [10]:
# K-fold cross-validation: fit one model per fold through train_cat_regressor,
# collect out-of-fold (OOF) predictions for the training rows and one test
# prediction per fold, then log the per-fold and overall RMSE.
with timer("cat"):
    kf = KFold(n_splits=N_SPLITS,random_state=SEED, shuffle=SHUFFLE)
    # NaN-initialised rather than np.empty: a row that never receives a
    # validation prediction stays NaN instead of holding uninitialised memory.
    y_oof = np.full(len(train), np.nan)
    y_test = []
    drop_cols = []
    features = [col for col in train.columns if col not in drop_cols]
    # Not populated here: the fifth value returned by train_cat_regressor is discarded.
    feature_importances = pd.DataFrame()
    categorical_features = []
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train,y)):
        print('Fold {}'.format(fold + 1))
        with timer(f"fold {fold}"):
            # KFold yields positional indices, so every selection uses .iloc.
            x_train, y_train = train.iloc[train_idx][features], y.iloc[train_idx]
            x_val, y_val = train.iloc[valid_idx][features], y.iloc[valid_idx]
            print("train:",len(x_train))

            y_pred_valid, y_pred_test, valid_loss, best_iter,_ = train_cat_regressor(
                    x_train, y_train, x_val, y_val,test[features],
                    categorical_features=categorical_features,
                    feature_name=features,
                    cat_params =CAT_PARAMS,
                    loss_func=calc_loss,
                )

            y_oof[valid_idx] = y_pred_valid
            # Score against y_val (positional selection) rather than
            # y[valid_idx], which is a label-based lookup and only matches
            # when y carries the default 0..n-1 index.
            score = calc_loss(y_val, y_pred_valid)
            LOGGER.info(f'Fold{fold}:CV={score}')
            y_test.append(y_pred_test)

    # Overall CV score over all OOF predictions; the OOF vector is saved for later blending.
    score = calc_loss(y, y_oof)
    np.save(SAVE_OOF_PATH, y_oof)
    LOGGER.info(f'CV={score}')

Fold 1
train: 4000
0:	learn: 0.8518994	test: 0.8568559	best: 0.8568559 (0)	total: 42.1ms	remaining: 14m 2s




200:	learn: 0.2552114	test: 0.3488712	best: 0.3488712 (200)	total: 7.17s	remaining: 11m 46s
400:	learn: 0.1744673	test: 0.3357815	best: 0.3357815 (400)	total: 13.9s	remaining: 11m 20s
600:	learn: 0.1290417	test: 0.3332477	best: 0.3331908 (596)	total: 20.4s	remaining: 10m 58s
800:	learn: 0.0971300	test: 0.3314566	best: 0.3314473 (796)	total: 26.9s	remaining: 10m 43s
1000:	learn: 0.0750533	test: 0.3304170	best: 0.3304170 (1000)	total: 33.3s	remaining: 10m 32s
1200:	learn: 0.0576286	test: 0.3297896	best: 0.3297819 (1199)	total: 39.8s	remaining: 10m 23s
1400:	learn: 0.0450115	test: 0.3292687	best: 0.3292493 (1396)	total: 46.3s	remaining: 10m 14s
1600:	learn: 0.0354175	test: 0.3289647	best: 0.3289411 (1588)	total: 52.8s	remaining: 10m 7s
1800:	learn: 0.0280936	test: 0.3288008	best: 0.3287708 (1789)	total: 59.3s	remaining: 9m 59s
2000:	learn: 0.0224442	test: 0.3286887	best: 0.3286809 (1999)	total: 1m 5s	remaining: 9m 51s
2200:	learn: 0.0180487	test: 0.3286662	best: 0.3286657 (2120)	total: 1m

2021-08-20 13:23:07,988 - INFO - Fold0:CV=0.32849747972953836
2021-08-20 13:23:07,990 - INFO - [fold 0] done in 91 s


Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.3284974825
bestIteration = 2471

Shrink model to first 2472 iterations.
Fold 2
train: 4000
0:	learn: 0.8526102	test: 0.8594184	best: 0.8594184 (0)	total: 27ms	remaining: 9m




200:	learn: 0.2550676	test: 0.3425579	best: 0.3425579 (200)	total: 6.58s	remaining: 10m 48s
400:	learn: 0.1745801	test: 0.3261381	best: 0.3261381 (400)	total: 13.2s	remaining: 10m 44s
600:	learn: 0.1275198	test: 0.3215203	best: 0.3215100 (599)	total: 19.7s	remaining: 10m 36s
800:	learn: 0.0971412	test: 0.3205821	best: 0.3205821 (800)	total: 26.3s	remaining: 10m 29s
1000:	learn: 0.0752884	test: 0.3197521	best: 0.3196709 (938)	total: 33.2s	remaining: 10m 29s
1200:	learn: 0.0583635	test: 0.3193147	best: 0.3193147 (1200)	total: 39.9s	remaining: 10m 24s
1400:	learn: 0.0447415	test: 0.3187240	best: 0.3186647 (1391)	total: 46.7s	remaining: 10m 19s
1600:	learn: 0.0346937	test: 0.3183796	best: 0.3183647 (1598)	total: 53.4s	remaining: 10m 13s
1800:	learn: 0.0274842	test: 0.3183215	best: 0.3182960 (1647)	total: 1m	remaining: 10m 8s
2000:	learn: 0.0218112	test: 0.3182222	best: 0.3181999 (1987)	total: 1m 6s	remaining: 10m 2s
2200:	learn: 0.0173175	test: 0.3181222	best: 0.3181177 (2198)	total: 1m 13

2021-08-20 13:24:51,860 - INFO - Fold1:CV=0.31792256060783936
2021-08-20 13:24:51,863 - INFO - [fold 1] done in 104 s


Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.3179225573
bestIteration = 2783

Shrink model to first 2784 iterations.
Fold 3
train: 4000




0:	learn: 0.8548214	test: 0.8433616	best: 0.8433616 (0)	total: 34.3ms	remaining: 11m 25s
200:	learn: 0.2517570	test: 0.3779170	best: 0.3779170 (200)	total: 6.76s	remaining: 11m 6s
400:	learn: 0.1723518	test: 0.3587731	best: 0.3587731 (400)	total: 13.5s	remaining: 10m 59s
600:	learn: 0.1261947	test: 0.3521270	best: 0.3521270 (600)	total: 20.2s	remaining: 10m 50s
800:	learn: 0.0971664	test: 0.3491603	best: 0.3491539 (798)	total: 26.8s	remaining: 10m 42s
1000:	learn: 0.0752967	test: 0.3474950	best: 0.3474619 (990)	total: 33.5s	remaining: 10m 35s
1200:	learn: 0.0576364	test: 0.3460298	best: 0.3460298 (1200)	total: 40.2s	remaining: 10m 28s
1400:	learn: 0.0447130	test: 0.3453108	best: 0.3453015 (1372)	total: 46.9s	remaining: 10m 21s
1600:	learn: 0.0352038	test: 0.3449663	best: 0.3449595 (1598)	total: 53.6s	remaining: 10m 15s
1800:	learn: 0.0276757	test: 0.3448138	best: 0.3447820 (1752)	total: 1m	remaining: 10m 8s
2000:	learn: 0.0217834	test: 0.3446521	best: 0.3446046 (1926)	total: 1m 6s	rema

2021-08-20 13:26:05,748 - INFO - Fold2:CV=0.34460462950536647
2021-08-20 13:26:05,750 - INFO - [fold 2] done in 74 s


Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.3446046388
bestIteration = 1926

Shrink model to first 1927 iterations.
Fold 4
train: 4000
0:	learn: 0.8549949	test: 0.8419980	best: 0.8419980 (0)	total: 30.6ms	remaining: 10m 12s




200:	learn: 0.2532112	test: 0.3600480	best: 0.3600480 (200)	total: 6.9s	remaining: 11m 19s
400:	learn: 0.1727612	test: 0.3434508	best: 0.3434508 (400)	total: 13.6s	remaining: 11m 7s
600:	learn: 0.1262227	test: 0.3386079	best: 0.3386079 (600)	total: 20.4s	remaining: 10m 57s
800:	learn: 0.0958982	test: 0.3361780	best: 0.3361103 (795)	total: 27.3s	remaining: 10m 54s
1000:	learn: 0.0737644	test: 0.3352122	best: 0.3352122 (1000)	total: 34s	remaining: 10m 45s
1200:	learn: 0.0561076	test: 0.3344617	best: 0.3344617 (1200)	total: 40.7s	remaining: 10m 36s
1400:	learn: 0.0440863	test: 0.3337836	best: 0.3337836 (1400)	total: 47.4s	remaining: 10m 29s
1600:	learn: 0.0346524	test: 0.3335325	best: 0.3334939 (1582)	total: 54.1s	remaining: 10m 21s
1800:	learn: 0.0272135	test: 0.3334522	best: 0.3334522 (1800)	total: 1m	remaining: 10m 15s
2000:	learn: 0.0218037	test: 0.3333116	best: 0.3333024 (1959)	total: 1m 7s	remaining: 10m 7s
2200:	learn: 0.0173868	test: 0.3331911	best: 0.3331672 (2185)	total: 1m 14s	

2021-08-20 13:27:38,717 - INFO - Fold3:CV=0.3330678126294243
2021-08-20 13:27:38,719 - INFO - [fold 3] done in 93 s


Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.3330678111
bestIteration = 2464

Shrink model to first 2465 iterations.
Fold 5
train: 4000




0:	learn: 0.8492829	test: 0.8658530	best: 0.8658530 (0)	total: 32.5ms	remaining: 10m 50s
200:	learn: 0.2548141	test: 0.3729050	best: 0.3729050 (200)	total: 6.66s	remaining: 10m 55s
400:	learn: 0.1747158	test: 0.3558609	best: 0.3558609 (400)	total: 13.2s	remaining: 10m 44s
600:	learn: 0.1291633	test: 0.3501902	best: 0.3501774 (599)	total: 19.8s	remaining: 10m 37s
800:	learn: 0.0982781	test: 0.3474737	best: 0.3474625 (799)	total: 26.5s	remaining: 10m 35s
1000:	learn: 0.0755819	test: 0.3466008	best: 0.3465993 (998)	total: 33.1s	remaining: 10m 28s
1200:	learn: 0.0591677	test: 0.3458934	best: 0.3458865 (1196)	total: 39.7s	remaining: 10m 22s
1400:	learn: 0.0464909	test: 0.3452610	best: 0.3452416 (1399)	total: 46.4s	remaining: 10m 15s
1600:	learn: 0.0365399	test: 0.3448054	best: 0.3447929 (1597)	total: 52.9s	remaining: 10m 8s
1800:	learn: 0.0290415	test: 0.3445475	best: 0.3445254 (1793)	total: 59.5s	remaining: 10m 1s
2000:	learn: 0.0231340	test: 0.3443768	best: 0.3443744 (1997)	total: 1m 6s	r

2021-08-20 13:29:35,696 - INFO - Fold4:CV=0.3438590735777117
2021-08-20 13:29:35,698 - INFO - [fold 4] done in 117 s
2021-08-20 13:29:35,708 - INFO - CV=0.3337396978784831
2021-08-20 13:29:35,712 - INFO - [cat] done in 479 s


Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.343859077
bestIteration = 3210

Shrink model to first 3211 iterations.


In [11]:
# Average the per-fold test predictions element-wise (one entry of y_test per fold).
y_test_sub = np.asarray(y_test).mean(axis=0)
# Load the sample submission file.
sub = pd.read_csv(SUB_PATH)

In [12]:
# Put the fold-averaged test predictions into the "Score" column and write
# this experiment's submission file (row index is not written).
sub["Score"] = y_test_sub
sub.to_csv(SAVE_TEST_SUB_PATH,index=False)

In [14]:
# Load the array saved at the ex019 OOF path, to compare and blend with
# this run's y_oof.
oof19 = np.load("../output/exp/ex019_oof.npy")

In [16]:
# Correlation matrix between this run's OOF predictions and the loaded ex019 array;
# left as the last expression so the matrix is displayed.
np.corrcoef(y_oof,oof19)

array([[1.        , 0.98890199],
       [0.98890199, 1.        ]])

In [18]:
# Sweep blend weights in steps of 0.1: step i gives weight i*0.1 to this
# experiment's OOF predictions and (10-i)*0.1 to oof19, printing the
# resulting OOF RMSE for each step.
for i in range(11):
    blended_oof = y_oof * i * 0.1 + oof19 * (10-i)*0.1
    print(i, calc_loss(y, blended_oof))

0 0.3337613055899626
1 0.33184602403217
2 0.3303481984979753
3 0.3292735259078852
4 0.32862615766980396
5 0.3284086207362435
6 0.32862176871138615
7 0.3292647651908305
8 0.33033510008686723
9 0.3318286382015323
10 0.3337396978784832


In [20]:
# Blend the ex019 test submission with this experiment's test predictions using
# equal weights (step 5 of the OOF weight sweep gave the lowest RMSE).
ex19 = pd.read_csv("../output/exp/ex019_test_sub.csv")
# Blend from y_test_sub (this experiment's un-blended predictions) instead of
# reading sub["Score"] back: the cell overwrites sub["Score"], so reading it
# would blend an already-blended column whenever the cell is re-run.
sub["Score"] = ex19["Score"] * 0.5 + y_test_sub * 0.5
sub.to_csv("../output/exp/ex19_20.csv",index=False)