In [1]:
import os
import gc
import random
import pickle
import joblib

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.preprocessing import MaxAbsScaler, Normalizer, RobustScaler, StandardScaler,MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold      # St for class
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

from sklearn.multioutput import MultiOutputRegressor

import lightgbm as lgb
from lightgbm import LGBMRegressor


In [2]:
# set visible device
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "1, 2, 3"

CONFIG = {
    'n_worker':16,
    # LGBM model
    'max_depth' : 200,
    'n_estimators' : 20000,
    'patience' : 200,
    'learning_rate':2e-2,
    'weight_decay':1e-5,
    'threshold':0.5,
    'seed':42,
    'fold':5
}

# seed setting function
def seed_everything(seed:int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CONFIG['seed']) # Seed setting


lgbm_settings = {'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree':1,
    'learning_rate': 2e-2,
    # 'max_depth': CONFIG['max_depth'],
    'max_depth': -1,
    'min_child_samples': 20,
    'num_leaves':  32,
    'n_estimators':CONFIG['n_estimators'],
    'subsample_for_bin': 100000,
    'n_jobs':-1,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'random_state': CONFIG['seed']}

In [3]:
# ============= Norm 
def norm_transform(datatype, data, scaler_name='z-score', scaler=None):
    scaler_dict = {
        'z-score':StandardScaler(),
        'minmax':MinMaxScaler(),
        'maxabs':MaxAbsScaler(),
        'robust':RobustScaler(),
        'norm':Normalizer()
    }
    
    # use only train
    if not datatype=="test":
        scaler = scaler_dict[scaler_name]
        scaled_train = scaler.fit_transform(data)
        return scaled_train, scaler
    else:
        scaled_test = scaler.transform(data)
        return scaled_test

# ============= pca 
def pca_transform(datatype, data, n_comp=300, pca=None):
    if not datatype=="test":
        pca = PCA(n_components=n_comp, random_state=CONFIG["seed"])
        pca_train = pca.fit_transform(data)
        print(f"with {n_comp} components, pca variance ratio : {sum(pca.explained_variance_ratio_)}")
        return pca_train, pca
    else:
        pca_test = pca.transform(data)
        return pca_test


# ============= evaluator
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,14): # ignore 'ID'
        rmse = mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score


def lgb_nrmse(data, preds):
    y_true = data
    # y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
    return 'NRMSE', lg_nrmse(y_true, preds), True

In [4]:
input_dir = '../../../dataset/'
TRAIN_DATA_PATH = input_dir+'train.csv'
TEST_DATA_PATH = input_dir+'test.csv'
SAMPLE_SUB_PATH = input_dir+'sample_submission.csv'

MODEL_DIR_NAME = "./LGBM_models"
SCALER_PATH = os.path.join(MODEL_DIR_NAME, "x_scaler.pkl")

if not os.path.exists(MODEL_DIR_NAME):
    os.makedirs(MODEL_DIR_NAME)

n_targets = 14

df_train = pd.read_csv(TRAIN_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)

X_features, y_features = df_train.iloc[:, 1:-14].values, df_train.iloc[:, -14:].values
X_test_features = df_test.iloc[:, 1:].values
X_norm_features, scaler = norm_transform("train", X_features, "minmax")
with open(SCALER_PATH, "wb") as fw:
    pickle.dump(scaler, fw)

X_test_norm_features = norm_transform("test", X_test_features, "minmax", scaler)

kf = KFold(n_splits=CONFIG['fold'], random_state=CONFIG['seed'], shuffle=True)
avg_loss, avg_nrmse = 0, 0


LOG_PATH = os.path.join(MODEL_DIR_NAME, "log.txt")

with open(LOG_PATH, "w") as fw:
    fw.write("LGBM model ==")

for fold, (train_idx, test_idx) in enumerate(kf.split(X_norm_features)):
    with open(LOG_PATH, "a") as fa:
        fa.write(f": ========= FOLD {fold+1} ========= :\n")

    model = LGBMRegressor(**lgbm_settings)
    model.fit(X_norm_features[train_idx], y_features[train_idx],
                eval_set=(X_norm_features[test_idx], y_features[test_idx]),
                eval_metric=lgb_nrmse,
                verbose = 10,
                early_stopping_rounds=200)

    model.fit(X_norm_features[train_idx], y_features[train_idx])
    model_name = f'./lgbm_fold{fold+1}.pkl'
    model_path = os.path.join(MODEL_DIR_NAME, model_name)
    joblib.dump(model, model_path)


ValueError: y should be a 1d array, got an array of shape (31685, 14) instead.