In [1]:
import os
import gc
import random
import pickle

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.preprocessing import MaxAbsScaler, Normalizer, RobustScaler, StandardScaler,MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold      # St for class
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric

In [2]:
# GPU selection: make CUDA enumerate devices by PCI bus id so indices are stable.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Uncomment to restrict the run to specific GPUs:
#os.environ["CUDA_VISIBLE_DEVICES"]= "1, 2, 3"

# Global run configuration shared by every cell below.
CONFIG = dict(
    n_worker=16,
    # TabNet training settings
    epochs=100,
    patience=20,
    learning_rate=2e-3,
    weight_decay=1e-5,
    threshold=0.5,
    seed=42,
    fold=5,
)

# seed setting function
def seed_everything(seed:int):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects subprocesses started after this point; the current
    # interpreter's hash seed is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed the remaining GPUs in case more than one device is visible.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between
    # runs, which defeats the deterministic flag above; it must be disabled
    # for reproducible results.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds before any model or split is created.
seed_everything(CONFIG['seed'])

# Keyword arguments forwarded to every TabNetRegressor instance.
tabnet_params = {
    "n_d": 64,    # width of the decision layer (typical range: 8 to 64)
    "n_a": 128,   # width of the attention layer (n_d == n_a is usually a good default)
    "n_steps": 3,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "n_independent": 2,
    "n_shared": 1,
    "optimizer_fn": optim.Adam,
    "optimizer_params": {
        "lr": CONFIG['learning_rate'],
        "weight_decay": CONFIG['weight_decay'],
    },
    "mask_type": "entmax",
    "scheduler_params": {
        "mode": "min",
        "patience": 5,
        "min_lr": 1e-5,
        "factor": 0.9,
    },
    "scheduler_fn": ReduceLROnPlateau,
    "seed": CONFIG["seed"],
    "verbose": 5,
}

In [3]:
# ============= Norm 
def norm_transform(datatype, data, scaler_name='z-score', scaler=None):
    """Scale features, fitting on training data and re-using the scaler on test data.

    Args:
        datatype: ``"test"`` applies an already fitted ``scaler``; any other
            value fits a fresh scaler on ``data``.
        data: 2-D array-like of features.
        scaler_name: Which scaler to fit in non-test mode. One of
            ``'z-score'``, ``'minmax'``, ``'maxabs'``, ``'robust'``, ``'norm'``.
            Ignored in test mode.
        scaler: Fitted scaler, required in test mode. Ignored otherwise
            (a new scaler is always created when fitting).

    Returns:
        ``(scaled_data, fitted_scaler)`` in non-test mode, or just
        ``scaled_data`` in test mode.

    Raises:
        ValueError: In test mode when no fitted ``scaler`` is supplied.
        KeyError: In non-test mode when ``scaler_name`` is unknown.
    """
    if datatype == "test":
        if scaler is None:
            # Fail early with a clear message instead of an AttributeError on None.
            raise ValueError(
                "norm_transform('test', ...) requires the scaler fitted on the training data"
            )
        return scaler.transform(data)

    # Map names to classes so only the requested scaler is instantiated.
    scaler_classes = {
        'z-score': StandardScaler,
        'minmax': MinMaxScaler,
        'maxabs': MaxAbsScaler,
        'robust': RobustScaler,
        'norm': Normalizer,
    }
    if scaler_name not in scaler_classes:
        raise KeyError(
            f"unknown scaler_name {scaler_name!r}; expected one of {sorted(scaler_classes)}"
        )

    # Fit on the training data only, so no test statistics leak into the scaler.
    scaler = scaler_classes[scaler_name]()
    scaled_train = scaler.fit_transform(data)
    return scaled_train, scaler

# ============= pca 
def pca_transform(datatype, data, n_comp=300, pca=None):
    """Project features with PCA, fitting on training data and re-using it on test data.

    Args:
        datatype: ``"test"`` applies an already fitted ``pca``; any other
            value fits a new PCA on ``data``.
        data: 2-D array-like of features.
        n_comp: Number of principal components to keep when fitting.
            Ignored in test mode.
        pca: Fitted PCA object, required in test mode. Ignored otherwise.

    Returns:
        ``(projected_data, fitted_pca)`` in non-test mode, or just
        ``projected_data`` in test mode.

    Raises:
        ValueError: In test mode when no fitted ``pca`` is supplied.
    """
    if datatype == "test":
        if pca is None:
            # Fail early with a clear message instead of an AttributeError on None.
            raise ValueError(
                "pca_transform('test', ...) requires the PCA fitted on the training data"
            )
        return pca.transform(data)

    # Fit on the training data only; the seed keeps randomized solvers reproducible.
    pca = PCA(n_components=n_comp, random_state=CONFIG["seed"])
    pca_train = pca.fit_transform(data)
    print(f"with {n_comp} components, pca variance ratio : {sum(pca.explained_variance_ratio_)}")
    return pca_train, pca


def lg_nrmse(gt, preds):
    """Weighted sum of per-target normalized RMSE over the first 14 target columns.

    For each target column the RMSE is divided by the mean absolute value of
    the ground truth. The first 8 columns (Y_01 to Y_08) are weighted by 1.2
    (20% extra weight); the remaining 6 are weighted by 1.0.

    Args:
        gt: 2-D array-like of ground-truth targets, at least 14 columns.
        preds: 2-D array-like of predictions with the same number of rows.

    Returns:
        The weighted NRMSE score as a NumPy float (lower is better).

    Raises:
        ValueError: If the inputs are not 2-D, have fewer than 14 columns,
            or have a different number of rows.
    """
    n_scored = 14      # number of target columns that enter the score
    n_weighted = 8     # leading columns that receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    if gt.ndim != 2 or preds.ndim != 2:
        raise ValueError("lg_nrmse expects 2-D arrays of shape (n_samples, n_targets)")
    if gt.shape[1] < n_scored or preds.shape[1] < n_scored:
        raise ValueError(f"lg_nrmse expects at least {n_scored} target columns")
    if gt.shape[0] != preds.shape[0]:
        raise ValueError("gt and preds must have the same number of rows")

    gt_scored = gt[:, :n_scored]
    residual = gt_scored - preds[:, :n_scored]
    # RMSE is computed with NumPy directly: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated and removed in recent releases.
    rmse = np.sqrt(np.mean(residual ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt_scored), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:n_scored])
    return score

    
class NRMSE(Metric):
    """TabNet evaluation metric that reports the score computed by ``lg_nrmse``."""

    def __init__(self):
        # Lower is better, so the metric must be minimized.
        self._maximize = False
        self._name = "NormRMSE"

    def __call__(self, y_true, y_score):
        """Return the ``lg_nrmse`` score of ``y_score`` against ``y_true``."""
        return lg_nrmse(y_true, y_score)

In [4]:
MODEL_DIR_NAME = "./Tabnet_models"

TABNET_OUTPUT_DIR_NAME = "./tabnet_outputs"
SCALER_PATH = os.path.join(TABNET_OUTPUT_DIR_NAME, "x_scaler.pkl")

# Both directories are written to below (models/log and scaler pickle), so
# both must exist; exist_ok makes the cell safe to re-run.
os.makedirs(MODEL_DIR_NAME, exist_ok=True)
os.makedirs(TABNET_OUTPUT_DIR_NAME, exist_ok=True)

# Number of regression targets, taken from the trailing columns of train.csv.
n_targets = 14

df_train = pd.read_csv("datasets/train.csv")
df_test = pd.read_csv("datasets/test.csv")

# X_10 and X_11 are excluded from both splits so train and test keep the same features.
df_train = df_train.drop(columns=['X_10', 'X_11'])
df_test = df_test.drop(columns=['X_10', 'X_11'])


# train_yhat_data = pd.read_csv("tabnet_outputs/tabnet_train_yhat.csv")
# test_yhat_data = pd.read_csv("tabnet_outputs/tabnet_test_yhat.csv")
# train_yhat_data = train_yhat_data.values
# test_yhat_data = test_yhat_data.values
# X_train_features = np.load("autoencoder_output/autoencoder_feature.npy")
# X_test_features = np.load("autoencoder_output/autoencoder_feature_test.npy")
# X_train_features = np.concatenate((X_train_features,train_yhat_data),axis=1)
# X_test_features = np.concatenate((X_test_features,test_yhat_data),axis=1)

# Column 0 is skipped in both frames (presumably the ID column - confirm against
# the CSV); the last n_targets columns of the training frame are the targets.
df_train_x = df_train.iloc[:, 1:-n_targets].values
df_train_y = df_train.iloc[:, -n_targets:].values


X_test_features = df_test.iloc[:, 1:].values
# Fit the scaler on the training features only, then re-use it for the test set.
X_train_norm_features, scaler = norm_transform("train", df_train_x, "minmax")


# Persist the fitted scaler so inference can apply the same transform later.
with open(SCALER_PATH, "wb") as fw:
    pickle.dump(scaler, fw)
X_test_norm_features = norm_transform("test", X_test_features, "minmax", scaler)

kf = KFold(n_splits=CONFIG['fold'], random_state=CONFIG['seed'], shuffle=True)

avg_loss, avg_nrmse = 0, 0
LOG_PATH = os.path.join(MODEL_DIR_NAME, "log.txt")
# Start a fresh log; the trailing newline keeps the first fold header on its own line.
with open(LOG_PATH, "w") as fw:
    fw.write("Tabnet model ==\n")


# Train one single-output TabNet model per target column for every CV fold.
for fold, (train_idx, test_idx) in enumerate(kf.split(X_train_norm_features)):
    with open(LOG_PATH, "a") as fa:
        fa.write(f": ========= FOLD {fold+1} ========= :\n")

    X_fold_train = X_train_norm_features[train_idx]
    X_fold_valid = X_train_norm_features[test_idx]

    for i in range(n_targets):
        # Select rows by fold index and column by target index. TabNetRegressor
        # requires 2-D targets of shape (n_samples, n_outputs), hence the reshape.
        y_fold_train = df_train_y[train_idx, i].reshape(-1, 1)
        y_fold_valid = df_train_y[test_idx, i].reshape(-1, 1)

        model = TabNetRegressor(**tabnet_params)

        # The validation target must match the single column being fitted.
        # The custom 14-column NRMSE metric cannot be evaluated on a single
        # target, so built-in metrics are used; the last one listed ('rmse')
        # drives early stopping.
        model.fit(X_train=X_fold_train, y_train=y_fold_train,
                    eval_set=[(X_fold_valid, y_fold_valid)],
                    max_epochs=CONFIG['epochs'], patience=CONFIG['patience'],
                    eval_metric=['mse', 'rmse'])

        # Name must match what the inference cell loads: tabnet_model{target}_fold{fold}.
        # save_model appends ".zip" to this path.
        model_path = os.path.join(MODEL_DIR_NAME, f'tabnet_model{i+1}_fold{fold+1}')

        model.save_model(model_path)
        # Release the model (and its GPU tensors) before building the next one.
        del model
        gc.collect()

TypeError: TabNetRegressor() takes no arguments

## get target

## Extract test y_hat (test-set predictions)

In [None]:
# Fold-averaged test predictions: one column per target model.
preds_reg = np.zeros((len(df_test), 1))
epreds_reg = np.zeros((len(df_test), n_targets))

for n_model in range(n_targets):
    # Reset the accumulator for every target; otherwise predictions from
    # earlier targets leak into later ones.
    preds_reg = np.zeros((len(df_test), 1))

    for fold in range(CONFIG['fold']):
        # save_model writes "<path>.zip" and load_model expects that full file name.
        model_path = os.path.join(MODEL_DIR_NAME, f'tabnet_model{n_model+1}_fold{fold+1}.zip')
        infer_model = TabNetRegressor(**tabnet_params)
        infer_model.load_model(model_path)

        preds_reg += infer_model.predict(X_test_norm_features).reshape(-1, 1)
    preds_reg /= CONFIG['fold']

    # Store the averaged single-output prediction in this target's column.
    epreds_reg[:, n_model] = preds_reg[:, 0]

Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda


In [None]:
# Fill the sample submission template with the ensemble predictions and save it.
SAMPLE_SUB_PATH = f"{TABNET_OUTPUT_DIR_NAME}/sample_submission.csv"
submit = pd.read_csv(SAMPLE_SUB_PATH)
# Every column after the first is overwritten with the predicted targets.
submit.iloc[:, 1:] = epreds_reg
submit.to_csv(
    './tabnet_outputs/tabnet_submit.csv',
    index=False,
)