## Library

In [1]:
import os
import gc
import random
import pickle

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.preprocessing import MaxAbsScaler, Normalizer, RobustScaler, StandardScaler,MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold      # St for class
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

## hyperparameters

In [2]:
# Device visibility: only the CUDA device *ordering* is set here; the
# CUDA_VISIBLE_DEVICES override below is intentionally left disabled.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"]= "1, 2, 3"

# Run-wide settings read by the cells below (seed, fold count, TabNet
# epochs / patience, optimizer learning rate and weight decay).
CONFIG = dict(
    n_worker=16,
    # Tabnet model
    epochs=100,
    patience=20,
    learning_rate=1e-3,
    weight_decay=1e-5,
    threshold=0.5,
    seed=42,
    fold=5,
)

def seed_everything(seed:int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune (and vary) the kernels it picks from
    # run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(CONFIG['seed']) # apply the configured seed (CONFIG['seed']) to every RNG handled by seed_everything



In [3]:
# ============= Norm 
def norm_transform(datatype, data, scaler_name='z-score', scaler=None):
    """Scale features, fitting the scaler on training data only.

    Args:
        datatype: ``"test"`` applies an already fitted ``scaler``; any other
            value fits a new scaler on ``data``.
        data: Feature matrix handed to the scaler.
        scaler_name: Scaler to create when fitting. One of ``'z-score'``,
            ``'minmax'``, ``'maxabs'``, ``'robust'`` or ``'norm'``.
        scaler: Fitted scaler; required when ``datatype == "test"`` and
            ignored otherwise.

    Returns:
        ``(scaled_data, fitted_scaler)`` when fitting, or just ``scaled_data``
        when ``datatype == "test"``.

    Raises:
        ValueError: ``datatype == "test"`` but no fitted ``scaler`` was given.
        KeyError: ``scaler_name`` is not one of the supported names (fit mode).
    """
    if datatype == "test":
        # Fail with a clear message instead of "'NoneType' has no attribute
        # 'transform'" when the fitted scaler was forgotten.
        if scaler is None:
            raise ValueError(
                "norm_transform('test', ...) requires the scaler fitted on the training data"
            )
        return scaler.transform(data)

    # Map names to classes so that only the requested scaler is instantiated.
    scaler_classes = {
        'z-score': StandardScaler,
        'minmax': MinMaxScaler,
        'maxabs': MaxAbsScaler,
        'robust': RobustScaler,
        'norm': Normalizer,
    }
    scaler = scaler_classes[scaler_name]()
    scaled_train = scaler.fit_transform(data)
    return scaled_train, scaler

# ============= pca 
def pca_transform(datatype, data, n_comp=300, pca=None):
    """Project features with PCA, fitting the projection on training data only.

    Args:
        datatype: ``"test"`` applies an already fitted ``pca``; any other
            value fits a new PCA on ``data``.
        data: Feature matrix handed to the PCA.
        n_comp: Number of components to keep when fitting.
        pca: Fitted PCA; required when ``datatype == "test"`` and ignored
            otherwise.

    Returns:
        ``(projected_data, fitted_pca)`` when fitting, or just
        ``projected_data`` when ``datatype == "test"``.

    Raises:
        ValueError: ``datatype == "test"`` but no fitted ``pca`` was given.
    """
    if datatype == "test":
        # Same guard as norm_transform: a missing fitted object should be an
        # explicit error, not an AttributeError on None.
        if pca is None:
            raise ValueError(
                "pca_transform('test', ...) requires the PCA fitted on the training data"
            )
        return pca.transform(data)

    pca = PCA(n_components=n_comp, random_state=CONFIG["seed"])
    pca_train = pca.fit_transform(data)
    # Report how much of the variance the kept components explain.
    print(f"with {n_comp} components, pca variance ratio : {sum(pca.explained_variance_ratio_)}")
    return pca_train, pca


def lg_nrmse(gt, preds):
    """Weighted sum of per-target normalized RMSE over the first 14 columns.

    For each of the first 14 target columns the RMSE is divided by the mean
    absolute ground-truth value of that column. The first 8 columns
    (Y_01 ~ Y_08) are weighted by 1.2 (i.e. +20%), the remaining 6 by 1.0.

    Args:
        gt: 2-D array of ground-truth targets with at least 14 columns.
        preds: 2-D array of predictions, same layout as ``gt``.

    Returns:
        The weighted NRMSE score as a float (lower is better).

    Raises:
        IndexError: ``gt`` or ``preds`` is not 2-D or has fewer than 14 columns.
    """
    n_scored = 14       # number of target columns that enter the score
    n_weighted = 8      # leading columns that receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    for label, arr in (("gt", gt), ("preds", preds)):
        if arr.ndim != 2 or arr.shape[1] < n_scored:
            raise IndexError(
                f"{label} must be 2-D with at least {n_scored} columns, got shape {arr.shape}"
            )

    gt = gt[:, :n_scored]
    preds = preds[:, :n_scored]

    # RMSE is computed directly with NumPy: the `squared=False` argument of
    # sklearn's mean_squared_error is deprecated/removed in recent releases.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:n_scored])
    return score

    
class NRMSE(Metric):
    """Evaluation metric wrapper that scores predictions with ``lg_nrmse``.

    Reported under the name ``"NormRMSE"``; flagged as a metric to minimize.
    """

    def __init__(self):
        # Lower scores are better, so the metric must not be maximized.
        self._maximize = False
        self._name = "NormRMSE"

    def __call__(self, y_true, y_score):
        """Return the ``lg_nrmse`` score of ``y_score`` against ``y_true``."""
        return lg_nrmse(y_true, y_score)

## Data load and feature engineering

In [4]:
from pickle import FALSE


MODEL_DIR_NAME = "./Tabnet_models"

TABNET_OUTPUT_DIR_NAME = "./tabnet_outputs"
SCALER_PATH = os.path.join(TABNET_OUTPUT_DIR_NAME, "x_scaler.pkl")

# Create both output directories up front: the scaler pickle below is written
# into TABNET_OUTPUT_DIR_NAME, so it must exist before open(..., "wb").
os.makedirs(MODEL_DIR_NAME, exist_ok=True)
os.makedirs(TABNET_OUTPUT_DIR_NAME, exist_ok=True)

n_targets = 14

df_train = pd.read_csv("datasets/train.csv")
df_test = pd.read_csv("datasets/test.csv")

# X_10 and X_11 are excluded from both splits.
df_train = df_train.drop(columns=['X_10', 'X_11'])
df_test = df_test.drop(columns=['X_10', 'X_11'])

# Train layout: first column is skipped, the last `n_targets` columns are the
# targets, everything in between is a feature.
df_train_x = df_train.iloc[:, 1:-n_targets].values
df_train_y = df_train.iloc[:, -n_targets:].values

# Test layout: first column is skipped, the rest are features.
X_test_features = df_test.iloc[:, 1:].values

# Fit the min-max scaler on train only, persist it, then reuse it for test.
X_train_norm_features, scaler = norm_transform("train", df_train_x, "minmax")

with open(SCALER_PATH, "wb") as fw:
    pickle.dump(scaler, fw)
X_test_norm_features = norm_transform("test", X_test_features, "minmax", scaler)


#### generate k-means feature ###
K_Means_model = KMeans(n_clusters = 6, random_state = 10)

# Fit the clustering on the training features only and assign test rows to
# those same clusters. Calling fit_predict on the test set would refit the
# model, so cluster id k in test would no longer mean the same thing as
# cluster id k in train.
X_train_cluster_feature = K_Means_model.fit_predict(X_train_norm_features)
X_test_cluster_feature = K_Means_model.predict(X_test_norm_features)



# ### one-hot encoding
# oh_enc = OneHotEncoder(handle_unknown='ignore',sparse=False)
# oh_train_cluster_feature = oh_enc.fit_transform(X_train_cluster_feature.reshape(-1, 1))
# oh_test_cluster_feature = oh_enc.transform(X_test_cluster_feature.reshape(-1, 1))



# X_train_features = np.concatenate((X_train_norm_features,oh_train_cluster_feature),axis=1)

# X_test_features = np.concatenate((X_test_norm_features,oh_test_cluster_feature),axis=1)

In [5]:
# Wrap the scaled feature matrices in DataFrames. Columns are labelled
# X_1 .. X_n by position, with n taken from the train matrix for both frames.
X_train_norm_features_df = pd.DataFrame(
    X_train_norm_features,
    columns=[f"X_{pos + 1}" for pos in range(X_train_norm_features.shape[1])],
)
X_test_norm_features_df = pd.DataFrame(
    X_test_norm_features,
    columns=[f"X_{pos + 1}" for pos in range(X_train_norm_features.shape[1])],
)

# Cluster ids become an object-typed "kmeans" column so that the dtype-based
# check in the label-encoding cell treats it as categorical.
X_train_cluster_feature = pd.Series(X_train_cluster_feature, name="kmeans").astype("object")
X_test_cluster_feature = pd.Series(X_test_cluster_feature, name="kmeans").astype("object")

# Final model inputs: scaled numeric features followed by the cluster column.
X_train_features = pd.concat([X_train_norm_features_df, X_train_cluster_feature], axis=1)
X_test_features = pd.concat([X_test_norm_features_df, X_test_cluster_feature], axis=1)

In [6]:
l_enc = LabelEncoder()

nunique = X_train_features.nunique()
types = X_train_features.dtypes

# Label-encode every object-typed column in place and remember, per column,
# how many distinct classes the encoder saw.
categorical_columns, categorical_dims = [], {}
for col in X_train_features.columns:
    if types[col] != 'object':
        continue
    print(col, X_train_features[col].nunique())
    X_train_features[col] = l_enc.fit_transform(X_train_features[col].values)
    print(X_train_features[col])
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)
    print(categorical_dims[col])


# Collect the positions and cardinalities of the categorical columns; these
# feed the categorical-embedding arguments (cat_idxs / cat_dims) of tabnet_params.
unused_feat = ['Set']

features = [name for name in X_train_features.columns if name not in unused_feat+['target']]
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]
cat_dims = [categorical_dims[name] for name in features if name in categorical_columns]



kmeans 6
0        1
1        3
2        2
3        1
4        1
        ..
39602    2
39603    3
39604    2
39605    1
39606    1
Name: kmeans, Length: 39607, dtype: int32
6


In [7]:
# Keyword arguments for the model; expanded as TabNetRegressor(**tabnet_params).
tabnet_params = {
    # Categorical embedding: column positions / cardinalities computed in the
    # label-encoding cell above.
    "cat_idxs": cat_idxs,
    "cat_dims": cat_dims,
    "cat_emb_dim": 10,
    # Network shape (note: n_a is set larger than n_d here).
    "n_d": 64,
    "n_a": 128,
    "n_steps": 1,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "n_independent": 2,
    "n_shared": 1,
    "mask_type": "entmax",
    # Optimizer and learning-rate schedule, driven by CONFIG.
    "optimizer_fn": optim.NAdam,
    "optimizer_params": {
        "lr": CONFIG['learning_rate'],
        "weight_decay": CONFIG['weight_decay'],
    },
    "scheduler_fn": ReduceLROnPlateau,
    "scheduler_params": {"mode": "min", "patience": 5, "min_lr": 1e-5, "factor": 0.9},
    "seed": CONFIG["seed"],
    "verbose": 5,
}

## training

In [8]:

# np.asarray (rather than `.values`) keeps this cell re-runnable: on a second
# run X_train_features is already an ndarray and has no `.values` attribute.
X_train_features = np.asarray(X_train_features)

kf = KFold(n_splits=CONFIG['fold'], random_state=CONFIG['seed'], shuffle=True)

# Placeholders kept from the original cell; nothing in this cell updates them.
avg_loss, avg_nrmse = 0, 0

# Start a fresh log file; the header ends with a newline so the first fold
# marker is not glued to it.
LOG_PATH = os.path.join(MODEL_DIR_NAME, "log.txt")
with open(LOG_PATH, "w") as fw:
    fw.write("Tabnet model ==\n")

# Train one model per fold; the held-out part of each split is the eval set.
for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train_features)):
    with open(LOG_PATH, "a") as fa:
        fa.write(f": ========= FOLD {fold+1} ========= :\n")

    model = TabNetRegressor(**tabnet_params)
    model.fit(
        X_train=X_train_features[train_idx], y_train=df_train_y[train_idx],
        eval_set=[(X_train_features[valid_idx], df_train_y[valid_idx])],
        max_epochs=CONFIG['epochs'], patience=CONFIG['patience'],
        eval_metric=['rmse', 'mse', NRMSE],
    )

    # Saved under MODEL_DIR_NAME as tabnet_fold<k>; the inference cell loads
    # the same name with a ".zip" suffix.
    model_name = f'tabnet_fold{fold+1}'
    model_path = os.path.join(MODEL_DIR_NAME, model_name)
    model.save_model(model_path)

TypeError: TabNetRegressor() takes no arguments

## train y_hat 추출

In [None]:
# preds_reg = np.zeros((len(df_train), n_targets))

# for fold in range(CONFIG['fold']):
#     model_path = os.path.join(MODEL_DIR_NAME, f"tabnet_fold{fold+1}.zip")
#     infer_model = TabNetRegressor(**tabnet_params)
#     infer_model.load_model(model_path)
    

#     preds_reg += infer_model.predict(X_train_norm_features)
#     print(len(preds_reg))
    
# preds_reg /= CONFIG['fold']


In [None]:
# SAMPLE_SUB_PATH = TABNET_OUTPUT_DIR_NAME+'/sample_submission.csv'

# preds_reg = pd.DataFrame(preds_reg)
# preds_reg.to_csv('./tabnet_outputs/tabnet_train_yhat.csv', index=False)

In [None]:
nunique = X_test_features.nunique()
types = X_test_features.dtypes


# Encode the object-typed columns of the test frame in place.
categorical_columns = []
categorical_dims =  {}
for col in X_test_features.columns:
    if types[col] == 'object':
        print(col, X_test_features[col].nunique())
        # transform (not fit_transform): reuse the mapping learned on the
        # training data instead of refitting the encoder on the test set.
        # NOTE(review): l_enc is one shared encoder, so it holds the mapping
        # of the last column fitted in the train cell. That is correct for the
        # single "kmeans" column here; keep one encoder per column if more
        # categorical columns are added.
        X_test_features[col] = l_enc.transform(X_test_features[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)


# Positions and cardinalities of the categorical columns, as in the train cell.
unused_feat = ['Set']
features = [ col for col in X_test_features.columns if col not in unused_feat+['target']]
cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]



kmeans 6


In [None]:
# Keyword arguments used to construct the inference models below
# (TabNetRegressor(**tabnet_params)); same values as the training setup.
tabnet_params = {
    # Categorical embedding, using cat_idxs / cat_dims from the cell above.
    "cat_idxs": cat_idxs,
    "cat_dims": cat_dims,
    "cat_emb_dim": 10,
    # Network shape (note: n_a is set larger than n_d here).
    "n_d": 64,
    "n_a": 128,
    "n_steps": 1,
    "gamma": 1.3,
    "lambda_sparse": 0,
    "n_independent": 2,
    "n_shared": 1,
    "mask_type": "entmax",
    # Optimizer and learning-rate schedule, driven by CONFIG.
    "optimizer_fn": optim.NAdam,
    "optimizer_params": {
        "lr": CONFIG['learning_rate'],
        "weight_decay": CONFIG['weight_decay'],
    },
    "scheduler_fn": ReduceLROnPlateau,
    "scheduler_params": {"mode": "min", "patience": 5, "min_lr": 1e-5, "factor": 0.9},
    "seed": CONFIG["seed"],
    "verbose": 5,
}

## test y_hat 추출

In [None]:
# np.asarray (rather than `.values`) keeps this cell re-runnable: on a second
# run X_test_features is already an ndarray and has no `.values` attribute.
X_test_features = np.asarray(X_test_features)

# Accumulator for the fold-averaged predictions: one row per test sample,
# one column per target.
preds_reg = np.zeros((len(df_test), n_targets))

for fold in range(CONFIG['fold']):
    # Load the model saved for this fold by the training cell.
    model_path = os.path.join(MODEL_DIR_NAME, f"tabnet_fold{fold+1}.zip")
    infer_model = TabNetRegressor(**tabnet_params)
    infer_model.load_model(model_path)

    preds_reg += infer_model.predict(X_test_features)

# Simple mean over the folds.
preds_reg /= CONFIG['fold']


Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda
Device used : cuda


In [None]:
# Both the template and the result live in TABNET_OUTPUT_DIR_NAME; build the
# two paths from that constant so they cannot drift apart.
SAMPLE_SUB_PATH = os.path.join(TABNET_OUTPUT_DIR_NAME, 'sample_submission.csv')
SUBMIT_PATH = os.path.join(TABNET_OUTPUT_DIR_NAME, 'tabnet_submit.csv')

submit = pd.read_csv(SAMPLE_SUB_PATH)
# Keep the first column of the template and overwrite every other column with
# the fold-averaged predictions.
submit.iloc[:, 1:] = preds_reg
submit.to_csv(SUBMIT_PATH, index=False)