In [1]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Tabnet 
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.metrics import Metric

#model save
from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

import tidalUtl.PrpUtl as prp
import tidalUtl.EdaUtl as eda

In [2]:
#import sys
#sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

https://www.kaggle.com/tidalryoku/new-baseline-pytorch-moa/

# Version

__ver1__<br>
baseline：CV:0.01465 LB:0.01874<br>
__ver2__<br>
Hyperopt, 2Layer：CV:0.01460 LB:0.01869<br>
__ver3__<br>
3Layer：CV:0.01464 LB:0.01868<br>
__ver4__<br>
MLSMOTE baseline：CV:0.01476 LB:0.01978<br>

# Config

In [3]:
# Data locations. Each root can be overridden with an environment variable
# (MOA_INPUT / MOA_OUTPUT) so the notebook can run on another machine without
# editing this cell; the defaults are the original hard-coded paths.
INPUT = os.environ.get("MOA_INPUT", "/home/tidal/ML_Data/MoA/lish-moa")
OUTPUT = os.environ.get("MOA_OUTPUT", "/home/tidal/ML_Data/MoA/output")
#INPUT = "/Users/hfuis/ML_Data/MoA/lish-moa"
#OUTPUT = "/Users/hfuis/ML_Data/MoA/output"

# Output sub-directories. "submittion" is misspelled but kept unchanged so that
# any existing output directory keeps being used.
SUBMIT = OUTPUT + "/submittion/"
SAVEMODEL = OUTPUT + "/model/tabnet_regressor/"
SAVEOOF = OUTPUT + "/OOF/tabnet_regressor/"

In [4]:
# Load the competition CSV files found under INPUT.
trainFeature = pd.read_csv(f'{INPUT}/train_features.csv')
testFeature = pd.read_csv(f'{INPUT}/test_features.csv')
trainTargetScored = pd.read_csv(f'{INPUT}/train_targets_scored.csv')
sample_submission = pd.read_csv(f'{INPUT}/sample_submission.csv')
drug = pd.read_csv(f'{INPUT}/train_drug.csv')

In [5]:
# Column names of the raw training features, grouped by prefix:
# 'g-' columns go to GENES, 'c-' columns go to CELLS.
GENES = list(filter(lambda name: name.startswith('g-'), trainFeature.columns))
CELLS = list(filter(lambda name: name.startswith('c-'), trainFeature.columns))

In [6]:
# Fix all random seeds.
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for repeatable results.

    Parameters
    ----------
    seed : int, default 42
        Seed value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
    # otherwise pick different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Left disabled as in the original: PYTHONHASHSEED only affects interpreters
    # started after it is set, not the running kernel.
    #os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(seed=42)

In [7]:
# Hyper-parameter dictionary handed to Exec()/preprocessing().
# In the visible part of the notebook only 'rankGauss_n_quantiles' is read
# (by preprocessing); the TabNet settings used for training are hard-coded in
# run_training.
param_space = dict(
    hidden_size1=512,
    hidden_size2=512,
    dropOutRate1=0.20393004966355735,
    dropOutRate2=0.39170486751620137,
    n_d=8,
    n_a=8,
    n_steps=3,
    gamma=1.3,
    cat_idxs=[],
    cat_dims=[],
    cat_emb_dim=1,
    n_independent=2,
    n_shared=2,
    epsilon=1e-15,
    virtual_batch_size=128,
    momentum=0.02,
    device_name='cuda',
    mask_type='sparsemax',
    rankGauss_n_quantiles=488.0393350201078,
    leakyReluSlope=0.01973893854348531,
)

# Preprocessing

## Func: In & Out Type is DataFrame

### PCA features add

In [8]:
def PCA_features_add(trainFeature, testFeature):
    """Append PCA component columns computed from the gene and cell features.

    50 components are derived from the GENES columns (named ``pca_G-<i>``) and
    15 from the CELLS columns (named ``pca_C-<i>``) via ``prp.tidalPCA``; they
    are concatenated column-wise to the right of the input frames.

    Returns
    -------
    (DataFrame, DataFrame)
        New train and test frames with the PCA columns appended.
    """
    def _append_components(train_df, test_df, source_cols, names):
        # Run the PCA helper on the chosen columns and glue the named
        # components onto both frames.
        comp_train, comp_test, _ = prp.tidalPCA(
            train_df[source_cols], test_df[source_cols],
            Dim=len(names), random_state=42)
        train_df = pd.concat((train_df, pd.DataFrame(comp_train, columns=names)), axis=1)
        test_df = pd.concat((test_df, pd.DataFrame(comp_test, columns=names)), axis=1)
        return train_df, test_df

    # GENES: 50 components.
    gene_names = [f'pca_G-{i}' for i in range(50)]
    trainFeature, testFeature = _append_components(trainFeature, testFeature, GENES, gene_names)

    # CELLS: 15 components, handled the same way as GENES.
    cell_names = [f'pca_C-{i}' for i in range(15)]
    trainFeature, testFeature = _append_components(trainFeature, testFeature, CELLS, cell_names)

    return trainFeature, testFeature

### feature Selection using Variance Encoding

In [9]:
def feature_Selection_using_Variance_Encoding(trainFeature, testFeature):
    """Drop low-variance features, judged on train and test stacked together.

    The first four columns (``sig_id``, ``cp_type``, ``cp_time``, ``cp_dose``)
    are kept untouched; every other column goes through
    ``prp.tidalVarianceThrs`` with ``threshold=0.5``. The surviving features are
    re-attached to the four identifier columns (per the original note, their
    column names become consecutive numbers).

    Returns
    -------
    (DataFrame, DataFrame)
        Train and test frames holding the identifier columns plus the
        surviving features.
    """
    id_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']
    n_train = trainFeature.shape[0]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data = pd.concat([trainFeature, testFeature])

    # Remove low-variance features among everything except the identifier columns.
    data_transformed = prp.tidalVarianceThrs(data.iloc[:, 4:], threshold=0.5)

    # Split back by position. Slicing from n_train (rather than with a negative
    # offset) stays correct when the test frame is empty.
    trainFeature_transformed = data_transformed[:n_train]
    testFeature_transformed = data_transformed[n_train:]

    # NOTE(review): the column-wise concat aligns on index, so both inputs are
    # presumably expected to carry a default 0..n-1 index. Confirm with callers.
    trainFeature = pd.concat(
        [trainFeature[id_cols], pd.DataFrame(trainFeature_transformed)], axis=1)
    testFeature = pd.concat(
        [testFeature[id_cols], pd.DataFrame(testFeature_transformed)], axis=1)

    return trainFeature, testFeature

## cp_type = ctl_vehicleのレコードを削除

__※提出用データ(test)も同様に一部レコードを削除するが、こちらは最後submissionデータを作る際に0埋めを行う。__<br>
__（CV_Evaluation(), Submit()参照。）__

In [10]:
def drop_ctl_vehicle(trainFeature, testFeature, trainTargetScored):
    """Remove the ``cp_type == 'ctl_vehicle'`` rows from train and test.

    Features are inner-joined with their targets on ``sig_id`` (test is joined
    with the module-level ``sample_submission``) so that the same rows are
    removed from both sides. ``cp_type`` is dropped afterwards because it is
    constant once the control rows are gone.

    Returns
    -------
    (DataFrame, DataFrame, DataFrame)
        Filtered train features, filtered test features, and the targets
        (including ``sig_id``) of the remaining training rows.
    """
    label_cols = trainTargetScored.columns.drop('sig_id')

    # Inner join on the primary key so features and targets stay row-aligned.
    train = trainFeature.merge(trainTargetScored, on='sig_id')
    test = testFeature.merge(sample_submission, on='sig_id')

    # Keep only the treated samples.
    train = train.loc[train['cp_type'].ne('ctl_vehicle')].reset_index(drop=True)
    test = test.loc[test['cp_type'].ne('ctl_vehicle')].reset_index(drop=True)
    target = train[trainTargetScored.columns]

    # Split the feature part back out: drop cp_type and every label column.
    trainFeature = train.drop(columns='cp_type').drop(columns=label_cols)
    testFeature = test.drop(columns='cp_type').drop(columns=label_cols)

    return trainFeature, testFeature, target
    

## One-Hot Encoding

In [11]:
def oneHotEncoding(train, test):
    """One-hot encode the categorical columns ``cp_time`` and ``cp_dose``.

    Delegates to ``prp.OneHot_encode`` and returns the encoded train and test
    frames.
    """
    categorical_cols = ['cp_time', 'cp_dose']
    encoded_train, encoded_test = prp.OneHot_encode(train, test, categorical_cols)
    return encoded_train, encoded_test

## MLSMOTE

__MULTI SMOTE: 頻度の少ないターゲットに当たるTrainDataをAugmentする手法__

In [12]:
def MlSmote(train, target, thrsQlMin):
    """Augment the training data with MLSMOTE samples for rare targets.

    Parameters
    ----------
    train : DataFrame
        Features together with the columns of ``target`` (including ``sig_id``).
    target : DataFrame
        ``sig_id`` in the first column followed by the label columns.
    thrsQlMin : float
        Lower quantile bound passed to ``prp.get_minority_samples`` as
        ``ql=[thrsQlMin, 1.]``.

    Returns
    -------
    (DataFrame, DataFrame)
        ``train`` and ``target`` with the synthetic rows appended and the index
        reset. Synthetic rows get ids ``id_MLSMOTE<i>``.
    """
    # Features only: every column of `target` (sig_id and labels) is removed.
    trainFeatureSMOTE = train.drop(target.columns.values.tolist(), axis=1)
    # Labels only: drop the leading sig_id column.
    targetSMOTE = target.iloc[:, 1:]

    # Select the samples of under-represented targets and synthesise new ones.
    X_sub, y_sub = prp.get_minority_samples(trainFeatureSMOTE, targetSMOTE, ql=[thrsQlMin, 1.])
    trainFeatureAug, targetAug = prp.MLSMOTE(X_sub, y_sub, len(X_sub), neigh=5)

    # TODO (from the original notes, not implemented): snap the synthetic
    # cp_time_* / cp_dose_* one-hot columns back to 0/1.

    # Give every synthetic row an id in one vectorised assignment. The new
    # column is appended last, as before; concat aligns columns by name.
    # NOTE(review): assumes MLSMOTE returns features and labels of equal length.
    targetAug = targetAug.copy()
    targetAug["sig_id"] = ["id_MLSMOTE" + str(i) for i in range(len(targetAug))]
    trainAug = pd.concat([trainFeatureAug, targetAug], axis=1)

    # Append the synthetic rows. pd.concat replaces DataFrame.append (removed in
    # pandas 2.0); ignore_index=True matches the former reset_index(drop=True).
    train = pd.concat([train, trainAug], ignore_index=True)
    target = pd.concat([target, targetAug], ignore_index=True)

    return train, target

## rankGauss

In [13]:
def rankGauss(trainFeature, testFeature, n_quantiles):
    """Apply ``prp.rankGauss`` to every gene ('g-') and cell ('c-') column.

    Each column listed in GENES + CELLS is transformed on its own; all other
    columns are passed through. The inputs are not modified.

    Returns
    -------
    (DataFrame, DataFrame)
        Transformed copies of the train and test frames.
    """
    train_out, test_out = trainFeature.copy(), testFeature.copy()
    for name in GENES + CELLS:
        transformed = prp.rankGauss(
            trainFeature[[name]], testFeature[[name]], n_quantiles=n_quantiles)
        train_out[[name]], test_out[[name]] = transformed

    return train_out, test_out

## createCluster

In [14]:
def createCluster(trainFeature, testFeature, n_clusters_g=35, n_clusters_c=5):
    """Add k-means cluster features for the gene and the cell columns.

    The feature groups are picked by position: columns 4..775 are clustered as
    'cluster_g' and columns 776..875 as 'cluster_c', each through
    ``eda.createClusterKmeans`` with ``seed=0``. Works on copies of the inputs.

    Returns
    -------
    (DataFrame, DataFrame)
        Train and test frames as returned by the clustering helper.
    """
    columns = trainFeature.columns
    # (feature columns, number of clusters, name tag)
    jobs = (
        (list(columns[4:776]), n_clusters_g, 'cluster_g'),
        (list(columns[776:876]), n_clusters_c, 'cluster_c'),
    )

    train, test = trainFeature.copy(), testFeature.copy()
    for feature_names, n_clusters, tag in jobs:
        train, test = eda.createClusterKmeans(
            train, test, feature_names, n_clusters=n_clusters, kind=tag, seed=0)

    return train, test

## statsAdd

In [15]:
def statsAdd(trainFeature, testFeature):
    """Add per-row summary statistics of the gene and cell features.

    For each of three column groups -- 'g' (columns 4..775 of ``trainFeature``),
    'c' (columns 776..875) and 'gc' (both together) -- the row-wise sum, mean,
    standard deviation, kurtosis and skewness are stored as ``<group>_sum``,
    ``<group>_mean``, ``<group>_std``, ``<group>_kurt`` and ``<group>_skew``.

    The statistics are written to copies, so the caller's frames are left
    untouched. (The previous version added the columns to the inputs in place,
    which silently changed the notebook-level raw frames.)

    Returns
    -------
    (DataFrame, DataFrame)
        Copies of the train and test frames with the 15 statistic columns.
    """
    features_g = list(trainFeature.columns[4:776])
    features_c = list(trainFeature.columns[776:876])
    groups = (
        ('g', features_g),
        ('c', features_c),
        ('gc', features_g + features_c),
    )

    def _with_stats(source):
        # Compute the five statistics for every group on a private copy.
        df = source.copy()
        for prefix, cols in groups:
            values = df[cols]
            df[prefix + '_sum'] = values.sum(axis = 1)
            df[prefix + '_mean'] = values.mean(axis = 1)
            df[prefix + '_std'] = values.std(axis = 1)
            df[prefix + '_kurt'] = values.kurtosis(axis = 1)
            df[prefix + '_skew'] = values.skew(axis = 1)
        return df

    return _with_stats(trainFeature), _with_stats(testFeature)

## Scaling

In [16]:
def Scaling(trainFeature, testFeature):
    """Standardise every column from the fourth one onwards.

    A single ``StandardScaler`` is fitted on train and test stacked together and
    then applied to both. The scaled values are written to copies, so the
    caller's frames are not modified. (The previous version overwrote the
    inputs in place.)

    NOTE(review): the first three columns are presumably non-numeric
    identifier/categorical columns. Confirm against the caller.

    Returns
    -------
    (DataFrame, DataFrame)
        Scaled copies of the train and test frames.
    """
    features = trainFeature.columns[3:]

    # Fit on train + test together, as in the original.
    scaler = StandardScaler()
    scaler.fit(pd.concat([trainFeature[features], testFeature[features]], axis = 0))

    trainFeature = trainFeature.copy()
    testFeature = testFeature.copy()
    trainFeature[features] = scaler.transform(trainFeature[features])
    testFeature[features] = scaler.transform(testFeature[features])

    return trainFeature, testFeature

## Collecting

__train,testにターゲット値も連結__

In [17]:
def Collecting(trainFeature, testFeature, trainTargetScored):
    """Attach target columns to the feature frames by inner join on ``sig_id``.

    Train is joined with ``trainTargetScored``; test is joined with the
    module-level ``sample_submission``.

    Returns
    -------
    (DataFrame, DataFrame)
        The joined train and test frames.
    """
    joined_train = pd.merge(trainFeature, trainTargetScored, on='sig_id')
    joined_test = pd.merge(testFeature, sample_submission, on='sig_id')
    return joined_train, joined_test

## Preprocessing Summary

In [18]:
def preprocessing(param, trainFeature, testFeature, trainTargetScored):
    """Run the whole feature-engineering pipeline and return model-ready frames.

    Steps, in order: row statistics, k-means cluster features, rank-Gauss
    transform, PCA components, low-variance feature removal, removal of the
    ``ctl_vehicle`` rows, one-hot encoding, and finally joining the targets back
    onto the features.

    NOTE: this definition rebinds the name ``preprocessing``, which the import
    cell also binds to the ``sklearn.preprocessing`` module.

    Parameters
    ----------
    param : dict
        Only ``param['rankGauss_n_quantiles']`` is read here (cast to int).
    trainFeature, testFeature : DataFrame
        Raw feature frames.
    trainTargetScored : DataFrame
        Raw scored targets, keyed by ``sig_id``.

    Returns
    -------
    (DataFrame, DataFrame, DataFrame)
        ``train`` (features + targets), ``test`` (features + the placeholder
        target columns joined in by ``Collecting``) and ``target`` (``sig_id``
        plus targets of the retained training rows).
    """
    rankGauss_n_quantiles=int(param['rankGauss_n_quantiles'])
    
    # Row-wise summary statistics
    trainFeature, testFeature = statsAdd(trainFeature, testFeature)

    # K-means cluster features
    trainFeature, testFeature = createCluster(trainFeature, testFeature, n_clusters_g=35, n_clusters_c=5)
    
    # Rank-Gauss transform of the g-/c- columns
    trainFeature, testFeature = rankGauss(trainFeature, testFeature, rankGauss_n_quantiles)
    
    # Append PCA components
    trainFeature, testFeature = PCA_features_add(trainFeature, testFeature)
    
    # Remove low-variance features
    trainFeature, testFeature = feature_Selection_using_Variance_Encoding(trainFeature, testFeature)
    
    # Remove the records with cp_type = ctl_vehicle.
    trainFeature, testFeature, target = drop_ctl_vehicle(trainFeature, testFeature, trainTargetScored)
    
    # Scaler candidate 2 (disabled)
    #trainFeature, testFeature = Scaling(trainFeature, testFeature)
    #print("trainFeature.shape:")
    #print(trainFeature.shape)
    #print("trainFeature.column:")
    #print(trainFeature.columns.values.tolist())
    
    #One-Hot Encoding
    trainFeature, testFeature = oneHotEncoding(trainFeature, testFeature)
    
    # Join the target values onto train and test.
    train, test = Collecting(trainFeature, testFeature, target)
    
    
    return train, test, target

## Visualization

In [19]:
%%time
# Run the full preprocessing pipeline once so its outputs can be inspected in the cells below.
trainVsl, testVsl, targetVsl = preprocessing(param_space, trainFeature, testFeature, trainTargetScored)

CPU times: user 2min 3s, sys: 1.99 s, total: 2min 5s
Wall time: 24.2 s


In [20]:
# Preview the preprocessed test frame (features plus the target columns joined in by Collecting).
testVsl.head(5)

Unnamed: 0,sig_id,0,1,2,3,4,5,6,7,8,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,-0.781655,0.26169,-0.779696,0.63929,1.530035,-0.190445,-0.290313,0.375362,-0.438749,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,id_001897cda,-0.245179,0.416399,1.169205,-0.708885,-0.51666,-0.390634,-2.248303,0.658783,-0.471261,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,id_00276f245,0.628909,0.360378,0.366456,0.61682,-0.839884,-1.149103,0.763434,-0.252542,1.089884,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,id_0027f1083,-0.564344,-1.450582,1.657585,0.303089,-0.8415,-0.00129,0.656935,1.007719,0.315749,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,id_006fc47b8,0.4905,0.901397,-1.02514,-2.11851,0.741403,-0.449726,-0.637563,1.562304,0.458562,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [21]:
# Preview the targets of the retained training rows.
targetVsl.head(5)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Preview the raw sample submission for comparison with the preprocessed test frame.
sample_submission.head(5)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,id_001897cda,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,id_002429b5b,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,id_00276f245,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,id_0027f1083,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [23]:
# Report the shapes of the preprocessed frames next to the raw sample submission.
print(f"Train: {trainVsl.shape}")
print(f"Test: {testVsl.shape}")
print(f"Target: {targetVsl.shape}")
print(f"sample_submission: {sample_submission.shape}")

Train: (21948, 1159)
Test: (3624, 1159)
Target: (21948, 207)
sample_submission: (3982, 207)


# Fitting

## Config about Fitting

In [24]:
#configは辞書化しておく。
def Config_about_Fitting(train, test, target, folds):
    """Build the dictionary describing which columns are features and targets.

    Parameters
    ----------
    train, test : unused
        Accepted for signature compatibility; not read.
    target : DataFrame
        ``sig_id`` plus the label columns.
    folds : DataFrame
        Training frame containing features, labels, ``sig_id`` and ``kfold``.

    Returns
    -------
    dict
        ``target_cols`` (label names used as y), ``feature_cols`` (columns of
        ``folds`` that are neither labels nor ``kfold``/``sig_id``, used as X),
        ``num_features`` and ``num_targets``.
    """
    target_cols = target.columns.drop('sig_id').tolist()

    # Everything in `folds` that is not a label, the fold id or the row id is a feature.
    excluded = set(target_cols) | {'kfold', 'sig_id'}
    feature_cols = [name for name in folds.columns if name not in excluded]

    return {
        "target_cols": target_cols,
        "feature_cols": feature_cols,
        "num_features": len(feature_cols),
        "num_targets": len(target_cols),
    }

## Loss, Metric

In [25]:
#metric
class LogitsLogLoss(Metric):
    """
    Binary log loss for raw (pre-sigmoid) model outputs.

    Registered under the name "logits_ll"; lower is better.
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """
        Compute the mean binary log loss after applying a sigmoid to ``y_pred``.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw score matrix or vector; a sigmoid is applied here.

        Returns
        -------
            float
            LogLoss of predictions vs targets.
        """
        eps = 1e-15  # keeps log() away from zero
        probs = 1 / (1 + np.exp(-y_pred))
        negative_part = (1 - y_true) * np.log(1 - probs + eps)
        positive_part = y_true * np.log(probs + eps)
        return np.mean(-(negative_part + positive_part))

In [26]:
#loss
class LabelSmoothingCrossEntropy(nn.Module):
    """Blend of BCE-with-logits and a log-softmax smoothing term.

    ``forward`` returns the mean of
    ``(1 - smoothing) * BCEWithLogits(x, target) + smoothing * (-mean(log_softmax(x)))``,
    where the BCE term is already averaged over all elements and the smoothing
    term is computed per row over the last dimension.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, target, smoothing=0.0001):
        # Scalar BCE over all elements.
        bce_term = F.binary_cross_entropy_with_logits(x, target)
        # Per-row mean negative log-softmax over the last dimension.
        smooth_term = -F.log_softmax(x, dim=-1).mean(dim=-1)
        confidence = 1. - smoothing
        blended = confidence * bce_term + smoothing * smooth_term
        return blended.mean()

# Run

## HyperParameter

In [27]:
# HyperParameters
# Training-level constants. EPOCHS, BATCH_SIZE, NFOLDS and EARLY_STOPPING_STEPS
# are read by the fold-splitting and training functions. DEVICE, LEARNING_RATE,
# WEIGHT_DECAY and EARLY_STOP are not referenced in the visible part of this
# notebook (run_training hard-codes its optimizer settings).
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 200                # passed to model.fit as max_epochs
BATCH_SIZE = 1024           # passed to model.fit as batch_size
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 5                  # number of CV folds
EARLY_STOPPING_STEPS = 20   # passed to model.fit as patience
EARLY_STOP = True

## CV folds

In [28]:
def CV_folds(train, target):
    """Assign every training row to one of NFOLDS multilabel-stratified folds.

    Parameters
    ----------
    train : DataFrame
        Training frame; returned as a copy with an extra integer ``kfold`` column.
    target : DataFrame
        Label matrix, row-aligned with ``train``. A ``sig_id`` column, if
        present, is excluded from the stratification because it is a row
        identifier rather than a label.

    Returns
    -------
    DataFrame
        Copy of ``train`` with the ``kfold`` column (values 0..NFOLDS-1).
    """
    folds = train.copy()
    labels = target.drop(columns='sig_id', errors='ignore')

    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)

    # split() yields positional indices, so the fold ids are collected in a
    # positional array instead of being written through label-based .loc. This
    # stays correct even if `train` does not carry a default 0..n-1 index.
    fold_of_row = np.full(len(folds), -1, dtype=int)
    for f, (_, v_idx) in enumerate(mskf.split(X=train, y=labels)):
        fold_of_row[v_idx] = f

    folds['kfold'] = fold_of_row

    return folds

In [39]:
def CV_folds_drug_id(train, target):
    """Assign CV folds while taking ``drug_id`` into account.

    Rows are joined with the module-level ``drug`` table on ``sig_id``. Drugs
    that occur at most 18 times are assigned to a fold as a whole (stratified on
    the per-drug mean of the targets); rows of drugs that occur more than 18
    times are stratified individually. Both splits use
    ``MultilabelStratifiedKFold`` with ``NFOLDS`` splits, ``shuffle=True`` and
    ``random_state=42``.

    Parameters
    ----------
    train : DataFrame
        Training frame containing ``sig_id`` and the target columns.
    target : DataFrame
        Frame whose columns after the first one are the target names.

    Returns
    -------
    DataFrame
        Copy of ``train`` with an int8 ``kfold`` column; the temporary
        ``drug_id`` column is removed again.
    """
    ### take drug_id into account ###
    
    # Target column names (everything after the first column of `target`).
    targets = target.columns[1:]
    
    # Attach drug_id to folds
    folds = train.copy()
    folds = folds.merge(drug, on='sig_id', how='left') 
    
    # LOCATE DRUGS
    # vc1: drug ids seen <= 18 times; vc2: drug ids seen > 18 times.
    vc = folds.drug_id.value_counts()
    vc1 = vc.loc[vc<=18].index.sort_values()
    vc2 = vc.loc[vc>18].index.sort_values()
    
    # STRATIFY DRUGS 18X OR LESS
    # dct1 maps drug_id -> fold, dct2 maps sig_id -> fold.
    dct1 = {}; dct2 = {}
    skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, 
              random_state=42)
    # One row per rare drug holding the mean of each target over its samples.
    tmp = folds.groupby('drug_id')[targets].mean().loc[vc1]
    for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[targets])):
        dd = {k:fold for k in tmp.index[idxV].values}
        dct1.update(dd)
    
    # STRATIFY DRUGS MORE THAN 18X
    skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, 
              random_state=42)
    # The index is reset so the positional idxV can be used as labels on tmp.sig_id.
    tmp = folds.loc[folds.drug_id.isin(vc2)].reset_index(drop=True)
    for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[targets])):
        dd = {k:fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)
    
    # ASSIGN NFOLDS
    # Rare drugs are mapped by drug_id; rows left NaN belong to frequent drugs
    # and are mapped by sig_id.
    folds['kfold'] = folds.drug_id.map(dct1)
    folds.loc[folds.kfold.isna(),'kfold'] =\
        folds.loc[folds.kfold.isna(),'sig_id'].map(dct2)
    folds.kfold = folds.kfold.astype('int8')
    
    folds = folds.drop('drug_id', axis=1)
    
    return folds

In [40]:
%%time
# Rebuild the preprocessed frames, assign the drug-aware CV folds and preview them.
#Preprocessing Data
trainVsl, testVsl, targetVsl = preprocessing(param_space, trainFeature, testFeature, trainTargetScored)
#CV folds
foldsVsl = CV_folds_drug_id(trainVsl, targetVsl)

foldsVsl.head(5)

CPU times: user 2min 10s, sys: 1.49 s, total: 2min 11s
Wall time: 24.6 s


Unnamed: 0,sig_id,0,1,2,3,4,5,6,7,8,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,1.134936,0.907607,-0.41609,-0.968042,-0.255626,-1.015203,-1.367034,-0.024938,0.679054,...,0,0,0,0,0,0,0,0,0,4
1,id_000779bfc,0.119254,0.682062,0.272262,0.080347,1.203946,0.686698,0.31455,0.554765,-0.537428,...,0,0,0,0,0,0,0,0,0,2
2,id_000a6266a,0.779855,0.94591,1.425056,-0.131341,-0.006697,1.49267,0.234401,0.364718,-0.005477,...,0,0,0,0,0,0,0,0,0,4
3,id_0015fd391,-0.735029,-0.274233,-0.438096,0.760073,2.45407,-0.859297,-2.302074,0.308738,-0.192191,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,-0.451791,-0.476988,0.972928,0.97107,1.462687,-0.870623,-0.375908,-0.204468,-1.064448,...,0,0,0,0,0,0,0,0,0,1


## Single Fold Running

In [31]:
def run_training(confFitting, Tester, fold, seed, param,
                 folds, train, test, target):
    """Train one TabNet model on one fold and predict validation and test rows.

    Parameters
    ----------
    confFitting : dict
        Provides ``feature_cols`` and ``target_cols``.
    Tester, param, train : unused
        Accepted for signature compatibility. The model settings are hard-coded
        below, and the training rows are taken from ``folds`` (the original
        code overwrote ``train`` with ``folds`` immediately).
    fold : int
        Fold id held out for validation.
    seed : int
        Seed for ``seed_everything`` and for the TabNet model.
    folds : DataFrame
        Training frame with features, targets and a ``kfold`` column.
    test : DataFrame
        Test frame containing the feature columns.
    target : DataFrame
        ``sig_id`` plus the target columns; only used for the number of targets.

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``oof``: array of shape (len(folds), n_targets), zero except for the
        validation rows of this fold, which hold sigmoid-transformed
        predictions. ``predictions``: sigmoid-transformed test predictions.
    """
    seed_everything(seed)

    # Boolean mask of the validation rows. It is applied positionally, so the
    # out-of-fold array is filled correctly whatever index `folds` carries.
    is_valid = (folds['kfold'] == fold).values

    train_df = folds[~is_valid].reset_index(drop=True)
    valid_df = folds[is_valid].reset_index(drop=True)

    feature_cols = confFitting["feature_cols"]
    target_cols = confFitting["target_cols"]
    x_train, y_train = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid = valid_df[feature_cols].values, valid_df[target_cols].values

    ### Model ###
    model_params = dict(
        n_d = 32,
        n_a = 32,
        n_steps = 1,
        gamma = 1.3,
        lambda_sparse = 0,
        optimizer_fn = optim.Adam,
        optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
        mask_type = "entmax",
        scheduler_params = dict(
            mode = "min", patience = 5, min_lr = 1e-5, factor = 0.9),
        scheduler_fn = ReduceLROnPlateau,
        seed = seed,
        verbose = 10)
    model = TabNetRegressor(**model_params)

    ##### Loss function (alternative, disabled) ######
    #train_loss_fn = LabelSmoothingCrossEntropy()

    ### Fit ###
    # Another change to the original code
    # virtual_batch_size of 32 instead of 128
    model.fit(
        X_train = x_train,
        y_train = y_train,
        eval_set = [(x_valid, y_valid)],
        eval_name = ["val"],
        eval_metric = ["logits_ll"],
        max_epochs = EPOCHS,
        patience = EARLY_STOPPING_STEPS,
        batch_size = BATCH_SIZE,
        virtual_batch_size = 32,
        num_workers = 1,
        drop_last = False,
        # To use binary cross entropy because this is not a regression problem
        loss_fn = F.binary_cross_entropy_with_logits
        #loss_fn = train_loss_fn
    )

    ### Save ###
    #dump(model, f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth")

    # The regressor is trained with a with-logits loss, so its raw outputs are
    # passed through a sigmoid to obtain probabilities.
    n_targets = target.iloc[:, 1:].shape[1]
    oof = np.zeros((len(folds), n_targets))
    oof[is_valid] = 1 / (1 + np.exp(-model.predict(x_valid)))

    #--------------------- PREDICTION---------------------
    x_test = test[feature_cols].values
    predictions = 1 / (1 + np.exp(-model.predict(x_test)))

    return oof, predictions


## K-Fold Running

In [32]:
def run_k_fold(Tester, NFOLDS, seed, param,
              folds, train, test, target, confFitting):
    """Train one model per fold and combine their outputs for a single seed.

    Out-of-fold predictions are summed across folds (each fold's array is
    zero outside its validation rows); test predictions are averaged over the
    ``NFOLDS`` models. When ``Tester`` is truthy a banner is printed per fold.

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``oof`` of shape (len(train), num_targets) and ``predictions`` of shape
        (len(test), num_targets).
    """
    n_targets = confFitting["num_targets"]
    oof = np.zeros((len(train), n_targets))
    predictions = np.zeros((len(test), n_targets))

    for fold_id in range(NFOLDS):
        if Tester:
            print('=' * 20, 'Fold', fold_id, '=' * 20)

        fold_oof, fold_pred = run_training(
            confFitting, Tester, fold_id, seed, param,
            folds, train, test, target)

        oof += fold_oof
        predictions += fold_pred / NFOLDS

    return oof, predictions

## CV Evaluation

In [33]:
 def CV_Evaluation(confFitting, oof, train, target):
    #CV score : OOFの評価結果。
    #OOF(学習モデルによるtrain dataの予測)
    train[confFitting["target_cols"]] = oof
    #target(予測結果)：ここで処理「cp_type = ctl_vehicleのレコードを削除」で抜けたところに0を入れている。
    valid_results = trainTargetScored.drop(columns=confFitting["target_cols"]).merge(train[['sig_id']+confFitting["target_cols"]], on='sig_id', how='left').fillna(0)
    
    y_true = trainTargetScored[confFitting["target_cols"]].values
    y_pred = valid_results[confFitting["target_cols"]].values
    
    score = 0
    for i in range(confFitting["num_targets"]):
        score_ = log_loss(y_true[:, i], y_pred[:, i]) #問題の評価指標によって変わる。
        score += score_ / target.shape[1]
        
    print("CV log_loss: ", score)
    
    #OOF save
    np.save(SAVEOOF + 'oof', y_pred)
    
    return score

## Postprocessing

In [34]:
# 特になし

## Submit

In [35]:
def Submit(confFitting, predictions, test):
    """Write the submission CSV for the given test predictions.

    Predictions are joined onto the ids of the module-level
    ``sample_submission`` by ``sig_id``; ids without a prediction (rows removed
    during preprocessing) are filled with 0. The file is written to
    ``{SUBMIT}submission.csv``; the directory is created if missing.

    ``test`` is only read for its ``sig_id`` column and is not modified.
    (The previous version wrote the predictions into it.)

    Parameters
    ----------
    confFitting : dict
        Provides ``target_cols``.
    predictions : np.ndarray
        Array of shape (len(test), num_targets), row-aligned with ``test``.
    test : DataFrame
        Preprocessed test frame containing ``sig_id``.
    """
    target_cols = confFitting["target_cols"]

    pred_frame = pd.DataFrame(predictions, columns=target_cols)
    pred_frame.insert(0, 'sig_id', test['sig_id'].values)

    sub = (sample_submission.drop(columns=target_cols)
           .merge(pred_frame, on='sig_id', how='left')
           .fillna(0))

    # Create the directory first so a long training run is not lost at the
    # very last step because it does not exist.
    os.makedirs(SUBMIT, exist_ok=True)
    sub.to_csv(f'{SUBMIT}submission.csv', index=False)

    print("sub.shape" + str(sub.shape))

    return

# Execute

In [36]:
def Exec(param):
    """Run the full experiment: preprocess, train, evaluate and write a submission.

    Preprocesses the notebook-level raw frames, assigns drug-aware CV folds,
    trains ``NFOLDS`` models for each of six seeds (0..5), averages the
    out-of-fold and test predictions over the seeds, prints and returns the CV
    score, and writes the submission file.

    Parameters
    ----------
    param : dict
        Hyper-parameters forwarded to ``preprocessing`` and ``run_k_fold``.

    Returns
    -------
    float
        CV score returned by ``CV_Evaluation``.
    """
    # Verbose progress banners (True/False).
    verbose = True
    seeds = [0, 1, 2, 3, 4, 5]
    n_seeds = len(seeds)

    # Data preparation, fold assignment and column configuration.
    train, test, target = preprocessing(param, trainFeature, testFeature, trainTargetScored)
    folds = CV_folds_drug_id(train, target)
    confFitting = Config_about_Fitting(train, test, target, folds)

    # Accumulators for the seed average.
    n_targets = confFitting["num_targets"]
    oof = np.zeros((len(train), n_targets))
    predictions = np.zeros((len(test), n_targets))

    ### RUN ###
    for seed in seeds:
        if verbose:
            print('~' * 20, 'SEED', seed, '~' * 20)
        seed_oof, seed_pred = run_k_fold(verbose, NFOLDS, seed, param,
                                         folds, train, test, target, confFitting)
        oof += seed_oof / n_seeds
        predictions += seed_pred / n_seeds

    # CV evaluation, then submission file.
    score = CV_Evaluation(confFitting, oof, train, target)
    Submit(confFitting, predictions, test)

    return score


In [41]:
%%time
# Full run: preprocessing, seed-averaged K-fold training, CV evaluation and submission file.
score= Exec(param_space)

~~~~~~~~~~~~~~~~~~~~ SEED 0 ~~~~~~~~~~~~~~~~~~~~
Device used : cuda
epoch 0  | loss: 0.37418 | val_logits_ll: 0.03969 |  0:00:00s
epoch 10 | loss: 0.01871 | val_logits_ll: 0.02129 |  0:00:09s
epoch 20 | loss: 0.01745 | val_logits_ll: 0.02213 |  0:00:17s
epoch 30 | loss: 0.01712 | val_logits_ll: 0.019   |  0:00:26s
epoch 40 | loss: 0.01701 | val_logits_ll: 0.01836 |  0:00:34s
epoch 50 | loss: 0.01636 | val_logits_ll: 0.01828 |  0:00:43s
epoch 60 | loss: 0.01617 | val_logits_ll: 0.01797 |  0:00:52s
epoch 70 | loss: 0.01585 | val_logits_ll: 0.01855 |  0:01:00s
epoch 80 | loss: 0.01557 | val_logits_ll: 0.01811 |  0:01:09s

Early stopping occured at epoch 82 with best_epoch = 62 and best_val_logits_ll = 0.01785
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 0.37348 | val_logits_ll: 0.03868 |  0:00:00s
epoch 10 | loss: 0.01905 | val_logits_ll: 0.02062 |  0:00:09s
epoch 20 | loss: 0.01753 | val_logits_ll: 0.02188 |  0:00:18s
epoch 30 | loss: 0.01694 |

epoch 50 | loss: 0.01669 | val_logits_ll: 0.01753 |  0:00:43s
epoch 60 | loss: 0.01642 | val_logits_ll: 0.01736 |  0:00:52s

Early stopping occured at epoch 68 with best_epoch = 48 and best_val_logits_ll = 0.01726
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 0.38265 | val_logits_ll: 0.04597 |  0:00:00s
epoch 10 | loss: 0.01944 | val_logits_ll: 0.01968 |  0:00:09s
epoch 20 | loss: 0.01753 | val_logits_ll: 0.02239 |  0:00:18s
epoch 30 | loss: 0.01682 | val_logits_ll: 0.01797 |  0:00:26s
epoch 40 | loss: 0.01667 | val_logits_ll: 0.01819 |  0:00:35s
epoch 50 | loss: 0.01636 | val_logits_ll: 0.01779 |  0:00:44s
epoch 60 | loss: 0.01615 | val_logits_ll: 0.0177  |  0:00:52s
epoch 70 | loss: 0.01599 | val_logits_ll: 0.01767 |  0:01:01s
epoch 80 | loss: 0.0155  | val_logits_ll: 0.01751 |  0:01:10s

Early stopping occured at epoch 84 with best_epoch = 64 and best_val_logits_ll = 0.01748
Best weights from best epoch are automatically used!
Device used :

Device used : cuda
epoch 0  | loss: 0.38114 | val_logits_ll: 0.04662 |  0:00:00s
epoch 10 | loss: 0.01974 | val_logits_ll: 0.01979 |  0:00:09s
epoch 20 | loss: 0.01775 | val_logits_ll: 0.02166 |  0:00:18s
epoch 30 | loss: 0.01709 | val_logits_ll: 0.01798 |  0:00:26s
epoch 40 | loss: 0.01658 | val_logits_ll: 0.01774 |  0:00:35s
epoch 50 | loss: 0.01621 | val_logits_ll: 0.01757 |  0:00:43s
epoch 60 | loss: 0.01606 | val_logits_ll: 0.01767 |  0:00:52s
epoch 70 | loss: 0.01579 | val_logits_ll: 0.01787 |  0:01:01s

Early stopping occured at epoch 79 with best_epoch = 59 and best_val_logits_ll = 0.01744
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 0.37815 | val_logits_ll: 0.04261 |  0:00:00s
epoch 10 | loss: 0.01909 | val_logits_ll: 0.01918 |  0:00:09s
epoch 20 | loss: 0.01763 | val_logits_ll: 0.01811 |  0:00:18s
epoch 30 | loss: 0.01702 | val_logits_ll: 0.02186 |  0:00:27s
epoch 40 | loss: 0.01675 | val_logits_ll: 0.0184  |  0:00:36s
epoch 50 | lo

In [42]:
# Show the CV score of the run above.
print(f"score: {score}")

score: 0.015624473906583655


In [36]:
%%time
# Second recorded run of the same call. Its execution count (36) is lower than
# that of the cell above, so this output comes from a different kernel state.
score= Exec(param_space)

~~~~~~~~~~~~~~~~~~~~ SEED 0 ~~~~~~~~~~~~~~~~~~~~
Device used : cuda
epoch 0  | loss: 0.37286 | val_logits_ll: 0.04157 |  0:00:00s
epoch 10 | loss: 0.01955 | val_logits_ll: 0.01907 |  0:00:10s
epoch 20 | loss: 0.01769 | val_logits_ll: 0.01921 |  0:00:18s
epoch 30 | loss: 0.01728 | val_logits_ll: 0.01861 |  0:00:27s
epoch 40 | loss: 0.01692 | val_logits_ll: 0.0173  |  0:00:36s
epoch 50 | loss: 0.01659 | val_logits_ll: 0.01697 |  0:00:45s
epoch 60 | loss: 0.01637 | val_logits_ll: 0.01687 |  0:00:53s
epoch 70 | loss: 0.0161  | val_logits_ll: 0.01752 |  0:01:02s
epoch 80 | loss: 0.01565 | val_logits_ll: 0.01665 |  0:01:11s
epoch 90 | loss: 0.01547 | val_logits_ll: 0.01666 |  0:01:20s
epoch 100| loss: 0.01493 | val_logits_ll: 0.01685 |  0:01:29s

Early stopping occured at epoch 106 with best_epoch = 86 and best_val_logits_ll = 0.01657
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 0.37356 | val_logits_ll: 0.04124 |  0:00:00s
epoch 10 | loss: 0.01927 

epoch 100| loss: 0.01483 | val_logits_ll: 0.01717 |  0:01:30s

Early stopping occured at epoch 104 with best_epoch = 84 and best_val_logits_ll = 0.0168
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 0.38771 | val_logits_ll: 0.04644 |  0:00:00s
epoch 10 | loss: 0.01924 | val_logits_ll: 0.01904 |  0:00:09s
epoch 20 | loss: 0.01759 | val_logits_ll: 0.02048 |  0:00:19s
epoch 30 | loss: 0.01702 | val_logits_ll: 0.02055 |  0:00:28s
epoch 40 | loss: 0.01678 | val_logits_ll: 0.01731 |  0:00:36s
epoch 50 | loss: 0.01669 | val_logits_ll: 0.0172  |  0:00:45s
epoch 60 | loss: 0.0164  | val_logits_ll: 0.01714 |  0:00:54s
epoch 70 | loss: 0.01606 | val_logits_ll: 0.0169  |  0:01:03s
epoch 80 | loss: 0.01582 | val_logits_ll: 0.01682 |  0:01:12s
epoch 90 | loss: 0.01575 | val_logits_ll: 0.0168  |  0:01:21s
epoch 100| loss: 0.01567 | val_logits_ll: 0.01706 |  0:01:30s
epoch 110| loss: 0.01507 | val_logits_ll: 0.01678 |  0:01:39s
epoch 120| loss: 0.01507 | val_l

epoch 0  | loss: 0.35885 | val_logits_ll: 0.03873 |  0:00:00s
epoch 10 | loss: 0.01913 | val_logits_ll: 0.02003 |  0:00:10s
epoch 20 | loss: 0.01754 | val_logits_ll: 0.01971 |  0:00:18s
epoch 30 | loss: 0.0171  | val_logits_ll: 0.0173  |  0:00:28s
epoch 40 | loss: 0.01672 | val_logits_ll: 0.01718 |  0:00:37s
epoch 50 | loss: 0.01647 | val_logits_ll: 0.01706 |  0:00:46s
epoch 60 | loss: 0.01673 | val_logits_ll: 0.01691 |  0:00:55s
epoch 70 | loss: 0.01621 | val_logits_ll: 0.01687 |  0:01:04s
epoch 80 | loss: 0.01597 | val_logits_ll: 0.01672 |  0:01:13s
epoch 90 | loss: 0.01576 | val_logits_ll: 0.01689 |  0:01:22s
epoch 100| loss: 0.01537 | val_logits_ll: 0.01668 |  0:01:31s

Early stopping occured at epoch 106 with best_epoch = 86 and best_val_logits_ll = 0.01664
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 0.35464 | val_logits_ll: 0.03489 |  0:00:01s
epoch 10 | loss: 0.01923 | val_logits_ll: 0.02104 |  0:00:11s
epoch 20 | loss: 0.01758 | val_

Device used : cuda
epoch 0  | loss: 0.36492 | val_logits_ll: 0.03749 |  0:00:00s
epoch 10 | loss: 0.01903 | val_logits_ll: 0.02098 |  0:00:09s
epoch 20 | loss: 0.01757 | val_logits_ll: 0.02055 |  0:00:18s
epoch 30 | loss: 0.01711 | val_logits_ll: 0.01991 |  0:00:27s
epoch 40 | loss: 0.01664 | val_logits_ll: 0.01714 |  0:00:37s
epoch 50 | loss: 0.01654 | val_logits_ll: 0.01717 |  0:00:46s
epoch 60 | loss: 0.01621 | val_logits_ll: 0.01732 |  0:00:55s
epoch 70 | loss: 0.0159  | val_logits_ll: 0.01683 |  0:01:04s
epoch 80 | loss: 0.01574 | val_logits_ll: 0.0168  |  0:01:13s
epoch 90 | loss: 0.01543 | val_logits_ll: 0.01692 |  0:01:22s

Early stopping occured at epoch 93 with best_epoch = 73 and best_val_logits_ll = 0.01676
Best weights from best epoch are automatically used!
Device used : cuda
epoch 0  | loss: 0.36461 | val_logits_ll: 0.03832 |  0:00:00s
epoch 10 | loss: 0.01907 | val_logits_ll: 0.02199 |  0:00:09s
epoch 20 | loss: 0.01771 | val_logits_ll: 0.02122 |  0:00:19s
epoch 30 | lo

In [37]:
# Show the CV score of the run above.
print(f"score: {score}")

score: 0.01478125867998373


# Predict

In [41]:
def run_predict(confFitting, param, test, target, fold, seed):
    """Predict on the test features with one saved (seed, fold) model.

    Parameters
    ----------
    confFitting : dict
        Fitting configuration; only ``confFitting["feature_cols"]`` is read.
    param
        Unused here; kept so the signature stays compatible with callers.
    test
        Table indexable by the feature column list and exposing ``.values``
        (presumably a pandas DataFrame -- confirm against the caller).
    target
        Unused here; kept so the signature stays compatible with callers.
    fold, seed
        Identify the model file ``{SAVEMODEL}SEED{seed}_FOLD{fold}.pth``.

    Returns
    -------
    numpy.ndarray
        Logistic sigmoid of ``model.predict(x_test)``, i.e. the raw model
        outputs squashed into (0, 1).
    """
    seed_everything(seed)

    #--------------------- PREDICTION---------------------
    x_test = test[confFitting["feature_cols"]].values

    # NOTE(review): joblib's `load` unpickles the file, which can execute
    # arbitrary code -- only point SAVEMODEL at model files you produced.
    model = load(f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth")

    # The original pre-allocated a zero array here and overwrote it on the
    # very next line; that dead allocation has been removed.
    predictions = 1 / (1 + np.exp(-model.predict(x_test)))

    del model

    return predictions


In [42]:
def run_k_fold_predict(confFitting, test, target, param, Tester, NFOLDS, seed):
    """Average the test predictions of all fold models for a single seed.

    Calls ``run_predict`` once per fold in ``range(NFOLDS)`` and accumulates
    each result with equal weight ``1 / NFOLDS``. When ``Tester`` is truthy a
    banner line is printed before every fold.

    Returns an array of shape ``(len(test), confFitting["num_targets"])``.
    """
    n_rows, n_targets = len(test), confFitting["num_targets"]
    predictions = np.zeros((n_rows, n_targets))
    banner = '=' * 20

    for fold_idx in range(NFOLDS):
        if Tester:
            print(banner, 'Fold', fold_idx, banner)
        fold_pred = run_predict(confFitting, param, test, target, fold_idx, seed)

        # Equal-weight running mean over the folds.
        predictions += fold_pred / NFOLDS

    return predictions

In [43]:
def SubmitPredict(confFitting, predictions, test, prefix):
    """Write test predictions to ``{SUBMIT}{prefix}submission.csv``.

    Parameters
    ----------
    confFitting : dict
        Fitting configuration; ``confFitting["target_cols"]`` names the
        prediction columns.
    predictions
        2-D array with one row per row of ``test`` and one column per
        target column.
    test
        Table providing the ``sig_id`` column that keys the predictions.
        It is no longer modified by this function.
    prefix : str
        Prepended to ``submission.csv`` in the output file name.

    The predictions are left-joined onto the ``sig_id`` column of the global
    ``sample_submission``; ids without a prediction are filled with 0.
    """
    target_cols = confFitting["target_cols"]

    # Build a separate frame rather than writing the target columns into the
    # caller's `test` (the original mutated its argument in place).
    pred_frame = pd.DataFrame(predictions, columns=target_cols)
    # `.values` sidesteps index alignment: rows are matched by position.
    pred_frame.insert(0, 'sig_id', test['sig_id'].values)

    sub = (
        sample_submission.drop(columns=target_cols)
        .merge(pred_frame, on='sig_id', how='left')
        .fillna(0)
    )
    sub.to_csv(f'{SUBMIT}{prefix}submission.csv', index=False)

    print("sub.shape" + str(sub.shape))

    return

In [44]:
def Predict(param):
    """Run inference with the saved models and write the submission file.

    Preprocesses the module-level train/test/target frames with ``param``,
    rebuilds the CV folds and fitting config, averages the k-fold predictions
    of every seed with equal weight, and hands the result to ``SubmitPredict``
    under the prefix ``"TabnetRegressor"``.
    """
    # Verbose progress banners (True/False).
    verbose = True

    # Preprocess the data.
    train, test, target = preprocessing(param, trainFeature, testFeature, trainTargetScored)

    # CV folds, then the fitting configuration derived from them.
    folds = CV_folds(train, target)
    confFitting = Config_about_Fitting(train, test, target, folds)

    # Average over multiple seeds.
    seeds = [0, 1, 2, 3, 4, 5]
    n_seeds = len(seeds)
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    rule = '~' * 20

    ### RUN ###
    for seed in seeds:
        if verbose:
            print(rule, 'SEED', seed, rule)
        seed_pred = run_k_fold_predict(confFitting, test, target, param, verbose, NFOLDS, seed)
        predictions += seed_pred / n_seeds

    # Write the submission.
    SubmitPredict(confFitting, predictions, test, "TabnetRegressor")

    return

In [45]:
%%time
# Run the full inference pipeline with `param_space`: Predict averages the
# saved models' predictions over all seeds and folds and writes the
# submission CSV. The enclosing %%time magic reports how long the cell took.
Predict(param_space)

~~~~~~~~~~~~~~~~~~~~ SEED 0 ~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~ SEED 1 ~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~ SEED 2 ~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~ SEED 3 ~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~ SEED 4 ~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~ SEED 5 ~~~~~~~~~~~~~~~~~~~~
sub.shape(3982, 207)
CPU times: user 2min 28s, sys: 1.88 s, total: 2min 30s
Wall time: 28.2 s


# Hyperparameter Tuning

In [32]:
#hyperopt
from hyperopt import fmin, tpe, hp, rand, Trials

In [33]:
def HOptExec(param):
    """Objective function for hyperopt: train with ``param`` and return the CV score.

    Preprocesses the module-level frames with ``param``, builds the CV folds
    and fitting config, runs ``run_k_fold`` for every seed, averages the
    out-of-fold predictions with equal weight and scores them with
    ``CV_Evaluation``. Progress banners are disabled.
    """
    # Verbose progress banners (True/False).
    verbose = False

    # Preprocess the data.
    train, test, target = preprocessing(param, trainFeature, testFeature, trainTargetScored)

    # CV folds, then the fitting configuration derived from them.
    folds = CV_folds(train, target)
    confFitting = Config_about_Fitting(train, test, target, folds)

    # Average over multiple seeds.
    seeds = [0, 1, 2, 3, 4, 5]
    n_seeds = len(seeds)
    n_targets = confFitting["num_targets"]
    oof = np.zeros((len(train), n_targets))
    predictions = np.zeros((len(test), n_targets))
    rule = '~' * 20

    ### RUN ###
    for seed in seeds:
        if verbose:
            print(rule, 'SEED', seed, rule)
        seed_oof, seed_pred = run_k_fold(verbose, NFOLDS, seed, param,
                                         folds, train, test, target, confFitting)
        oof += seed_oof / n_seeds
        predictions += seed_pred / n_seeds

    # CV evaluation.
    score = CV_Evaluation(confFitting, oof, train, target)

    # Submission is disabled during tuning; `predictions` is only consumed
    # by the call below when it is re-enabled.
    #Submit(confFitting, predictions, test)

    return score

In [1]:
%%time

# NOTE: this cell needs the hyperopt import cell and the cells defining
# HOptExec and PARAMSPACE to have been executed first. The recorded
# "NameError: name 'Trials' is not defined" output shows it was last run on a
# kernel where the import cell had not been executed.

# Trials object handed to fmin below (hyperopt stores the per-evaluation
# history in it).
trials = Trials()

# Search PARAMSPACE for the parameters minimising HOptExec's return value
# (the CV score), using the TPE suggestion algorithm for 15 evaluations.
hopt = fmin(fn = HOptExec, 
            space = PARAMSPACE, 
            algo = tpe.suggest, 
            max_evals = 15, 
            #timeout = 8.9 * 60 * 60,   # optional wall-clock cap in seconds (~8.9 h), currently disabled
            trials = trials, 
           )

# Show the value returned by fmin (the best parameters found).
print(hopt)

NameError: name 'Trials' is not defined