In [1]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

import tidalUtl.PrpUtl as prp
import tidalUtl.EdaUtl as eda

In [2]:
#import sys
#sys.path.append('../input/iterative-stratification/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

https://www.kaggle.com/tidalryoku/new-baseline-pytorch-moa/

# Version

__ver1__<br>
baseline：CV:0.01465 LB:0.01874<br>
__ver2__<br>
Hyperopt, 2Layer：CV:0.01460 LB:0.01869<br>
__ver3__<br>
3Layer：CV:0.01464 LB:0.01868<br>
__ver4__<br>
MLSMOTE baseline：CV:0.01476 LB:0.01978<br>
__ver5__<br>
2Layer,refactoring：CV:0.01476 LB:0.01869<br>
__ver6__<br>
rankGauss：CV:0.01456 LB:0.01865<br>
__ver7__<br>
labelSmoothing：CV:0.01502 LB:0.01859<br>

# Config

In [3]:
INPUT = "/home/tidal/ML_Data/MoA/lish-moa"
OUTPUT = "/home/tidal/ML_Data/MoA/output"
#INPUT = "/Users/hfuis/ML_Data/MoA/lish-moa"
#OUTPUT = "/Users/hfuis/ML_Data/MoA/output"

SUBMIT = OUTPUT + "/submittion/"
SAVEMODEL = OUTPUT + "/model/Pytorch/"
SAVEOOF = OUTPUT + "/OOF/Pytorch/"

In [4]:
#Loading
trainFeature = pd.read_csv(INPUT + '/train_features.csv')
testFeature = pd.read_csv(INPUT + '/test_features.csv')
trainTargetScored = pd.read_csv(INPUT + '/train_targets_scored.csv')
sample_submission = pd.read_csv(INPUT + '/sample_submission.csv')
drug = pd.read_csv(INPUT + '/train_drug.csv')

In [5]:
GENES = [col for col in trainFeature.columns if col.startswith('g-')] #gから始まる列名のセット
CELLS = [col for col in trainFeature.columns if col.startswith('c-')] #cから始まる列名のセット

In [6]:
#Seed固定
def seed_everything(seed=42):
    #data取得についてのランダム性固定
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #cudnnによる演算の安定化(評価値の安定)
    torch.backends.cudnn.deterministic = True
    
    #os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(seed=42)

In [7]:
#HyperParameter
param_space = {'hidden_size1': 512, 
               'hidden_size2': 512, 
               'dropOutRate1': 0.20393004966355735, 
               'dropOutRate2': 0.39170486751620137,
               'rankGauss_n_quantiles': 488.0393350201078,
               'leakyReluSlope': 0.01973893854348531,
              }

# Preprocessing

## Func: In & Out Type is DataFrame

### PCA features add

In [8]:
def PCA_features_add(trainFeature, testFeature):
    # GENES
    n_comp = 50
    
    inTrain = trainFeature[GENES]
    inTest = testFeature[GENES]
    
    #PCA実行＆変換後のデータ作成
    pca_train, pca_test, _ = prp.tidalPCA(inTrain, inTest, Dim=n_comp, random_state=42)
    
    #columの名前付け
    trainTmp = pd.DataFrame(pca_train, columns=[f'pca_G-{i}' for i in range(n_comp)])
    testTmp = pd.DataFrame(pca_test, columns=[f'pca_G-{i}' for i in range(n_comp)])
    
    #データに付け足し
    trainFeature = pd.concat((trainFeature, trainTmp), axis=1)
    testFeature = pd.concat((testFeature, testTmp), axis=1)
    
    
    # CELLS
    # CELLSもGENESと同様。
    n_comp = 15
    
    inTrain = trainFeature[CELLS]
    inTest = testFeature[CELLS]
    
    pca_train, pca_test, _ = prp.tidalPCA(inTrain, inTest, Dim=n_comp, random_state=42)
    
    trainTmp = pd.DataFrame(pca_train, columns=[f'pca_C-{i}' for i in range(n_comp)])
    testTmp = pd.DataFrame(pca_test, columns=[f'pca_C-{i}' for i in range(n_comp)])
    
    trainFeature = pd.concat((trainFeature, trainTmp), axis=1)
    testFeature = pd.concat((testFeature, testTmp), axis=1)
    
    
    return trainFeature, testFeature

### feature Selection using Variance Encoding

In [9]:
def feature_Selection_using_Variance_Encoding(trainFeature, testFeature):
    data = trainFeature.append(testFeature)
    
    #['sig_id','cp_type','cp_time','cp_dose']を除いたfeatureで低い分散の特徴量を除去
    #列名は連番になる。
    data_transformed = prp.tidalVarianceThrs(data.iloc[:, 4:], threshold=0.5)
    
    
    trainFeature_transformed = data_transformed[ : trainFeature.shape[0]]
    testFeature_transformed = data_transformed[-testFeature.shape[0] : ]
    
    trainFeature = trainFeature[['sig_id','cp_type','cp_time','cp_dose']]
    trainFeature = pd.concat([trainFeature, pd.DataFrame(trainFeature_transformed)], axis=1)
    
    testFeature = testFeature[['sig_id','cp_type','cp_time','cp_dose']]
    testFeature = pd.concat([testFeature, pd.DataFrame(testFeature_transformed)], axis=1)

    
    return trainFeature, testFeature

## cp_type = ctl_vehicleのレコードを削除

__※提出用データ(test)も同様に一部レコードを削除するが、こちらは最後submittionデータを作る際に0埋めを行う。__<br>
__（CV_Evaluation(), Submit()参照。）__

In [10]:
def drop_ctl_vehicle(trainFeature, testFeature, trainTargetScored):
    
    #Pkey(sig_id)でfeatureとtargetを内部結合。
    train = trainFeature.merge(trainTargetScored, on='sig_id')
    test = testFeature.merge(sample_submission, on='sig_id')
    
    #件のレコードを削除。
    train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
    test = test[test['cp_type']!='ctl_vehicle'].reset_index(drop=True)
    target = train[trainTargetScored.columns]
    
    #cp_typeは使用しない。(今となっては全て同じ特徴量(trt_cp)であるため)
    train = train.drop('cp_type', axis=1)
    test = test.drop('cp_type', axis=1)
    
    #trainFeature,testFeatureに戻す
    tmpTraget = trainTargetScored.drop('sig_id', axis=1)
    trainFeature = train.drop(tmpTraget.columns, axis=1)
    testFeature = test.drop(tmpTraget.columns, axis=1)
    
    
    return trainFeature, testFeature, target
    

## One-Hot Encoding

In [11]:
def oneHotEncoding(train, test):
    #One-Hot Encoding(カテゴリデータをすべてOne-Hot化)
    feature_name = ['cp_time','cp_dose']
    train, test = prp.OneHot_encode(train, test, feature_name)
    
    return train, test

## MLSMOTE

__MULTI SMOTE: 頻度の少ないターゲットに当たるTrainDataをAugumentする手法__

In [12]:
def MlSmote(train, target, thrsQlMin):
    #trainについている['sig_id']を除いたfeatureを使う。
    #(targetも除く)
    trainFeatureSMOTE = train.drop(target.columns.values.tolist(), axis=1)
    #targetについている['sig_id']を除去。
    targetSMOTE = target.iloc[:,1:]
    
    #MLSMOTE実行
    X_sub, y_sub = prp.get_minority_samples(trainFeatureSMOTE, targetSMOTE, ql=[thrsQlMin, 1.])  # ターゲットの頻度が不足のデータを返す。
    trainFeatureAug, targetAug = prp.MLSMOTE(X_sub, y_sub, len(X_sub), neigh=5)  # Applying MLSMOTE to augment the dataframe
    
    #cp_time_*, cp_dose_*で絶対値の大きなものを1,それ以外を0に変更。
    
    #train,targetの形に成形(targetにsig_idを付与。trainにtargetをくっ付ける。)
    #1.targetにsig_idを付与
    targetAug["sig_id"] = ""
    for i in range(len(trainFeatureAug)):
        addedId = "id_MLSMOTE"+str(i)
        targetAug.iloc[i,-1]= addedId
    #2.trainにtargetをくっ付ける。
    trainAug = pd.concat([trainFeatureAug, targetAug], axis=1)
    
    #AugmentDataを元のデータにくっ付ける.
    train = train.append(trainAug)
    target = target.append(targetAug)
    
    #インデックス整理
    train = train.reset_index(drop=True)
    target = target.reset_index(drop=True)
    
    return train, target

## rankGauss

In [13]:
def rankGauss(trainFeature, testFeature, n_quantiles):
    dfTrain = trainFeature.copy()
    dfTest = testFeature.copy()
    #'g-','c-'が対象。
    for col in (GENES + CELLS):
        dfTrain[[col]], dfTest[[col]] = prp.rankGauss(trainFeature[[col]], testFeature[[col]],n_quantiles=n_quantiles)
    
    return dfTrain, dfTest

## createCluster

In [14]:
def createCluster(trainFeature, testFeature, n_clusters_g=35, n_clusters_c=5):
    #"g-"と"c-"でそれぞれクラスター分析を行う。
    features_g = list(trainFeature.columns[4:776])
    features_c = list(trainFeature.columns[776:876])
    
    train = trainFeature.copy()
    test = testFeature.copy()
    
    #実行。
    train, test = eda.createClusterKmeans(train, test, features_g, n_clusters=n_clusters_g, kind = 'cluster_g', seed = 0)
    train, test = eda.createClusterKmeans(train, test, features_c, n_clusters=n_clusters_c, kind = 'cluster_c', seed = 0)
    
    return train, test

## statsAdd

In [15]:
def statsAdd(trainFeature, testFeature):
    features_g = list(trainFeature.columns[4:776])
    features_c = list(trainFeature.columns[776:876])
    
    for df in trainFeature, testFeature:
        df['g_sum'] = df[features_g].sum(axis = 1)
        df['g_mean'] = df[features_g].mean(axis = 1)
        df['g_std'] = df[features_g].std(axis = 1)
        df['g_kurt'] = df[features_g].kurtosis(axis = 1)
        df['g_skew'] = df[features_g].skew(axis = 1)
        df['c_sum'] = df[features_c].sum(axis = 1)
        df['c_mean'] = df[features_c].mean(axis = 1)
        df['c_std'] = df[features_c].std(axis = 1)
        df['c_kurt'] = df[features_c].kurtosis(axis = 1)
        df['c_skew'] = df[features_c].skew(axis = 1)
        df['gc_sum'] = df[features_g + features_c].sum(axis = 1)
        df['gc_mean'] = df[features_g + features_c].mean(axis = 1)
        df['gc_std'] = df[features_g + features_c].std(axis = 1)
        df['gc_kurt'] = df[features_g + features_c].kurtosis(axis = 1)
        df['gc_skew'] = df[features_g + features_c].skew(axis = 1)
        
    return trainFeature, testFeature

## Scaling

In [16]:
def Scaling(trainFeature, testFeature):
    features = trainFeature.columns[3:]
    
    #Scaler
    scaler = StandardScaler()
    scaler.fit(pd.concat([trainFeature[features], testFeature[features]], axis = 0))
    
    trainFeature[features] = scaler.transform(trainFeature[features])
    testFeature[features] = scaler.transform(testFeature[features])
    
    return trainFeature, testFeature

## Collecting

__train,testにターゲット値も連結__

In [17]:
def Collecting(trainFeature, testFeature, trainTargetScored):
    #Pkey(sig_id)でfeatureとtargetを内部結合。
    train = trainFeature.merge(trainTargetScored, on='sig_id')
    test = testFeature.merge(sample_submission, on='sig_id')
    
    return train, test

## Preprocessing Summary

In [18]:
def preprocessing(param, trainFeature, testFeature, trainTargetScored):
    rankGauss_n_quantiles=int(param['rankGauss_n_quantiles'])
    
    #Scaler候補１
    #print("trainFeature.shape:")
    #print(trainFeature.shape)
    #print("trainFeature.column:")
    #print(trainFeature.columns.values.tolist())
    
    #statsAdd
    trainFeature, testFeature = statsAdd(trainFeature, testFeature)

    #createCluster
    trainFeature, testFeature = createCluster(trainFeature, testFeature, n_clusters_g=35, n_clusters_c=5)
    
    #rankGauss
    trainFeature, testFeature = rankGauss(trainFeature, testFeature, rankGauss_n_quantiles)
    
    #PCA成分付与
    trainFeature, testFeature = PCA_features_add(trainFeature, testFeature)
    
    #低分散特徴量除去
    trainFeature, testFeature = feature_Selection_using_Variance_Encoding(trainFeature, testFeature)
    
    #cp_type = ctl_vehicleのレコードを削除.
    trainFeature, testFeature, target = drop_ctl_vehicle(trainFeature, testFeature, trainTargetScored)
    
    #Scaler候補２
    #trainFeature, testFeature = Scaling(trainFeature, testFeature)
    #print("trainFeature.shape:")
    #print(trainFeature.shape)
    #print("trainFeature.column:")
    #print(trainFeature.columns.values.tolist())
    
    #One-Hot Encoding
    trainFeature, testFeature = oneHotEncoding(trainFeature, testFeature)
    
    #train,testにターゲット値を連結。
    train, test = Collecting(trainFeature, testFeature, target)
    
    
    return train, test, target

## Work

## Visualization

In [19]:
%%time
trainVsl, testVsl, targetVsl = preprocessing(param_space, trainFeature, testFeature, trainTargetScored)

CPU times: user 2min 5s, sys: 1.7 s, total: 2min 7s
Wall time: 24.5 s


In [20]:
trainVsl.head(5)

Unnamed: 0,sig_id,0,1,2,3,4,5,6,7,8,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,1.134936,0.907607,-0.41609,-0.968042,-0.255626,-1.015203,-1.367034,-0.024938,0.679054,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0.119254,0.682062,0.272262,0.080347,1.203946,0.686698,0.31455,0.554765,-0.537428,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0.779855,0.94591,1.425056,-0.131341,-0.006697,1.49267,0.234401,0.364718,-0.005477,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,-0.735029,-0.274233,-0.438096,0.760073,2.45407,-0.859297,-2.302074,0.308738,-0.192191,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,-0.451791,-0.476988,0.972928,0.97107,1.462687,-0.870623,-0.375908,-0.204468,-1.064448,...,0,0,0,0,0,0,0,0,0,0


In [21]:
targetVsl.head(5)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
sample_submission.head(5)

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,id_001897cda,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,id_002429b5b,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,id_00276f245,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,id_0027f1083,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [23]:
print("Train: "+ str(trainVsl.shape))
print("Test: "+ str(testVsl.shape))
print("Target: "+ str(targetVsl.shape))
print("sample_submission: "+ str(sample_submission.shape))

Train: (21948, 1159)
Test: (3624, 1159)
Target: (21948, 207)
sample_submission: (3982, 207)


# Fitting

## Config about Fitting

In [24]:
#configは辞書化しておく。
def Config_about_Fitting(train, test, target, folds):
    confFitting = {}
    
    #Fitするときに"y"として使う列の列名配列
    confFitting["target_cols"] = target.drop('sig_id', axis=1).columns.values.tolist()
    #Fitするときに"X"として使う列の列名配列
    #kfold, id等はここで削除。
    feature_cols = [c for c in folds.columns if c not in confFitting["target_cols"]]
    confFitting["feature_cols"] = [c for c in feature_cols if c not in ['kfold','sig_id']]
    #特徴量、ターゲットのサイズ
    confFitting["num_features"]=len(confFitting["feature_cols"])
    confFitting["num_targets"]=len(confFitting["target_cols"])
    
    return confFitting

## Dataset Classes

In [25]:
#Train,Valid用のデータクラス
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        #torch.DataLoaderに入れるための形式
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)            
        }
        return dct
    
#Test用のデータクラス
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            #torch.DataLoaderに入れるための形式
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct
    

## Loss, Metric

In [26]:
#loss
class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self):
        super(LabelSmoothingCrossEntropy, self).__init__()
    def forward(self, x, target, smoothing=0.001):
        confidence = 1. - smoothing
        logprobs = F.log_softmax(x, dim=-1)
        bcs_loss = nn.BCEWithLogitsLoss()(x, target)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = confidence * bcs_loss + smoothing * smooth_loss
        return loss.mean()

In [27]:
#metric
#nn.BCEWithLogitsLoss()

## Func: Fitting, Evaluation, Predict

In [28]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds
   
    

## Model architect

In [29]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, param):
        super(Model, self).__init__()
        #hyperoptによる被探索パラメータ
        hidden_size1=param['hidden_size1']
        hidden_size2=param['hidden_size2']
        dropOutRate1=param['dropOutRate1']
        dropOutRate2=param['dropOutRate2']
        leakyReluSlope=param['leakyReluSlope']
        
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(dropOutRate1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size1))
        self.leakyRelu1 = nn.LeakyReLU(negative_slope=leakyReluSlope)
        
        self.batch_norm2 = nn.BatchNorm1d(hidden_size1)
        self.dropout2 = nn.Dropout(dropOutRate2)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size1, hidden_size2))
        self.leakyRelu2 = nn.LeakyReLU(negative_slope=leakyReluSlope)
        
        self.batch_norm3 = nn.BatchNorm1d(hidden_size2)
        self.dropout3 = nn.Dropout(dropOutRate2)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size2, num_targets))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.leakyRelu1(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.leakyRelu2(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x

# Run

## HyperParameter

In [30]:
# HyperParameters
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 5
EARLY_STOPPING_STEPS = 10
EARLY_STOP = True

## CV folds

In [31]:
def CV_folds(train, target):
    folds = train.copy()
    
    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS)
    
    for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
        folds.loc[v_idx, 'kfold'] = int(f)
    
    folds['kfold'] = folds['kfold'].astype(int)
    
    return folds

In [32]:
def CV_folds_drug_id(train, target):
    ###drug_idを考慮####
    
    targets = target.columns[1:]
    
    # foldsにdrug_id付与
    folds = train.copy()
    folds = folds.merge(drug, on='sig_id', how='left') 
    
    # LOCATE DRUGS
    vc = folds.drug_id.value_counts()
    vc1 = vc.loc[vc<=18].index.sort_values()
    vc2 = vc.loc[vc>18].index.sort_values()
    
    # STRATIFY DRUGS 18X OR LESS
    dct1 = {}; dct2 = {}
    skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, 
              random_state=42)
    tmp = folds.groupby('drug_id')[targets].mean().loc[vc1]
    for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[targets])):
        dd = {k:fold for k in tmp.index[idxV].values}
        dct1.update(dd)
    
    # STRATIFY DRUGS MORE THAN 18X
    skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, 
              random_state=42)
    tmp = folds.loc[folds.drug_id.isin(vc2)].reset_index(drop=True)
    for fold,(idxT,idxV) in enumerate( skf.split(tmp,tmp[targets])):
        dd = {k:fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)
    
    # ASSIGN NFOLDS
    folds['kfold'] = folds.drug_id.map(dct1)
    folds.loc[folds.kfold.isna(),'kfold'] =\
        folds.loc[folds.kfold.isna(),'sig_id'].map(dct2)
    folds.kfold = folds.kfold.astype('int8')
    
    folds = folds.drop('drug_id', axis=1)
    
    return folds

In [33]:
%%time
#Preprocessing Data
trainVsl, testVsl, targetVsl = preprocessing(param_space, trainFeature, testFeature, trainTargetScored)
#CV folds
foldsVsl = CV_folds_drug_id(trainVsl, targetVsl)

foldsVsl.head(5)

CPU times: user 2min 11s, sys: 1.65 s, total: 2min 12s
Wall time: 25.2 s


Unnamed: 0,sig_id,0,1,2,3,4,5,6,7,8,...,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,kfold
0,id_000644bb2,1.134936,0.907607,-0.41609,-0.968042,-0.255626,-1.015203,-1.367034,-0.024938,0.679054,...,0,0,0,0,0,0,0,0,0,4
1,id_000779bfc,0.119254,0.682062,0.272262,0.080347,1.203946,0.686698,0.31455,0.554765,-0.537428,...,0,0,0,0,0,0,0,0,0,2
2,id_000a6266a,0.779855,0.94591,1.425056,-0.131341,-0.006697,1.49267,0.234401,0.364718,-0.005477,...,0,0,0,0,0,0,0,0,0,4
3,id_0015fd391,-0.735029,-0.274233,-0.438096,0.760073,2.45407,-0.859297,-2.302074,0.308738,-0.192191,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,-0.451791,-0.476988,0.972928,0.97107,1.462687,-0.870623,-0.375908,-0.204468,-1.064448,...,0,0,0,0,0,0,0,0,0,1


## Single Fold Running

In [34]:
def run_training(confFitting, Tester, fold, seed, param,
                 folds, train, test, target):
    
    seed_everything(seed)
    
    train = folds
    
    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index
    
    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)
    
    x_train, y_train  = train_df[confFitting["feature_cols"]].values, train_df[confFitting["target_cols"]].values
    x_valid, y_valid =  valid_df[confFitting["feature_cols"]].values, valid_df[confFitting["target_cols"]].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=confFitting["num_features"],
        num_targets=confFitting["num_targets"],
        param=param
    )
    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    
    ##### 評価関数 ######
    train_loss_fn = LabelSmoothingCrossEntropy()
    valid_loss_fn = nn.BCEWithLogitsLoss()
    
    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer,scheduler, train_loss_fn, trainloader, DEVICE)
        valid_loss, valid_preds = valid_fn(model, valid_loss_fn, validloader, DEVICE)
        if Tester:
            print("EPOCH: {:03}: | train_loss: {:.3f}: | valid_loss: {:.3f}".format(epoch, train_loss, valid_loss))
        
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth")
        
        elif(EARLY_STOP == True):
            early_step += 1
            if (early_step >= early_stopping_steps):
                if Tester:
                    print('Early stopping. Best Val loss: {:.3f}'.format(best_loss))
                break
            
    
    #--------------------- PREDICTION---------------------
    x_test = test[confFitting["feature_cols"]].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=confFitting["num_features"],
        num_targets=confFitting["num_targets"],
        param=param
    )
    
    model.load_state_dict(torch.load(f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth"))
    model.to(DEVICE)
    
    predictions = np.zeros((len(test), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    
    return oof, predictions


## K-Fold Running

In [35]:
def run_k_fold(Tester, NFOLDS, seed, param,
              folds, train, test, target, confFitting):
    oof = np.zeros((len(train), confFitting["num_targets"]))
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    for fold in range(NFOLDS):
        if Tester:
            print('=' * 20, 'Fold', fold, '=' * 20)
        oof_, pred_ = run_training(confFitting, Tester, fold, seed, param,
                                   folds, train, test, target)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

## CV Evaluation

In [36]:
 def CV_Evaluation(confFitting, oof, train, target):
    #CV score : OOFの評価結果。
    #OOF(学習モデルによるtrain dataの予測)
    train[confFitting["target_cols"]] = oof
    #target(予測結果)：ここで処理「cp_type = ctl_vehicleのレコードを削除」で抜けたところに0を入れている。
    valid_results = trainTargetScored.drop(columns=confFitting["target_cols"]).merge(train[['sig_id']+confFitting["target_cols"]], on='sig_id', how='left').fillna(0)
    
    y_true = trainTargetScored[confFitting["target_cols"]].values
    y_pred = valid_results[confFitting["target_cols"]].values
    
    score = 0
    for i in range(confFitting["num_targets"]):
        score_ = log_loss(y_true[:, i], y_pred[:, i]) #問題の評価指標によって変わる。
        score += score_ / target.shape[1]
        
    print("CV log_loss: ", score)
    
    #OOF save
    np.save(SAVEOOF + 'oof', y_pred)
    
    return score

## Postprocessing

In [37]:
# 特になし

## Submit

In [38]:
def Submit(confFitting, predictions, test):
    test[confFitting["target_cols"]] = predictions
    sub = sample_submission.drop(columns=confFitting["target_cols"]).merge(test[['sig_id']+confFitting["target_cols"]], on='sig_id', how='left').fillna(0)
    sub.to_csv(f'{SUBMIT}submission.csv', index=False)

    print("sub.shape" + str(sub.shape))
    
    return

# Execute

In [39]:
def Exec(param):
    
    #Tester(True/False)
    Tester = True
    
    #Preprocessing Data
    train, test, target = preprocessing(param, trainFeature, testFeature, trainTargetScored)
    
    #CV folds
    folds = CV_folds_drug_id(train, target)
    
    #Config about Fitting
    confFitting = Config_about_Fitting(train, test, target, folds)
    
    # Averaging on multiple SEEDS
    SEED = [0, 1, 2, 3 ,4, 5]
    oof = np.zeros((len(train), confFitting["num_targets"]))
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    ### RUN ###
    for seed in SEED:
        if Tester:
            print('~' * 20, 'SEED', seed, '~' * 20)
        oof_, predictions_ = run_k_fold(Tester, NFOLDS, seed, param,
                                       folds, train, test, target, confFitting)
        oof += oof_ / len(SEED)
        predictions += predictions_ / len(SEED)
    
    #CV 評価
    score = CV_Evaluation(confFitting, oof, train, target)
    
    # 課題提出
    Submit(confFitting, predictions, test)
    
    return score


In [40]:
%%time
score= Exec(param_space)
print("score: " + str(score))

~~~~~~~~~~~~~~~~~~~~ SEED 0 ~~~~~~~~~~~~~~~~~~~~
EPOCH: 000: | train_loss: 0.569: | valid_loss: 0.049
EPOCH: 001: | train_loss: 0.031: | valid_loss: 0.021
EPOCH: 002: | train_loss: 0.026: | valid_loss: 0.019
EPOCH: 003: | train_loss: 0.025: | valid_loss: 0.019
EPOCH: 004: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 005: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 006: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 007: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 008: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 009: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 010: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 011: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 012: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 013: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 014: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 015: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 016: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 017: | train_loss: 0.023: | valid_loss: 0.0

EPOCH: 022: | train_loss: 0.021: | valid_loss: 0.018
EPOCH: 023: | train_loss: 0.021: | valid_loss: 0.018
EPOCH: 024: | train_loss: 0.021: | valid_loss: 0.018
EPOCH: 000: | train_loss: 0.569: | valid_loss: 0.052
EPOCH: 001: | train_loss: 0.031: | valid_loss: 0.020
EPOCH: 002: | train_loss: 0.026: | valid_loss: 0.019
EPOCH: 003: | train_loss: 0.025: | valid_loss: 0.018
EPOCH: 004: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 005: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 006: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 007: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 008: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 009: | train_loss: 0.024: | valid_loss: 0.017
EPOCH: 010: | train_loss: 0.024: | valid_loss: 0.017
EPOCH: 011: | train_loss: 0.024: | valid_loss: 0.017
EPOCH: 012: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 013: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 014: | train_loss: 0.023: | valid_loss: 0.017
EPOCH: 015: | train_loss: 0.023: | valid_loss:

EPOCH: 019: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 020: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 021: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 022: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 023: | train_loss: 0.021: | valid_loss: 0.017
Early stopping. Best Val loss: 0.017
EPOCH: 000: | train_loss: 0.571: | valid_loss: 0.050
EPOCH: 001: | train_loss: 0.031: | valid_loss: 0.020
EPOCH: 002: | train_loss: 0.026: | valid_loss: 0.019
EPOCH: 003: | train_loss: 0.025: | valid_loss: 0.018
EPOCH: 004: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 005: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 006: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 007: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 008: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 009: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 010: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 011: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 012: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 01

EPOCH: 021: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 022: | train_loss: 0.021: | valid_loss: 0.017
EPOCH: 023: | train_loss: 0.021: | valid_loss: 0.017
EPOCH: 024: | train_loss: 0.021: | valid_loss: 0.017
EPOCH: 000: | train_loss: 0.566: | valid_loss: 0.047
EPOCH: 001: | train_loss: 0.031: | valid_loss: 0.021
EPOCH: 002: | train_loss: 0.026: | valid_loss: 0.019
EPOCH: 003: | train_loss: 0.025: | valid_loss: 0.018
EPOCH: 004: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 005: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 006: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 007: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 008: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 009: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 010: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 011: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 012: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 013: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 014: | train_loss: 0.023: | valid_loss:

EPOCH: 018: | train_loss: 0.023: | valid_loss: 0.017
EPOCH: 019: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 020: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 021: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 022: | train_loss: 0.021: | valid_loss: 0.017
EPOCH: 023: | train_loss: 0.021: | valid_loss: 0.017
EPOCH: 024: | train_loss: 0.021: | valid_loss: 0.017
EPOCH: 000: | train_loss: 0.570: | valid_loss: 0.049
EPOCH: 001: | train_loss: 0.031: | valid_loss: 0.021
EPOCH: 002: | train_loss: 0.026: | valid_loss: 0.019
EPOCH: 003: | train_loss: 0.025: | valid_loss: 0.019
EPOCH: 004: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 005: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 006: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 007: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 008: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 009: | train_loss: 0.024: | valid_loss: 0.018
EPOCH: 010: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 011: | train_loss: 0.023: | valid_loss:

EPOCH: 017: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 018: | train_loss: 0.023: | valid_loss: 0.018
EPOCH: 019: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 020: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 021: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 022: | train_loss: 0.022: | valid_loss: 0.017
EPOCH: 023: | train_loss: 0.021: | valid_loss: 0.017
EPOCH: 024: | train_loss: 0.021: | valid_loss: 0.017
CV log_loss:  0.01580344356749381
sub.shape(3982, 207)
score: 0.01580344356749381
CPU times: user 14min 46s, sys: 7.3 s, total: 14min 54s
Wall time: 11min 49s


# Predict

In [7]:
def run_predict(confFitting, param, test, target, fold, seed):
    
    seed_everything(seed)
  
    #--------------------- PREDICTION---------------------
    x_test = test[confFitting["feature_cols"]].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=confFitting["num_features"],
        num_targets=confFitting["num_targets"],
        param=param
    )
    
    model.load_state_dict(torch.load(f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth"))
    model.to(DEVICE)
    
    predictions = np.zeros((len(test), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    
    return predictions


In [8]:
def run_k_fold_predict(confFitting, test, target, param, Tester, NFOLDS, seed):
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    for fold in range(NFOLDS):
        if Tester:
            print('=' * 20, 'Fold', fold, '=' * 20)
        pred_ = run_predict(confFitting, param, test, target, fold, seed)
        
        predictions += pred_ / NFOLDS
        
    return predictions

In [45]:
def SubmitPredict(confFitting, predictions, test, prefix):
    test[confFitting["target_cols"]] = predictions
    sub = sample_submission.drop(columns=confFitting["target_cols"]).merge(test[['sig_id']+confFitting["target_cols"]], on='sig_id', how='left').fillna(0)
    sub.to_csv(f'{SUBMIT}{prefix}submission.csv', index=False)

    print("sub.shape" + str(sub.shape))
    
    return

In [46]:
def Predict(param):
    #Tester(True/False)
    Tester = False
    
    #Preprocessing Data
    train, test, target = preprocessing(param, trainFeature, testFeature, trainTargetScored)
    
    #CV folds
    folds = CV_folds(train, target)
    
    #Config about Fitting
    confFitting = Config_about_Fitting(train, test, target, folds)
    
    # Averaging on multiple SEEDS
    SEED = [0, 1, 2, 3 ,4, 5]
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    ### RUN ###
    for seed in SEED:
        if Tester:
            print('~' * 20, 'SEED', seed, '~' * 20)
        predictions_ = run_k_fold_predict(confFitting, test, target, param, Tester, NFOLDS, seed)
        predictions += predictions_ / len(SEED)
    
    # 課題提出
    prefix = "Pytorch"
    SubmitPredict(confFitting, predictions, test, prefix)
    
    return

In [65]:
%%time
Predict(param_space)

sub.shape(3982, 207)
CPU times: user 18.5 s, sys: 12.8 s, total: 31.3 s
Wall time: 6.22 s


# Hyperparameter Tuning

In [29]:
#hyperopt
from hyperopt import fmin, tpe, hp, rand, Trials

In [30]:
def HOptExec(param):
    #Tester(True/False)
    Tester = False
    
    #Preprocessing Data
    train, test, target = preprocessing(param, trainFeature, testFeature, trainTargetScored)
    
    #CV folds
    folds = CV_folds(train, target)
    
    #Config about Fitting
    confFitting = Config_about_Fitting(train, test, target, folds)
    
    # Averaging on multiple SEEDS
    SEED = [0, 1, 2, 3 ,4, 5]
    oof = np.zeros((len(train), confFitting["num_targets"]))
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    ### RUN ###
    for seed in SEED:
        if Tester:
            print('~' * 20, 'SEED', seed, '~' * 20)
        oof_, predictions_ = run_k_fold(Tester, NFOLDS, seed, param,
                                       folds, train, test, target, confFitting)
        oof += oof_ / len(SEED)
        predictions += predictions_ / len(SEED)
    
    #CV 評価
    score = CV_Evaluation(confFitting, oof, train, target)
    
    # 課題提出
    #Submit(confFitting, predictions, test)
    
    return score

In [None]:
%%time

param_space = {'hidden_size1': 512, 
               'hidden_size2': 512, 
               'dropOutRate1': 0.20393004966355735, 
               'dropOutRate2': 0.39170486751620137,
               'rankGauss_n_quantiles': 488.0393350201078,
               'leakyReluSlope': hp.uniform('leakyReluSlope', 1e-3, 1e-1),
              }

trials = Trials()

hopt = fmin(fn = HOptExec, 
            space = param_space, 
            algo = tpe.suggest, 
            max_evals = 15, 
            #timeout = 8.9 * 60 * 60, 
            trials = trials, 
           )

print(hopt)

CV log_loss:                                          
0.014981391207012364                                  
CV log_loss:                                                                         
0.01504250432043703                                                                  
CV log_loss:                                                                         
0.015004835293169368                                                                 
CV log_loss:                                                                         
0.015002514832957038                                                                 
CV log_loss:                                                                         
0.015008986227264749                                                                 
CV log_loss:                                                                         
0.014993115273980633                                                                   
CV log_loss:                