In [1]:
#!pip install datatable > /dev/null

import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns
import datatable as dt

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, roc_curve

#PurgedGroupTimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

#model
from xgboost import XGBClassifier
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

import tidalUtl.PrpUtl as prp
import tidalUtl.EdaUtl as eda
from sklearn.model_selection import GroupKFold

#model save
from joblib import dump, load

In [2]:
# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]


In [3]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name

        if col_type not in ['object', 'category', 'datetime64[ns, UTC]']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Version

__ver1__<br>
baseline：CV:0.01465 LB:0.01874<br>


# Config

In [4]:
MODEL = "Xgboost"

In [5]:
INPUT = "/home/tidal/ML_Data/JaneStreet/jane-street-market-prediction"
OUTPUT = "/home/tidal/ML_Data/JaneStreet/output"
#INPUT = "/Users/hfuis/ML_Data/aneStreet/jane-street-market-prediction"
#OUTPUT = "/Users/hfuis/ML_Data/aneStreet/jane-street-market-prediction"

SUBMIT = OUTPUT + "/submittion/"
SAVEMODEL = OUTPUT + "/model/" + MODEL +"/"
SAVEOOF = OUTPUT + "/OOF/" + MODEL +"/"

In [6]:
%%time
#Loading
train_data_datatable = dt.fread(INPUT + '/train.csv')
trainFeature = train_data_datatable.to_pandas()
print(trainFeature.shape)
#trainFeature = trainFeature.query('weight > 0').pipe(reduce_mem_usage).reset_index(drop = True)
#print(trainFeature.shape)

(2390491, 138)
CPU times: user 25.3 s, sys: 1.8 s, total: 27.1 s
Wall time: 2.63 s


In [7]:
features = [c for c in trainFeature.columns if 'feature' in c]

In [8]:
#Seed固定
def seed_everything(seed=42):
    #data取得についてのランダム性固定
    random.seed(seed)
    np.random.seed(seed)
    #torch.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    #cudnnによる演算の安定化(評価値の安定)
    #torch.backends.cudnn.deterministic = True
    
    #os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(seed=42)

In [9]:
#HyperParameter
param_space = {'n_estimators': 527,
               'max_depth': 4,
               'learning_rate': 0.03580866008035822,
               'subsample': 0.5289264117776996,
               'colsample_bytree': 0.8436545143704768,
               'gamma': 5,
               'missing': -999,
               'tree_method': 'gpu_hist',
               "actThrs":0.5,
              }

# Preprocessing

In [10]:
trainFeature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float64(135), int32(3)
memory usage: 2.4 GB


In [11]:
trainFeature

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.000000,0.009916,0.014079,0.008773,0.001390,0.006270,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.178850,1.777472,-0.915458,2.831612,-1.417010,2.297459,-1.304614,1.898684,1
2,0,0.000000,0.025134,0.027607,0.033406,0.034380,0.023970,-1,0.812780,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.000000,-0.004730,-0.003273,-0.000461,-0.000476,-0.003200,-1,1.174378,0.344640,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,,0.344850,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390486,499,0.000000,0.000142,0.000142,0.005829,0.020342,0.015396,1,-1.649365,-1.169996,...,-1.896874,-1.260055,1.947725,-1.994399,-1.685163,-2.866165,-0.216130,-1.892048,0.901585,2390486
2390487,499,0.000000,0.000012,0.000012,-0.000935,-0.006326,-0.004718,1,2.432943,5.284504,...,-0.936553,1.064936,3.119762,-0.419796,-0.208975,-0.146749,0.730166,0.648452,2.068737,2390487
2390488,499,0.000000,0.000499,0.000499,0.007605,0.024907,0.016591,1,-0.622475,-0.963682,...,-2.956745,-0.640334,-2.279663,-0.950259,-4.388417,-1.669922,-3.288939,-1.336142,-2.814239,2390488
2390489,499,0.283405,-0.000156,-0.000156,-0.001375,-0.003702,-0.002004,-1,-1.463757,-1.107228,...,-2.035894,-1.780962,0.881246,-2.202140,-1.912601,-3.341684,-0.571188,-2.185795,0.627452,2390489


In [12]:
features

['feature_0',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_8',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_14',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_19',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23',
 'feature_24',
 'feature_25',
 'feature_26',
 'feature_27',
 'feature_28',
 'feature_29',
 'feature_30',
 'feature_31',
 'feature_32',
 'feature_33',
 'feature_34',
 'feature_35',
 'feature_36',
 'feature_37',
 'feature_38',
 'feature_39',
 'feature_40',
 'feature_41',
 'feature_42',
 'feature_43',
 'feature_44',
 'feature_45',
 'feature_46',
 'feature_47',
 'feature_48',
 'feature_49',
 'feature_50',
 'feature_51',
 'feature_52',
 'feature_53',
 'feature_54',
 'feature_55',
 'feature_56',
 'feature_57',
 'feature_58',
 'feature_59',
 'feature_60',
 'feature_61',
 'feature_62',
 'feature_63',
 'feature_64',
 'feature_65',
 'feature_66',
 'fea

## Func: In & Out Type is DataFrame

### Fill NAN

In [13]:
def FillNAN(trainFeature_inp):
    trainFeature = trainFeature_inp.copy()
    #trainFeature = trainFeature_inp
    
    
    f_mean = trainFeature[features[1:]].mean()
    trainFeature = trainFeature.query('weight > 0').pipe(reduce_mem_usage).reset_index(drop = True)
    
    
    #欠損値を平均で埋める
    trainFeature[features[1:]] = trainFeature[features[1:]].fillna(f_mean)
    
    #targetを生成
    trainFeature['action'] = (trainFeature['resp'] > 0).astype('int')
    
    
    
    return trainFeature

## Scaling

In [14]:
def Scaling(trainFeature_inp):
    trainFeature = trainFeature_inp.copy()
    #trainFeature = trainFeature_inp
    
    #Scaler
    scaler = StandardScaler()
    scaler.fit(trainFeature[features])
    
    trainFeature[features] = scaler.transform(trainFeature[features])
    
    del scaler
    
    return trainFeature

## Collecting

__train,testにターゲット値も連結__

In [15]:
def Collecting(trainFeature_inp):
    trainFeature = trainFeature_inp.copy()
    #trainFeature = trainFeature_inp
    
    #targetを生成
    trainFeature['action'] = (trainFeature['resp'] > 0).astype('int')
    
    #target単体でも保持。
    target = trainFeature['action'].copy()
    target = pd.DataFrame(target,columns=["action"])
    return trainFeature, target

## Preprocessing Summary

In [16]:
def preprocessing(param, trainFeature):
    
    #欠損値処理
    trainFeature = FillNAN(trainFeature)
    
    #standard scaler
    trainFeature = Scaling(trainFeature)
    
    #trainにターゲット値を連結。
    train, target = Collecting(trainFeature)
    
    
    return train, target

## Work

## Visualization

In [17]:
%%time
#trainVsl, targetVsl = preprocessing(param_space, trainFeature)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs


In [18]:
#trainVsl.head(5)

In [19]:
#targetVsl.head(5)

In [20]:
#targetVsl.value_counts()

In [21]:
#type(targetVsl)

In [22]:
#print("Train: "+ str(trainVsl.shape))
#print("Target: "+ str(targetVsl.shape))

In [23]:
#eda.chkDfIsNull(trainFeature)

In [24]:
#eda.chkDfIsNull(trainVsl)

# Fitting

## Config about Fitting

In [25]:
#configは辞書化しておく。
def Config_about_Fitting(train, target):
    confFitting = {}
    
    #Fitするときに"y"として使う列の列名配列
    confFitting["target_cols"] = target.columns.values.tolist()
    #Fitするときに"X"として使う列の列名配列
    #kfold, id等はここで削除。
    #feature_cols = [c for c in train.columns if c not in confFitting["target_cols"]]
    #confFitting["feature_cols"] = [c for c in feature_cols if c not in ['kfold']]
    confFitting["feature_cols"] = features
    #特徴量、ターゲットのサイズ
    confFitting["num_features"]=len(confFitting["feature_cols"])
    confFitting["num_targets"]=len(confFitting["target_cols"])
    
    return confFitting

## Loss, Metric

In [26]:
#loss
#なし。

In [27]:
#metric
#roc_auc_score(y_val, oof[te])

# Run

## HyperParameter

In [28]:
# HyperParameters
#DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 4096
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 5
EARLY_STOPPING_STEPS = 7
EARLY_STOP = True
GROUP_GAP = 31

## CV folds

In [29]:
def CV_folds(train, target):
    folds = train.copy()
    
    gkf = GroupKFold(n_splits=NFOLDS)
    
    for f, (t_idx, v_idx) in enumerate(gkf.split(train, target, train["date"])):
        folds.loc[v_idx, 'kfold'] = int(f)
        #print(type(v_idx))
        #print(v_idx.shape)
    
    folds['kfold'] = folds['kfold'].astype(int)
    
    return folds

In [30]:
#1->train, 0->valid, -1->使用なし
def CV_folds_PurgedTimeSeries(train, target):
    folds = train.copy()
    
    gkf = PurgedGroupTimeSeriesSplit(n_splits=NFOLDS, group_gap=GROUP_GAP)
    #kf = PurgedGroupTimeSeriesSplit(
    #   n_splits=NFOLDS,
    #   max_train_group_size=150,
    #   group_gap=20,
    #   max_test_group_size=60
    #
    
    for f, (t_idx, v_idx) in enumerate(gkf.split(train.values, target.values, train["date"].values)):
        
        folds.loc[t_idx, f'kfold_{f}'] = 1
        folds.loc[v_idx, f'kfold_{f}'] = 0
        folds[f'kfold_{f}'] = folds[f'kfold_{f}'].fillna(-1)
        folds[f'kfold_{f}'] = folds[f'kfold_{f}'].astype(int)
        #print("train:",len(t_idx),"  valid:",len(v_idx))
    
    return folds

In [31]:
%%time
##Preprocessing Data
#trainVsl, targetVsl = preprocessing(param_space, trainFeature)
##CV folds
#foldsVsl= CV_folds_PurgedTimeSeries(trainVsl, targetVsl)
#
#foldsVsl.head(5)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.58 µs


In [32]:
#foldsVsl["kfold_0"].value_counts()

In [33]:
#foldsVsl["kfold_1"].value_counts()

In [34]:
#foldsVsl["kfold_2"].value_counts()

In [35]:
#foldsVsl.query('date > 100')

## CV Evaluation

In [36]:
def utility_score_bincount(date, weight, resp, action):
    #print(date.shape)
    #print(weight.shape)
    #print(resp.shape)
    #print(action.shape)
    #a = weight * resp * action
    #print(a.shape)
    
    
    count_i = len(np.unique(date))
    Pi = np.bincount(date, weight * resp * action)
    t = np.sum(Pi) / np.sqrt(np.sum(Pi ** 2)) * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * np.sum(Pi)
    return u

In [37]:
def CV_Evaluation(confFitting, oof, train, fold, param):
    
    #print(train[confFitting["target_cols"]].shape)
    #print(train.loc[:,confFitting["target_cols"]].shape)
    #print(len(oof))
    #print(oof.shape)
    train[confFitting["target_cols"]] = oof
    
    #各引数の生成
    date =  train["date"].values
    weight = train["weight"].values
    resp = train["resp"].values
    action = train["action"].values
    action = np.where(action >= param["actThrs"], 1, 0).astype(int)
    #print(action)
    #utility scoreの計算。
    score = utility_score_bincount(date, weight, resp, action)
        
    print("Fold", fold, " CV utility score: ", score)
    
    return score

## Single Fold Running

In [51]:
def run_training(confFitting, Tester, fold, seed, param,
                 folds, train, target):
    
    seed_everything(seed)
    
    train = folds.copy()
    
    trn_idx = train[train[f'kfold_{fold}'] == 1].index
    val_idx = train[train[f'kfold_{fold}'] == 0].index
    
    train_df = train[train[f'kfold_{fold}'] == 1].reset_index(drop=True)
    valid_df = train[train[f'kfold_{fold}'] == 0].reset_index(drop=True)
    
    x_train, y_train  = train_df[confFitting["feature_cols"]].values, train_df[confFitting["target_cols"]].values
    x_valid, y_valid =  valid_df[confFitting["feature_cols"]].values, valid_df[confFitting["target_cols"]].values
    
    params = {'colsample_bytree':  param["colsample_bytree"],
              'gamma': param["gamma"],
              'learning_rate': param["learning_rate"],
              'max_depth': param["max_depth"],
              'n_estimators': param["max_depth"],
              'subsample': param["subsample"],
              'tree_method':param["tree_method"],
              'missing': param["missing"]
             }
    
    model = XGBClassifier(**params)
    
    model.fit(x_train, y_train)
    #print("y_train shape: ", y_train.shape)
    #print(y_train[0:5,:])
    #print("class: ", model.classes_)
    
    ### Save ###
    dump(model, f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth")
    
    oof = np.zeros((len(train), target.iloc[:, :].shape[1]))
    #oof = np.zeros(len(train))
    tmp_oof = model.predict(x_valid)
    print(tmp_oof)
    tmp_oof = np.array([tmp_oof]).T
    #tmp_np = np.array(tmp_oof)
    print("tmp_oof shape: ", tmp_oof.shape)
    #print(tmp_np[0:5,:])
    #print("oof[val_idx].shape", oof[val_idx].shape)
    #print("np.array([tmp_oof[:,1]]).T", np.array([tmp_oof[:,1]]).T.shape)
    #print("np.array([tmp_oof[:,1]])", np.array([tmp_oof[:,1]]).shape)
    #print("oof[val_idx] zero",oof[val_idx])
    oof[val_idx] = tmp_oof
    #print("oof[val_idx]",oof[val_idx])
    
    #foldごとのCV evaluation(utility score)
    score = CV_Evaluation(confFitting, tmp_oof, valid_df, fold, param)
    
    #foldごとのAUC
    score_auc = roc_auc_score(y_valid, tmp_oof)
    print("CV AUC score: ", score_auc)
    
    del train
    
    return oof
    ################本コンペの仕様上テストデータがない#############################################
    
    #--------------------- PREDICTION---------------------
    #x_test = test[confFitting["feature_cols"]].values
    #testdataset = TestDataset(x_test)
    #testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    #model = Model(
    #    num_features=confFitting["num_features"],
    #    num_targets=confFitting["num_targets"],
    #    param=param
    #)
    #
    #model.load_state_dict(torch.load(f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth"))
    #model.to(DEVICE)
    #
    #predictions = np.zeros((len(test), target.iloc[:, 1:].shape[1]))
    #predictions = inference_fn(model, testloader, DEVICE)
    #
    #
    #return oof, predictions


## K-Fold Running

In [52]:
def run_k_fold(Tester, NFOLDS, seed, param,
              folds, train, target, confFitting):
    oof = np.zeros((len(train), confFitting["num_targets"]))
    #predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    for fold in range(NFOLDS):
        if Tester:
            print('=' * 20, 'Fold', fold, '=' * 20)
        oof_= run_training(confFitting, Tester, fold, seed, param,
                                   folds, train, target)
        
        #predictions += pred_ / NFOLDS
        oof += oof_
        
    #return oof, predictions
    return oof

## Postprocessing

In [53]:
# 特になし

## Submit

In [54]:
def Submit(confFitting, predictions, test):
    test[confFitting["target_cols"]] = predictions
    sub = sample_submission.drop(columns=confFitting["target_cols"]).merge(test[['sig_id']+confFitting["target_cols"]], on='sig_id', how='left').fillna(0)
    sub.to_csv(f'{SUBMIT}submission.csv', index=False)

    print("sub.shape" + str(sub.shape))
    
    return

# Execute

In [55]:
def Exec(param):
    
    #Tester(True/False)
    Tester = True
    
    #Preprocessing Data
    train, target = preprocessing(param_space, trainFeature)
    
    #CV folds
    folds = CV_folds_PurgedTimeSeries(train, target)
    
    #Config about Fitting
    confFitting = Config_about_Fitting(train, target)
    
    # Averaging on multiple SEEDS
    #SEED = [0, 1, 2, 3 ,4, 5]
    SEED = [42]
    oof = np.zeros((len(train), confFitting["num_targets"]))
    #predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    ### RUN ###
    for seed in SEED:
        if Tester:
            print('~' * 20, 'SEED', seed, '~' * 20)
        oof_ = run_k_fold(Tester, NFOLDS, seed, param,
                                       folds, train, target, confFitting)
        oof += oof_ / len(SEED)
        #predictions += predictions_ / len(SEED)
    
    #CV 評価
    #utility
    score_utl = CV_Evaluation(confFitting, oof, train, -1, param)
    score_auc = roc_auc_score(target.values, oof)
    print("CV AUC score: ", score_auc)
    
    
    # 課題提出
    #Submit(confFitting, predictions, test)
    
    #OOF save
    np.save(SAVEOOF + 'oof', oof)
    
    return score_auc


In [56]:
%%time
oof= Exec(param_space)
#print("score: " + str(score))

Memory usage of dataframe is 2078.45 MB
Memory usage after optimization is: 540.40 MB
Decreased by 74.0%
~~~~~~~~~~~~~~~~~~~~ SEED 42 ~~~~~~~~~~~~~~~~~~~~
[1 1 0 ... 1 1 1]
tmp_oof shape:  (258754, 1)
Fold 0  CV utility score:  412.23281271271253
CV AUC score:  0.5119519786700366
[1 1 1 ... 0 1 1]
tmp_oof shape:  (295599, 1)
Fold 1  CV utility score:  40.07107275899171
CV AUC score:  0.5095271298771297
[1 1 1 ... 1 1 1]
tmp_oof shape:  (314367, 1)
Fold 2  CV utility score:  683.3142440326053
CV AUC score:  0.5144350316769792
[1 1 1 ... 1 1 1]
tmp_oof shape:  (341972, 1)
Fold 3  CV utility score:  -0.0
CV AUC score:  0.5064201856488102
[1 1 1 ... 1 1 1]
tmp_oof shape:  (370311, 1)
Fold 4  CV utility score:  246.1495432344157
CV AUC score:  0.5081047026676145
Fold -1  CV utility score:  417.0619063627436
CV AUC score:  0.5050432218859087
CPU times: user 1min 26s, sys: 11.1 s, total: 1min 37s
Wall time: 1min 21s


# Predict

In [44]:
def run_predict(confFitting, param, test, target, fold, seed):
    
    seed_everything(seed)
  
    #--------------------- PREDICTION---------------------
    x_test = test[confFitting["feature_cols"]].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=confFitting["num_features"],
        num_targets=confFitting["num_targets"],
        param=param
    )
    
    model.load_state_dict(torch.load(f"{SAVEMODEL}SEED{seed}_FOLD{fold}.pth"))
    model.to(DEVICE)
    
    predictions = np.zeros((len(test), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    
    return predictions


In [45]:
def run_k_fold_predict(confFitting, test, target, param, Tester, NFOLDS, seed):
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    for fold in range(NFOLDS):
        if Tester:
            print('=' * 20, 'Fold', fold, '=' * 20)
        pred_ = run_predict(confFitting, param, test, target, fold, seed)
        
        predictions += pred_ / NFOLDS
        
    return predictions

In [46]:
def SubmitPredict(confFitting, predictions, test, prefix):
    test[confFitting["target_cols"]] = predictions
    sub = sample_submission.drop(columns=confFitting["target_cols"]).merge(test[['sig_id']+confFitting["target_cols"]], on='sig_id', how='left').fillna(0)
    sub.to_csv(f'{SUBMIT}{prefix}submission.csv', index=False)

    print("sub.shape" + str(sub.shape))
    
    return

In [47]:
def Predict(param):
    #Tester(True/False)
    Tester = False
    
    #Preprocessing Data
    train, target = preprocessing(param_space, trainFeature)
    
    #CV folds
    folds = CV_folds(train, target)
    
    #Config about Fitting
    confFitting = Config_about_Fitting(train, test, target, folds)
    
    # Averaging on multiple SEEDS
    SEED = [0, 1, 2, 3 ,4, 5]
    predictions = np.zeros((len(test), confFitting["num_targets"]))
    
    ### RUN ###
    for seed in SEED:
        if Tester:
            print('~' * 20, 'SEED', seed, '~' * 20)
        predictions_ = run_k_fold_predict(confFitting, test, target, param, Tester, NFOLDS, seed)
        predictions += predictions_ / len(SEED)
    
    # 課題提出
    prefix = ""
    SubmitPredict(confFitting, predictions, test, prefix)
    
    return

In [48]:
#%%time
#Predict(param_space)

# Hyperparameter Tuning

In [49]:
#hyperopt
from hyperopt import fmin, tpe, hp, rand, Trials

In [50]:
%%time

#HyperParameter
#param_space = {'hidden_size1': hp.choice('hidden_size1', [394, 512, 896, 1024]), 
#               'hidden_size2': hp.choice('hidden_size2', [394, 512, 896, 1024]), 
#               'hidden_size3': hp.choice('hidden_size3', [394, 512, 896, 1024]),
#               'hidden_size4': hp.choice('hidden_size4', [394, 512, 896, 1024]),
#               'hidden_size5': hp.choice('hidden_size5', [394, 512, 896, 1024]),
#               'hidden_size6': hp.choice('hidden_size6', [394, 512, 896, 1024]),
#               'dropOutRate0': hp.uniform('dropOutRate0', 0, 0.4), 
#               'dropOutRate1': hp.uniform('dropOutRate1', 0, 0.4),
#               'dropOutRate2': hp.uniform('dropOutRate2', 0, 0.4),
#               'dropOutRate3': hp.uniform('dropOutRate3', 0, 0.4),
#               'dropOutRate4': hp.uniform('dropOutRate4', 0, 0.4),
#               'dropOutRate5': hp.uniform('dropOutRate5', 0, 0.4),
#               'dropOutRate6': hp.uniform('dropOutRate6', 0, 0.4),
#               'smoothingRate': hp.loguniform('smoothingRate', -7, -2),
#               'actThrs':hp.uniform('actThrs', 0, 1),
#              }
#
#trials = Trials()
#
#hopt = fmin(fn = Exec, 
#            space = param_space, 
#            algo = tpe.suggest, 
#            max_evals = 25, 
#            #timeout = 8.9 * 60 * 60, 
#            trials = trials, 
#           )
#
#print(hopt)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
