In [25]:
import numpy as np
import random
import pandas as pd
import os
import copy

from tqdm import tqdm
import pickle
import optuna

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.metrics import log_loss
from sklearn.multioutput import MultiOutputClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.pipeline import Pipeline
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [26]:
# Folder that holds the competition CSV files, relative to this notebook.
data_dir = '../input/lish-moa/'
# Display the folder contents so missing inputs are spotted before loading.
os.listdir(path=data_dir)

['.ipynb_checkpoints',
 'sample_submission.csv',
 'test_features.csv',
 'train_drug.csv',
 'train_features.csv',
 'train_targets_nonscored.csv',
 'train_targets_scored.csv']

In [27]:
# Read each CSV table found under `data_dir` into its own DataFrame.
train_features = pd.read_csv(data_dir + 'train_features.csv')
train_targets_scored = pd.read_csv(data_dir + 'train_targets_scored.csv')
train_targets_nonscored = pd.read_csv(data_dir + 'train_targets_nonscored.csv')
train_drug = pd.read_csv(data_dir + 'train_drug.csv')
test_features = pd.read_csv(data_dir + 'test_features.csv')
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')

# Report the (rows, columns) shape of every table as a quick sanity check.
for table_name, table in (
    ('train_features', train_features),
    ('train_targets_scored', train_targets_scored),
    ('train_targets_nonscored', train_targets_nonscored),
    ('train_drug', train_drug),
    ('test_features', test_features),
    ('sample_submission', sample_submission),
):
    print(f'{table_name}: {table.shape}')

train_features: (23814, 876)
train_targets_scored: (23814, 207)
train_targets_nonscored: (23814, 403)
train_drug: (23814, 2)
test_features: (3982, 876)
sample_submission: (3982, 207)


In [28]:
# Group the feature columns by name prefix: 'g-' columns go to GENES,
# 'c-' columns go to CELLS (column order is preserved).
feature_columns = list(train_features.columns)
GENES = [name for name in feature_columns if name.startswith('g-')]
CELLS = [name for name in feature_columns if name.startswith('c-')]

# Show the first ten names of each group.
print(f'GENES: {GENES[:10]}')
print(f'CELLS: {CELLS[:10]}')

GENES: ['g-0', 'g-1', 'g-2', 'g-3', 'g-4', 'g-5', 'g-6', 'g-7', 'g-8', 'g-9']
CELLS: ['c-0', 'c-1', 'c-2', 'c-3', 'c-4', 'c-5', 'c-6', 'c-7', 'c-8', 'c-9']


In [29]:
# Map every GENES/CELLS column onto a normal distribution, one column at a time.
# Each transformer is fitted on the training column only and then applied to
# both the training and the test column; the frames are updated in place.
for feature in (GENES + CELLS):
    quantile_tf = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")

    # scikit-learn expects 2-D input, so each column becomes an (n_rows, 1) array.
    train_column = train_features[feature].values.reshape(-1, 1)
    test_column = test_features[feature].values.reshape(-1, 1)

    quantile_tf.fit(train_column)

    # Flatten the single-column results back to 1-D before storing them.
    train_features[feature] = quantile_tf.transform(train_column)[:, 0]
    test_features[feature] = quantile_tf.transform(test_column)[:, 0]

In [30]:
SEED_VALUE = 42


def seed_everything(seed=42):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    # NOTE: hash randomisation is fixed at interpreter start-up, so this
    # variable only affects Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(seed=SEED_VALUE)

In [31]:
# GENES
n_comp = 600

data = pd.concat([pd.DataFrame(train_features[GENES]), pd.DataFrame(test_features[GENES])])
data2 = (PCA(n_components=n_comp, random_state=SEED_VALUE).fit_transform(data[GENES]))
train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_G-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_G-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

print('train_features: {}'.format(train_features.shape))
print('test_features: {}'.format(test_features.shape))

train_features: (23814, 1476)
test_features: (3982, 1476)


In [32]:
# CELLS
n_comp = 50

data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])
data2 = (PCA(n_components=n_comp, random_state=SEED_VALUE).fit_transform(data[CELLS]))
train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

print('train_features: {}'.format(train_features.shape))
print('test_features: {}'.format(test_features.shape))

train_features: (23814, 1526)
test_features: (3982, 1526)


In [33]:
# Drop low-variance features. The selector is fitted on the training and test
# rows stacked together, and only columns whose variance exceeds 0.8 are kept.
# (`VarianceThreshold` is already imported in the notebook's import cell.)
meta_cols = ['sig_id', 'cp_type', 'cp_time', 'cp_dose']

var_thresh = VarianceThreshold(0.8)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows the same way.
data = pd.concat([train_features, test_features])
# assumes the first len(meta_cols) columns of the frame are exactly `meta_cols` — TODO confirm
data_transformed = var_thresh.fit_transform(data.iloc[:, len(meta_cols):])

# The leading rows belong to train, the trailing rows to test.
train_features_transformed = data_transformed[:train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0]:]

# Rebuild each frame as [meta columns | selected features]. Going through
# `.values` resets the index and keeps the meta columns as object dtype, as before.
# The selected features get integer column labels (0, 1, ...), not their old names.
train_features = pd.DataFrame(train_features[meta_cols].values.reshape(-1, len(meta_cols)),
                              columns=meta_cols)
train_features = pd.concat([train_features, pd.DataFrame(train_features_transformed)], axis=1)

test_features = pd.DataFrame(test_features[meta_cols].values.reshape(-1, len(meta_cols)),
                             columns=meta_cols)
test_features = pd.concat([test_features, pd.DataFrame(test_features_transformed)], axis=1)

print('train_features: {}'.format(train_features.shape))
print('test_features: {}'.format(test_features.shape))

train_features: (23814, 1040)
test_features: (3982, 1040)


In [34]:
# Attach the scored targets, the non-scored targets and the drug table to the
# training features, joining on 'sig_id' each time.
train = (
    train_features
    .merge(train_targets_scored, on='sig_id')
    .merge(train_targets_nonscored, on='sig_id')
    .merge(train_drug, on='sig_id')
)

# Keep only rows whose cp_type is not 'ctl_vehicle', then renumber the rows from zero.
train_is_treated = train['cp_type'] != 'ctl_vehicle'
train = train.loc[train_is_treated].reset_index(drop=True)

test_is_treated = test_features['cp_type'] != 'ctl_vehicle'
test = test_features.loc[test_is_treated].reset_index(drop=True)

In [35]:
# Remove the 'cp_type' column from both frames.
# NOTE: re-running this cell raises KeyError because the column is already gone.
train = train.drop(columns='cp_type')
test = test.drop(columns='cp_type')

In [36]:
# Every column except 'sig_id' in the two target tables is a target.
target_cols = [name for name in train_targets_scored.columns if name != 'sig_id']
aux_target_cols = [name for name in train_targets_nonscored.columns if name != 'sig_id']
all_target_cols = target_cols + aux_target_cols

# Target counts: scored, non-scored (auxiliary) and combined.
num_targets, num_aux_targets, num_all_targets = (
    len(target_cols),
    len(aux_target_cols),
    len(all_target_cols),
)

print(f'num_targets: {num_targets}')
print(f'num_aux_targets: {num_aux_targets}')
print(f'num_all_targets: {num_all_targets}')

num_targets: 206
num_aux_targets: 402
num_all_targets: 608


In [37]:
# Shapes of the modelling frames and of the submission template.
for frame in (train, test, sample_submission):
    print(frame.shape)

(21948, 1648)
(3624, 1039)
(3982, 207)


# Stacking

In [38]:
# Seeds whose saved predictions are stacked (alternative: [0, 1, 2, 3, 4, 5, 6]).
SEED = [0, 1, 2]

# Load the per-seed prediction files and place them side by side as meta-features.
# The blocks are collected first and concatenated once, so the result does not
# depend on the list starting with seed 0.
stacking_dir = os.path.join("result", "4stacking")
oof_blocks, test_blocks = [], []
for seed_id in SEED:
    # presumably "oof" files hold out-of-fold predictions for the training rows — TODO confirm
    oof_blocks.append(np.loadtxt(os.path.join(stacking_dir, f"oof_seed{seed_id}.csv"), delimiter=","))
    test_blocks.append(np.loadtxt(os.path.join(stacking_dir, f"testpred_seed{seed_id}.csv"), delimiter=","))

x_train_stacking = np.concatenate(oof_blocks, axis=1)
x_test_stacking = np.concatenate(test_blocks, axis=1)

# Targets for the meta model: the scored target columns of `train`.
y_train_stacking = train[target_cols].values

# Fail early with a clear message instead of deep inside model fitting.
assert x_train_stacking.shape[0] == y_train_stacking.shape[0], (
    'stacked train predictions have {} rows but there are {} target rows'.format(
        x_train_stacking.shape[0], y_train_stacking.shape[0])
)

# Learn Meta model

In [39]:
def run_k_fold_multilabel_xgboost(data_x, data_y, NFOLDS, mskf):
    """Build an Optuna objective that cross-validates a multi-label XGBoost model.

    Args:
        data_x: Feature matrix, indexable by arrays of row indices.
        data_y: Binary target matrix with one column per target.
        NFOLDS: Number of folds produced by `mskf`; used to average the fold scores.
        mskf: Splitter exposing `split(data_x, data_y)` that yields
            (train_indices, valid_indices) pairs.

    Returns:
        A function `objective(trial)` returning the log loss averaged over all
        targets and all folds (lower is better).
    """
    n_targets = data_y.shape[1]

    def objective(trial):
        params = {
            # MultiOutputClassifier fits one independent binary classifier per
            # target column, so each booster needs a binary objective; a
            # multi-class objective with `num_class` does not match that setup.
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'n_estimators': 1000,
            'learning_rate': 0.1,
            'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 0.95, 0.05),
            'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 0.95, 0.05),
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.1, 10.0),
            'tree_method': 'gpu_hist',
        }

        cv_score = 0.
        for t_idx, v_idx in mskf.split(data_x, data_y):
            x_train, y_train = data_x[t_idx], data_y[t_idx]
            x_valid, y_valid = data_x[v_idx], data_y[v_idx]

            clf = MultiOutputClassifier(xgb.XGBClassifier(**params))
            clf.fit(x_train, y_train)
            # One probability array per target; column 1 is read as the
            # positive-class probability.
            y_preds = clf.predict_proba(x_valid)

            # Mean log loss over the target columns for this fold.
            fold_score = np.mean([
                log_loss(y_valid[:, i], y_preds[i][:, 1], labels=[0, 1])
                for i in range(n_targets)
            ])
            cv_score += fold_score
        return cv_score / NFOLDS
    return objective

In [40]:
def run_multilabel_xgboost_bestparams(x_train, y_train, x_test, best_params, submission):
    """Fit the final multi-label XGBoost model and write its test predictions into `submission`.

    One `xgb.XGBClassifier` per target column is trained through
    `MultiOutputClassifier`. The positive-class probability of every target is
    stored in `submission`, and the fitted model is pickled to
    `result/best_model/BestModel4test_Stacking_MultiXGBoost.pkl`.

    Args:
        x_train: Training feature matrix.
        y_train: Training target matrix with one column per target.
        x_test: Test feature matrix; expected to have one row per row of `submission`.
        best_params: Keyword arguments forwarded to `xgb.XGBClassifier`.
        submission: DataFrame receiving the predictions (modified in place).
            Its trailing `y_train.shape[1]` columns are treated as the target
            columns, so leading columns such as an id column are left untouched.

    Returns:
        The same `submission` object, with the prediction columns filled in.

    Raises:
        ValueError: If `submission` has fewer columns than there are targets.
    """
    n_targets = y_train.shape[1]
    # Offset of the first target column: 0 for a targets-only frame, 1 when a
    # single id column precedes the targets.
    col_offset = submission.shape[1] - n_targets
    if col_offset < 0:
        raise ValueError(
            'submission has {} columns but {} targets were predicted'.format(
                submission.shape[1], n_targets)
        )

    # train XGBoost by using best_params
    clf = MultiOutputClassifier(xgb.XGBClassifier(**best_params))
    clf.fit(x_train, y_train)

    # predict for test: one probability array per target, column 1 read as the
    # positive-class probability
    y_preds = clf.predict_proba(x_test)
    for i in range(n_targets):
        submission.iloc[:, col_offset + i] = y_preds[i][:, 1]

    # save model for test
    model_dir = os.path.join('result', 'best_model')
    os.makedirs(model_dir, exist_ok=True)
    modelfile = 'BestModel4test_Stacking_MultiXGBoost.pkl'
    with open(os.path.join(model_dir, modelfile), 'wb') as f:
        pickle.dump(clf, f)

    return submission

In [41]:
# Number of cross-validation folds used during the hyper-parameter search.
NFOLDS = 2
# scikit-learn based splitters reject a `random_state` when `shuffle` is False
# (ValueError on scikit-learn >= 0.24), so shuffling is enabled to make the
# seed valid and the fold assignment reproducible.
mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=0)

In [42]:
# optimize by optuna
NUM_OPTUNA_TRIAL = 30

# Create the output folders *before* the long search starts, so that a missing
# directory cannot throw away the finished study at save time.
report_dir = 'report'
best_param_dir = os.path.join('result', 'best_param_cv')
for out_dir in (report_dir, best_param_dir):
    os.makedirs(out_dir, exist_ok=True)

# The objective is a mean log loss, so it is minimised. Seeding the sampler
# makes the sequence of suggested hyper-parameters repeatable.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED_VALUE),
)
study.optimize(
    run_k_fold_multilabel_xgboost(x_train_stacking, y_train_stacking, NFOLDS, mskf),
    n_trials=NUM_OPTUNA_TRIAL,
)

# save optuna log
df_trial = study.trials_dataframe()
df_trial.to_pickle(os.path.join(report_dir, 'OptunaLog_Stacking_MultiXGBoost.pkl'))

# save best params found by CV
bestparam_filename = 'BestParamsSelectedByCV_Stacking_MultiXGBoost.pkl'
with open(os.path.join(best_param_dir, bestparam_filename), 'wb') as f:
    pickle.dump(study.best_params, f)

KeyboardInterrupt: 

In [None]:
# Start from a copy of the submission template so `sample_submission` stays untouched.
sub = sample_submission.copy()

# The predictions are written row by row into `sub`, so the row counts must agree.
if x_test_stacking.shape[0] != len(sub):
    raise ValueError(
        'x_test_stacking has {} rows but the submission template has {}; '
        'align the test predictions with sample_submission first'.format(
            x_test_stacking.shape[0], len(sub))
    )

# NOTE(review): `study.best_params` only holds the values suggested by Optuna;
# settings fixed inside the search objective (e.g. n_estimators, learning_rate)
# are not part of it — confirm whether they should be passed here as well.
sub = run_multilabel_xgboost_bestparams(
    x_train_stacking, y_train_stacking, x_test_stacking, study.best_params, sub
)

In [None]:
# Write the submission to 'submission.csv' without the DataFrame index.
sub.to_csv(path_or_buf='submission.csv', index=False)