In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import log_loss
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import KFold
# Register tqdm's pandas integration so `progress_apply` (used in make_feats) is
# available; the progress bars are labelled "my bar!".
tqdm.pandas(desc="my bar!")
# Render up to 100 columns / 100 rows of a DataFrame before truncating.
# The fully qualified option keys are used on purpose: the short patterns
# 'max_columns' / 'max_rows' are resolved by a regex search over all option
# names and raise OptionError on pandas versions that also define
# `styler.render.max_columns` / `styler.render.max_rows`.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

def disp_full(x, drows=False, dcols=True):
    """Display ``x`` with pandas' row/column truncation lifted for this call only.

    Parameters
    ----------
    x : object exposing a 2-D ``shape`` (e.g. a DataFrame)
        The object handed to ``display``.
    drows : bool, default False
        When true, ``display.max_rows`` is raised to ``x.shape[0]``.
    dcols : bool, default True
        When true, ``display.max_columns`` is raised to ``x.shape[1]``.

    Notes
    -----
    The option values that were active before the call are restored afterwards,
    including when ``display`` raises. (Resetting to pandas' defaults instead
    would throw away limits configured earlier in the notebook.)
    Relies on the ``display`` name being available in the notebook namespace.
    """
    overrides = []
    if drows:
        overrides += ['display.max_rows', x.shape[0]]
    if dcols:
        overrides += ['display.max_columns', x.shape[1]]

    if not overrides:
        # Nothing to override; option_context needs at least one (key, value) pair.
        display(x)
        return

    # option_context puts the previous values back on exit, even on error.
    with pd.option_context(*overrides):
        display(x)


  from pandas import Panel


In [2]:
def make_feats(df):
    """Add a per-study normalised slice position and sort rows by it.

    Takes element 2 of every ``ImagePositionPatient`` entry, computes its
    min/max within each ``StudyInstanceUID`` and rescales it to
    ``(value - min) / (max - min)`` as the ``position`` column.

    Note: the ``ImagePositionPatient_2`` column is also written onto the
    frame that was passed in.

    Returns the merged frame sorted by ``StudyInstanceUID`` then ``position``.
    """
    # Third component of the position entry (progress bar via tqdm's pandas hook).
    df['ImagePositionPatient_2'] = df['ImagePositionPatient'].progress_apply(lambda pos: pos[2])

    # Per-study extremes of that component.
    z_by_study = df.groupby(['StudyInstanceUID'])['ImagePositionPatient_2']
    z_range = z_by_study.agg(position_min='min', position_max='max').reset_index()
    df = df.merge(z_range, on='StudyInstanceUID')

    # Rescale within the study; a study whose min equals its max yields 0/0 here.
    offset = df['ImagePositionPatient_2'] - df['position_min']
    span = df['position_max'] - df['position_min']
    df['position'] = offset / span

    return df.sort_values(by=['StudyInstanceUID', 'position'])


def pred_agg1(df):
    """Attach per-study summary statistics of every ``<target>_pred`` column.

    For each name in the module-level ``target_cols`` the min, max, mean and
    std of ``<target>_pred`` within a ``StudyInstanceUID`` are merged back onto
    the rows, followed by three row-level features relative to the study:
    ``_diff`` (pred - mean), ``_div`` (pred / mean) and ``_scaled``
    ((pred - mean) / std).

    The study key column is kept only from the ``'any'`` target's statistics,
    so ``'any'`` must be present in ``target_cols`` for the merge to work.
    """
    stat_names = ['min', 'max', 'mean', 'std']

    per_target = []
    for c in target_cols:
        stats = df.groupby(['StudyInstanceUID'])[c + '_pred'].agg(stat_names).reset_index()
        stats.columns = ['StudyInstanceUID'] + [c + '_' + s for s in stat_names]
        if c != 'any':
            # Every groupby yields the studies in the same order, so a single
            # key column is enough for the side-by-side concat below.
            stats = stats.drop(columns='StudyInstanceUID')
        per_target.append(stats)

    study_stats = pd.concat(per_target, axis=1)
    df = pd.merge(df, study_stats, on='StudyInstanceUID', how='left')

    for c in target_cols:
        pred = df[c + '_pred']
        mean = df[c + '_mean']
        centered = pred - mean
        df[c + '_diff'] = centered
        df[c + '_div'] = pred / mean
        df[c + '_scaled'] = centered / df[c + '_std']
    return df


def pred_agg2(df):
    """Append centred rolling-mean features of every ``*_pred`` column.

    Within each ``StudyInstanceUID`` (rows taken in their current order) a
    centred rolling mean with window 3 and window 5 (``min_periods=1``) is
    computed for every column whose name ends in ``_pred``. For a column
    ``<name>_pred`` the following columns are appended, in this order of
    suffix groups: ``<name>_3roll``, ``<name>_5roll``, ``<name>_3rolldiff``,
    ``<name>_5rolldiff`` (rolling mean minus the row's own prediction),
    ``<name>_3rolldiv``, ``<name>_5rolldiv`` (rolling mean divided by it).

    Parameters
    ----------
    df : DataFrame with a ``StudyInstanceUID`` column and ``*_pred`` columns.

    Returns
    -------
    DataFrame: ``df`` with the new columns concatenated; the index is unchanged.

    Notes
    -----
    * The rolling results are mapped back to their source rows explicitly.
      ``groupby(...).rolling(...)`` returns rows grouped by study, so pasting
      its raw values onto ``df`` is only correct when ``df`` is already sorted
      by study.
    * New column names are derived from the selected ``*_pred`` columns rather
      than from a separate list, so names always match the data they hold.
    * A window-1 rolling mean equals the values themselves, so the raw
      predictions are used directly for the diff/div features.
    """
    pred_cols = [col for col in df.columns if col.endswith('_pred')]
    base_names = [col[:-len('_pred')] for col in pred_cols]

    # Work on a positionally indexed copy so that results can be realigned by
    # row position whatever df's own index looks like.
    work = df[['StudyInstanceUID'] + pred_cols].reset_index(drop=True)
    grouped = work.groupby('StudyInstanceUID')[pred_cols]

    def _centered_mean(window):
        rolled = grouped.rolling(window, min_periods=1, center=True).mean()
        # Drop the study level: what remains is the row position in `work`.
        rolled = rolled.reset_index(level=0, drop=True)
        return rolled[pred_cols].reindex(work.index).values

    raw = work[pred_cols].values.astype(float)
    roll3 = _centered_mean(3)
    roll5 = _centered_mean(5)

    feature_blocks = [
        ('_3roll', roll3),
        ('_5roll', roll5),
        ('_3rolldiff', roll3 - raw),
        ('_5rolldiff', roll5 - raw),
        ('_3rolldiv', roll3 / raw),
        ('_5rolldiv', roll5 / raw),
    ]
    new_feats = [
        pd.DataFrame(values, columns=[name + suffix for name in base_names], index=df.index)
        for suffix, values in feature_blocks
    ]
    return pd.concat([df] + new_feats, axis=1)


In [3]:
# train data
# Each fold pickle is indexed by TTA pass and exposes 'ids', 'outputs' and
# 'targets'. The passes are averaged into one prediction per image; ids and
# targets are taken from pass 0.
# NOTE(review): unlike the test frames built in the next cell, `train` is not
# passed through make_feats / pred_agg1 / pred_agg2 here - confirm that is intended.
target_cols = ['any', 'epidural', 'subdural', 'subarachnoid', 'intraventricular', 'intraparenchymal']
n_tta = 5
df_all = []
for n_fold in range(5):
    fold_dump = pd.read_pickle("data_for_stacking/appian/fold{}_ep2_valid_tta5.pkl".format(n_fold))

    # Accumulate the TTA average in a float64 buffer.
    avg_outputs = np.zeros([len(fold_dump[0]['ids']), 6])
    for tta_idx in range(n_tta):
        avg_outputs += fold_dump[tta_idx]['outputs'] / n_tta

    fold_preds = pd.DataFrame(avg_outputs, columns=[tar_col + '_pred' for tar_col in target_cols])
    fold_preds['ID'] = fold_dump[0]['ids']
    fold_preds['folds'] = n_fold

    fold_targets = pd.DataFrame(fold_dump[0]['targets'], columns=target_cols)
    df_all.append(pd.concat([fold_preds, fold_targets], axis=1))

df_all = pd.concat(df_all)
tr_meta = pd.read_pickle('data_for_stacking/cache/train_raw.pkl')
train = pd.merge(df_all, tr_meta, on='ID', how='inner')


In [4]:
# test data
# For every fold model: average its TTA passes over the test set, then attach
# the metadata and the engineered features. One frame per fold ends up in
# `test_list`.
df_all_ts = []
target_cols = ['any', 'epidural', 'subdural', 'subarachnoid', 'intraventricular', 'intraparenchymal']
n_tta = 5
for n_fold in range(5):
    df = pd.read_pickle("data_for_stacking/appian/fold{}_ep2_test_tta5.pkl".format(n_fold))
    # One output column per target.
    tmp = np.zeros([len(df[0]['ids']), len(target_cols)])
    for i in range(n_tta):
        tmp += df[i]['outputs'] / n_tta
    tmp = pd.DataFrame(tmp)
    tmp.columns = [tar_col + '_pred' for tar_col in target_cols]
    # `df` is indexed by TTA pass, not by fold: take the ids from pass 0, the
    # same pass that sized the buffer above (and the one the train cell uses).
    # Indexing with n_fold only worked because there happen to be as many
    # passes as folds.
    tmp['ID'] = df[0]['ids']
    tmp['folds'] = n_fold
    df_all_ts.append(tmp)

ts_meta = pd.read_pickle('data_for_stacking/cache/test_raw.pkl')
test_list = []
for n_fold in range(5):
    test = pd.merge(df_all_ts[n_fold], ts_meta, on='ID', how='inner')
    test = make_feats(test)   # per-study slice position, rows sorted by it
    test = pred_agg1(test)    # per-study statistics of the predictions
    test = pred_agg2(test)    # rolling means over neighbouring rows of a study
    test_list.append(test)


my bar!: 100%|██████████| 78363/78363 [00:00<00:00, 413297.11it/s]
my bar!: 100%|██████████| 78363/78363 [00:00<00:00, 423155.28it/s]
my bar!: 100%|██████████| 78363/78363 [00:00<00:00, 425947.10it/s]
my bar!: 100%|██████████| 78363/78363 [00:00<00:00, 418714.94it/s]
my bar!: 100%|██████████| 78363/78363 [00:00<00:00, 425111.35it/s]


In [5]:
# Columns used for training: everything except identifiers, bookkeeping
# columns and the targets themselves.
X_cols = train.columns.drop(['StudyInstanceUID', 'ID', 'folds', 'labels', 'n_label', 'SOPInstanceUID', 'PatientID',
       'SamplesPerPixel', 'SeriesInstanceUID', 'StudyID'] + target_cols)
# Label-encode object columns so they can be fed to LightGBM.
# The code table is built from train + the first test frame; every test frame
# is then encoded by *value*. Pasting the first frame's codes positionally into
# the other frames is only right if all frames have identical row order.
for c in X_cols:
    if train[c].dtype == 'O':
        print(c)
        tmp = pd.concat([train[[c]], test_list[0][[c]]]).reset_index(drop=True)
        codes = pd.factorize(tmp[c])[0]
        print(codes.max())
        # value -> code; missing values carry factorize's sentinel -1
        code_of = dict(zip(tmp[c], codes))
        train[c] = codes[:len(train)]
        for n_fold in range(5):
            # values absent from the table (including NaN) fall back to -1
            test_list[n_fold][c] = [code_of.get(value, -1) for value in test_list[n_fold][c]]


ImageOrientationPatient
187
ImagePositionPatient
696908
Modality
0
PhotometricInterpretation
0
PixelSpacing
221
WindowCenter
41
WindowWidth
51


In [6]:
# Training
# For every fold, one LightGBM binary model per target is fitted on the other
# folds and early-stopped on the held-out fold. Held-out predictions go to
# `stack_preds` (one frame per fold), test predictions to `pred_test` (one
# array per fold/target pair, in loop order) and held-out log losses to `res`.
from sklearn.linear_model import Ridge, LogisticRegression

# NOTE(review): LightGBM documents that bagging ("subsample") only takes
# effect together with a non-zero bagging frequency, which is not set here -
# confirm whether row subsampling was intended.
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": 0.1,
          "num_leaves": 5,
          "max_bin": 256,
          "feature_fraction": 0.8,
          "verbosity": 0,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9
          }
stack_preds, res, pred_test = [], [], []
for i in range(5):
    tr = train.query('folds != @i')
    va = train.query('folds == @i')

    fold_preds = {}
    for tar_col in target_cols:
        pred_col = tar_col + '_pred'
        tr_D = lgb.Dataset(tr[X_cols], tr[tar_col])
        va_D = lgb.Dataset(va[X_cols], va[tar_col])
        clf = lgb.train(params, tr_D, 10000, valid_sets=va_D, verbose_eval=100,
                        early_stopping_rounds=10)

        fold_preds[pred_col] = clf.predict(va[X_cols])
        pred_test.append(clf.predict(test_list[i][X_cols]))

        # First line: log loss of the input prediction; second: of the stacked model.
        print(log_loss(va[tar_col], va[pred_col]))
        stacked_loss = log_loss(va[tar_col], fold_preds[pred_col])
        print(stacked_loss)
        print('-'*80)
        res.append(stacked_loss)
    print('='*80)

    # Positionally indexed frame, columns in target order.
    preds = pd.DataFrame(fold_preds)
    stack_preds.append(preds)


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[88]	valid_0's binary_logloss: 0.116963
0.11646785944210827
0.11696295249092052
--------------------------------------------------------------------------------
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.0165623
0.016153468849789597
0.016562305350467578
--------------------------------------------------------------------------------
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.0956264
0.09523852711521777
0.09562638475691988
--------------------------------------------------------------------------------
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.0782203
0.07882641153220848
0.07822029875417118
------------------------------------------------

In [11]:
# Inspect the stacked held-out predictions of fold 0 (first frame appended to stack_preds).
stack_preds[0]

Unnamed: 0,any_pred,epidural_pred,subdural_pred,subarachnoid_pred,intraventricular_pred,intraparenchymal_pred
0,0.000681,0.000385,0.000549,0.000361,0.000134,0.000350
1,0.990242,0.003955,0.128484,0.110060,0.018366,0.971263
2,0.003136,0.000385,0.002269,0.000504,0.000151,0.000387
3,0.002866,0.000385,0.001692,0.000476,0.000163,0.000494
4,0.012694,0.000385,0.005301,0.004227,0.000215,0.001712
...,...,...,...,...,...,...
133855,0.000746,0.000385,0.000611,0.000293,0.000134,0.000380
133856,0.000725,0.000385,0.000554,0.000333,0.000157,0.000427
133857,0.004352,0.000385,0.001877,0.001450,0.000174,0.000971
133858,0.004718,0.000523,0.004542,0.001625,0.000133,0.000388


In [25]:
# Load the fold-0 validation TTA dumps of two models for a side-by-side comparison.
# NOTE(review): both names are rebound to DataFrames in the following cell, so
# this cell has to be re-run before that one can be executed again.
new_df = pd.read_pickle("data_for_stacking/sasaki_senet154_customlabels/fold0_ep2_valid_tta5.pkl")
old_df = pd.read_pickle("data_for_stacking/sasaki_se_resnext_410//fold0_ep2_valid_tta5.pkl")

In [26]:
# Turn the first TTA pass of each dump into a frame whose columns are the
# target names prefixed with the model tag.
target_cols = ['any', 'epidural', 'subdural', 'subarachnoid', 'intraventricular', 'intraparenchymal']
new_df = pd.DataFrame(new_df[0]["outputs"], columns=target_cols).add_prefix("new_")
old_df = pd.DataFrame(old_df[0]["outputs"], columns=target_cols).add_prefix("old_")

In [30]:
# Put both models' outputs side by side (old_* columns first, then new_*).
tmp_df = pd.concat([old_df, new_df], axis=1)

In [32]:
# Pairwise correlation between every old_* / new_* output column.
tmp_df.corr()

Unnamed: 0,old_any,old_epidural,old_subdural,old_subarachnoid,old_intraventricular,old_intraparenchymal,new_any,new_epidural,new_subdural,new_subarachnoid,new_intraventricular,new_intraparenchymal
old_any,1.0,0.227051,0.688876,0.693457,0.522691,0.63209,0.94492,0.181529,0.651793,0.688468,0.549907,0.660052
old_epidural,0.227051,1.0,0.216977,0.09416,0.010148,0.086278,0.212037,0.692956,0.233947,0.102997,0.01813,0.103233
old_subdural,0.688876,0.216977,1.0,0.366443,0.093449,0.233626,0.650575,0.168895,0.931208,0.372899,0.11043,0.258264
old_subarachnoid,0.693457,0.09416,0.366443,1.0,0.310449,0.421064,0.658295,0.090399,0.345653,0.927378,0.364495,0.453049
old_intraventricular,0.522691,0.010148,0.093449,0.310449,1.0,0.360995,0.494551,0.0139,0.093595,0.321794,0.923466,0.39117
old_intraparenchymal,0.63209,0.086278,0.233626,0.421064,0.360995,1.0,0.599629,0.08218,0.221528,0.411233,0.387634,0.936717
new_any,0.94492,0.212037,0.650575,0.658295,0.494551,0.599629,1.0,0.211156,0.688771,0.723781,0.583304,0.690047
new_epidural,0.181529,0.692956,0.168895,0.090399,0.0139,0.08218,0.211156,1.0,0.202803,0.118782,0.032992,0.114246
new_subdural,0.651793,0.233947,0.931208,0.345653,0.093595,0.221528,0.688771,0.202803,1.0,0.384175,0.126452,0.273103
new_subarachnoid,0.688468,0.102997,0.372899,0.927378,0.321794,0.411233,0.723781,0.118782,0.384175,1.0,0.41782,0.485162
