In [1]:
import os
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from models_utils_stacking import *

In [2]:
def load_oof(mode = 'train'):
    src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/scripts/models/OOF_preds/'
    data_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/data/training_data/'

    oof_preds = pd.DataFrame()
    files = sorted([x for x in os.listdir(src + '{}/'.format(mode)) if '.npy' in x
                   and 'stack' not in x])
    print('\n', 'Loading OOF preds:', files, '\n', 'Numer of files to load:', len(files), '\n')
    for i in files:
        if 'npy'in i:
            df_preds = pd.DataFrame(np.load('{}/{}/{}'.format(src, mode, i)))
            df_preds.columns = ['Type_1', 'Type_2', 'Type_3']
            if mode == 'train':
                if 'frcnn' in i:
                    train_src = data_src + 'train_crops_frcnn_299_oversampled/'
                if 'vgg' in i:
                    train_src = data_src + 'train_crops_vgg_299_oversampled/'
                if 'yolo' in i:
                    train_src = data_src + 'train_crops_yolo_299_oversampled/'
                train_ids, y_train = load_ids(train_src)
                df_preds['image'] = train_ids
                df_preds['class'] = y_train
                df_preds_grouped = groupby_crops(df_preds, False)
            if mode == 'test':
                if 'frcnn' in i:
                    test_src = data_src + 'test_crops_frcnn_299/test_crops_frcnn_299/'
                if 'vgg' in i:
                    test_src = data_src + 'test_crops_vgg_299/test_crops_vgg_299/'
                if 'yolo' in i:
                    test_src = data_src + 'test_crops_yolo_299/test_crops_yolo_299/'
                test_ids = load_ids_test(test_src)
                df_preds['image'] = test_ids
                df_preds_grouped = groupby_crops(df_preds, True)
        if 'image_name' not in oof_preds.columns:
            oof_preds = pd.concat([oof_preds, df_preds_grouped], axis = 1)
        else:
            oof_preds.fillna(-1, inplace = True)
            if 'class' in oof_preds.columns:
                df_preds_grouped.drop(['class'], axis = 1, inplace = True)
            oof_preds = oof_preds.merge(df_preds_grouped, how = 'left', on = 'image_name')
    return oof_preds

def groupby_crops(df, test = True):
    if test:
        df['image_name'] = df['image'].apply(lambda x: x.split('_')[0] + '.jpg')
    else:
        df['image_name'] = df['image'].apply(lambda x: x[:7] + x.split('_')[1][2:] + '.jpg')
    df_mean = df.groupby(['image_name']).mean()
    df_mean.reset_index(inplace = True)
    return df_mean

In [3]:
X_train = load_oof('train')
X_test = load_oof('test')

test_ids = X_test['image_name']
y_train = X_train['class']

X_train.drop(['class', 'image_name'], axis = 1, inplace = True)
X_test.drop(['image_name'], axis = 1, inplace = True)




 Loading OOF preds: ['inception_globalavgpool_5foldSKF_train_crops_frcnn_299_oversampled_avg1.npy', 'inception_globalavgpool_5foldSKF_train_crops_vgg_299_oversampled_avg1.npy', 'resnet_dense_5foldSKF_train_crops_frcnn_299_oversampled_avg1.npy', 'resnet_dense_5foldSKF_train_crops_vgg_299_oversampled_avg1.npy', 'xception_globalavgpool_5foldSKF_train_crops_frcnn_299_oversampled_avg1.npy', 'xception_globalavgpool_5foldSKF_train_crops_vgg_299_oversampled_avg1.npy'] 
 Numer of files to load: 6 

Read train images
Load folder Type_1 (Index: 0)
Load folder Type_2 (Index: 1)
Load folder Type_3 (Index: 2)
Read train data time: 0.02 seconds
Read train images
Load folder Type_1 (Index: 0)
Load folder Type_2 (Index: 1)
Load folder Type_3 (Index: 2)
Read train data time: 0.02 seconds
Read train images
Load folder Type_1 (Index: 0)
Load folder Type_2 (Index: 1)
Load folder Type_3 (Index: 2)
Read train data time: 0.02 seconds
Read train images
Load folder Type_1 (Index: 0)
Load folder Type_2 (Index: 

In [10]:
from sklearn.linear_model import LogisticRegression

def logreg_foldrun(X, y, X_test, name, save = True):
    skf = StratifiedKFold(n_splits = 5, random_state = 111, shuffle = True)
    if isinstance(X, pd.core.frame.DataFrame):
        X = X.values
    if isinstance(y, pd.core.frame.DataFrame):
        y = y.is_duplicate.values
    if isinstance(y, pd.core.frame.Series):
        y = y.values
        
    i = 0
    losses = []
    oof_train = np.zeros((1481, 3))
    oof_test = np.zeros((512, 3, 5))
    for tr_index, val_index in skf.split(X, y):
        X_tr, X_val = X[tr_index], X[val_index]
        y_tr, y_val = y[tr_index], y[val_index]
        t = time.time()
        print('Start training on fold: {}'.format(i))
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        print('Start predicting...')
        val_pred = lr.predict_proba(X_val)
        oof_train[val_index, :] = val_pred
        score = log_loss(y_val, val_pred)
        losses.append(score)
        if X_test is not None:
            test_preds = lr.predict_proba(X_test)
            oof_test[:, :, i] = test_preds
        print('Final score for fold {} :'.format(i), score, '\n',
              'Time it took to train and predict on fold:', time.time() - t, '\n')
        i += 1
    print('Mean logloss for model in 5-folds SKF:', np.array(losses).mean(axis = 0))
    oof_train = pd.DataFrame(oof_train)
    oof_train.columns = ['Type_1', 'Type_2', 'Type_3']
    oof_test = oof_test.mean(axis = 2)
    oof_test = pd.DataFrame(oof_test)
    oof_test.columns = ['Type_1', 'Type_2', 'Type_3']
    if save:
        oof_train.to_pickle('OOF_preds/stacking_train/train_preds_{}.pkl'.format(name))
        oof_test.to_pickle('OOF_preds/stacking_test/test_preds_{}.pkl'.format(name))
    return oof_train, oof_test

savename = 'LogReg_1strun'
X_train.fillna(-1, inplace = True)
oof_train, oof_test = logreg_foldrun(X_train, y_train, X_test, savename)
oof_test['image_name'] = test_ids
oof_test.to_csv('../../submissions/{}_035loss.csv'.format(savename), index = False)

Start training on fold: 0
Start predicting...
Final score for fold 0 : 0.380185010974 
 Time it took to train and predict on fold: 0.024237871170043945 

Start training on fold: 1
Start predicting...
Final score for fold 1 : 0.332372707359 
 Time it took to train and predict on fold: 0.02046513557434082 

Start training on fold: 2
Start predicting...
Final score for fold 2 : 0.344626798072 
 Time it took to train and predict on fold: 0.01847362518310547 

Start training on fold: 3
Start predicting...
Final score for fold 3 : 0.3667510653 
 Time it took to train and predict on fold: 0.017380237579345703 

Start training on fold: 4
Start predicting...
Final score for fold 4 : 0.321350162616 
 Time it took to train and predict on fold: 0.01663041114807129 

Mean logloss for model in 5-folds SKF: 0.349057148864


In [9]:
lgb_params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'multiclass',
    'metric' : {'multi_logloss'},
    'num_class': 3,
    'learning_rate' : 0.03,
    'feature_fraction' : 0.62,
    'bagging_fraction': 0.9,
    'bagging_freq': 100,
    'num_leaves' : 255,
    'max_depth': 4,
    'min_data_in_leaf': 11,
    'subsample': 0.76,
    'colsample_bytree': 0.69,
    'silent': 1,
    'random_state': 1337,
    'verbose': 1,
    'nthread': 4,
}

xgb_params = {
    'seed': 1337,
    'colsample_bytree': 0.42,
    'silent': 1,
    'subsample': 0.85,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'min_child_weight': 20,
    'nthread': 4,
    }

savename = 'stacking_Xception&Inception'

oof_train, oof_test = lgb_foldrun_test(X_train, y_train, X_test, lgb_params, savename, False)

Running LGBM model with parameters: {'feature_fraction': 0.62, 'max_depth': 4, 'subsample': 0.76, 'num_leaves': 255, 'colsample_bytree': 0.69, 'bagging_fraction': 0.9, 'objective': 'multiclass', 'metric': {'multi_logloss'}, 'random_state': 1337, 'num_class': 3, 'verbose': 1, 'silent': 1, 'bagging_freq': 100, 'boosting_type': 'gbdt', 'nthread': 4, 'min_data_in_leaf': 11, 'task': 'train', 'learning_rate': 0.03}
Start training on fold: 0
Train until valid scores didn't improve in 200 rounds.
[100]	valid_0's multi_logloss: 0.345384
[200]	valid_0's multi_logloss: 0.309608
[300]	valid_0's multi_logloss: 0.310759
[400]	valid_0's multi_logloss: 0.308828
Early stopping, best iteration is:
[248]	valid_0's multi_logloss: 0.306884
Start predicting...
Final score for fold 0 : 0.30631991686 
 Time it took to train and predict on fold: 0.3506300449371338 

Start training on fold: 1
Train until valid scores didn't improve in 200 rounds.
[100]	valid_0's multi_logloss: 0.323254
[200]	valid_0's multi_log

In [11]:
oof_test['image_name'] = test_ids
oof_test.to_csv('../../submissions/{}.csv'.format(savename), index = False)