In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import gc
import numpy as np
import pandas as pd
import glob
import time
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import load_model, Model
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold
import xgboost as xgb

from data_utils import *
from fit_utils import *
from inmem_utils import *

Using TensorFlow backend.


In [2]:
def extract_features_fold(modelname, layer_index, X = None, X_test = None):
    orig_model = load_model(checks_src + '{}.h5'.format(modelname))
    m = Model(input = orig_model.input, output = orig_model.layers[layer_index].output)
    if X_test is None:
        preds = m.predict(X, batch_size = 8)
        return preds
    if X_test is not None:
        preds_test = m.predict(X_test, batch_size = 8)
        return preds_test
    
def submission_inmem(bag_preds, test_ids, name):
    print('Begin to write submission file ..')
    f_submit = open(os.path.join(sub_src, '{}'.format(name) +'.csv'), 'w')
    f_submit.write('image,Type_1,Type_2,Type_3\n')
    for i, image_name in enumerate(test_ids):
        pred = ['%.6f' % p for p in bag_preds[i, :]]
        if i%100 == 0:
            print(i, '/', 600)
        f_submit.write('%s,%s\n' % (os.path.basename(image_name), ','.join(pred)))
    f_submit.close()
    print('Submission {} written.'.format(name))
    return 

def split_proper_skf(train_ids, num_folds):
    folds_train_imgs = []
    folds_val_imgs = []
    folds_train_inds = []
    folds_val_inds = [] 
    img_names = []
    for i in train_ids:
        img_names.append(i[:6] + '/' + i.split('_')[1][2:])
    img_names = list(set(img_names))
    train_ids = np.array(train_ids)
    img_names = np.array(img_names)
    skf = KFold(n_splits = num_folds, random_state = 111, shuffle = True)
    print('Running {}-Fold data split'.format(num_folds))
    fold_number = 1
    for train_index, test_index in skf.split(img_names):
        print('Split dataset for fold:', fold_number)
        train_split, val_split = img_names[train_index], img_names[test_index]
        to_train = []
        for orig in train_ids:
            for tr in train_split:
                if tr in orig:
                    to_train.append(orig)
        to_train = list(set(to_train))
        to_val = list(set(train_ids).difference(set(to_train)))
        folds_train_imgs.append(to_train)
        folds_val_imgs.append(to_val)
        print('Number of training set images: {}, validation set images: {}'.format(len(to_train), len(to_val)))
        inds_train = []
        inds_val = []
        for i, val in enumerate(train_ids):
            for j in to_train:
                if j in val:
                    inds_train.append(i)
        inds_val = list(set(range(len(train_ids))).difference(set(inds_train)))
        folds_train_inds.append(inds_train)
        folds_val_inds.append(inds_val)
        fold_number += 1
    return folds_train_imgs, folds_val_imgs, folds_train_inds, folds_val_inds
    
    
def fit_xgb(X, num_folds, layer_index, checkname = None, X_test = None):
    folds_train_imgs, folds_val_imgs, folds_train_inds, folds_val_inds = split_proper_skf(train_ids, num_folds)
    params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.03,
        'objective': 'reg:linear',
        'eval_metric': 'mlogloss',
        'objective': 'multi:softprob',
        'num_class': 3,
        'max_depth': 12,
        'min_child_weight': 100,
        'booster': 'gbtree',
        }
    test_predictions = []
    #for i in range(len(folds_train_inds)):
    for i in range(6):
        print('Extracting features and training for fold:', i + 1)
        X_tr = X[folds_train_inds[i]]
        X_val = X[folds_val_inds[i]]
        y_tr = y[folds_train_inds[i]]
        y_val = y[folds_val_inds[i]]
        X_tr = extract_features_fold('{}_fold{}'.format(checkname, i + 1), layer_index, X_tr)
        X_val = extract_features_fold('{}_fold{}'.format(checkname, i + 1), layer_index, X_val)
        d_train = xgb.DMatrix(X_tr, label=y_tr)
        d_valid = xgb.DMatrix(X_val, label=y_val)
        watchlist = [(d_train, 'train'), (d_valid, 'eval')]

        clf = xgb.train(params, d_train, 100000, watchlist, early_stopping_rounds = 50,
                       verbose_eval = 50)
        preds_val = clf.predict(xgb.DMatrix(X_val), ntree_limit=clf.best_ntree_limit)
        print('Logloss:', log_loss(y_val, preds_val))
        if X_test is not None:
            X_test_feats = extract_features_fold('{}_fold{}'.format(checkname, i + 1), layer_index, X_test = X_test)
            preds_test = clf.predict(xgb.DMatrix(X_test_feats), ntree_limit=clf.best_ntree_limit)
            test_predictions.append(preds_test)
    return np.array(test_predictions).mean(axis = 0)
   


In [3]:
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/data/training_data/'
checks_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/scripts/models/checks/'
sub_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/submissions/Raw/'
sub_dst = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/submissions/'

In [4]:
X, y, train_ids = load_train(src + 'train_crops_yolo_299/')
X = X / 255.

X_test, test_ids = load_test(src + 'test_crops_yolo_299/test_crops_yolo_299/')
X_test = X_test / 255.

Read train images
Load folder Type_1 (Index: 0)
Load folder Type_2 (Index: 1)
Load folder Type_3 (Index: 2)
Read train data time: 3.89 seconds
Read test images
Read train data time: 2.07 seconds


In [None]:
fit_xgb(X, 10, -2, 'xception2_10foldSKF_yolo299')

In [5]:
test_preds = fit_xgb(X, 10, -2, 'xception2_10foldSKF_yolo299', X_test = X_test)
submission_inmem(test_preds, test_ids, 'XGB_on_xception2_10foldSKF_yolo299')
prep_sub('XGB_on_xception2_10foldSKF_yolo299')

Running 10-Fold data split
Split dataset for fold: 1
Number of training set images: 1849, validation set images: 87
Split dataset for fold: 2
Number of training set images: 1872, validation set images: 64
Split dataset for fold: 3
Number of training set images: 1862, validation set images: 74
Split dataset for fold: 4
Number of training set images: 1862, validation set images: 74
Split dataset for fold: 5
Number of training set images: 1845, validation set images: 91
Split dataset for fold: 6
Number of training set images: 1873, validation set images: 63
Split dataset for fold: 7
Number of training set images: 1868, validation set images: 68
Split dataset for fold: 8
Number of training set images: 1866, validation set images: 70
Split dataset for fold: 9
Number of training set images: 1873, validation set images: 63
Split dataset for fold: 10
Number of training set images: 1830, validation set images: 106
Extracting features and training for fold: 1
[0]	train-mlogloss:1.084	eval-mloglo