In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import tensorflow as tf
from keras import backend as K
K.set_image_dim_ordering('th') 

import numpy as np
import pandas as pd
import cv2
import zarr
import glob
import time
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import load_model, Model
from keras.utils.np_utils import to_categorical

from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
import xgboost as xgb

In [None]:
# Root of the stage-1 zarr archive. Every other stage-1 path in this cell is
# derived from it so the location only has to be edited in one place.
zarr_dir = '/home/w/DS_Projects/Kaggle/DS Bowl 2017/input_data/stage1_zarr/DSBowl.zarr/'

# Read-only handle; load_zarr() reads the 'lung_mask' group through it.
zarr_store = zarr.DirectoryStore(zarr_dir)
zarr_load_group = zarr.hierarchy.open_group(store=zarr_store, mode='r')
# Directory entries of the lung_mask group; used to filter the label CSV rows.
dsb_pats = os.listdir(os.path.join(zarr_dir, 'lung_mask/'))

# Second handle on the same archive, opened in append mode so that the
# 'features' group can be created and written by save_zarr().
# NOTE: zarr_store is rebound here; zarr_load_group keeps the read-only store.
zarr_store = zarr.DirectoryStore(zarr_dir.rstrip('/'))
zarr_group = zarr.hierarchy.open_group(store=zarr_store, mode='a')
feats_group = zarr_group.require_group('features')


def save_zarr(id_patient, features):
    """Write one patient's feature array into the ``features`` zarr group.

    Parameters
    ----------
    id_patient : name under which the array is created in ``feats_group``.
    features : array-like data to store.

    The array is created with a chunk length of 64*32*32 elements, a Blosc
    compressor (zstd, level 9, shuffle=2) and a thread synchronizer.
    """
    blosc_zstd = zarr.Blosc(clevel=9, cname="zstd", shuffle=2)
    thread_sync = zarr.ThreadSynchronizer()
    feats_group.array(
        id_patient,
        features,
        chunks=64 * 32 * 32,
        compressor=blosc_zstd,
        synchronizer=thread_sync,
    )
    
def load_zarr(patient_id):
    """Return the stage-1 ``lung_mask`` entry for ``patient_id`` as a float32 array.

    The entry is read through the read-only ``zarr_load_group`` handle and
    materialised as a NumPy array before the dtype conversion.
    """
    stored = zarr_load_group['lung_mask'][patient_id]
    in_memory = np.array(stored)
    return in_memory.astype(np.float32)

def load_data(start, end):
    """Load cropped, rescaled lung masks and per-slice labels for a row range.

    Rows ``start:end`` of ``stage1_labels.csv`` are read and restricted to the
    ids present in ``dsb_pats``. For every remaining patient the mask volume is
    loaded with ``load_zarr``, values <= 1.0 are zeroed, the volume is cropped
    along axis 0 to the span of slices that still contain non-zero values, and
    the result is divided by 255.

    Parameters
    ----------
    start, end : int
        Slice bounds applied to the rows of the label CSV.

    Returns
    -------
    masks : ndarray
        All cropped volumes concatenated along axis 0, starting from an empty
        ``(0, 1, 512, 512)`` array.
    df : DataFrame
        The label rows that were actually used.
    labels : ndarray
        One entry per slice in ``masks``: 1.0 or 0.0, copied from the
        patient's label (column 1 of the CSV).

    Raises
    ------
    ValueError
        If a patient's label is not 0 or 1. Previously such a row either
        crashed with an unbound ``label`` or silently reused the previous
        patient's label array.
    IndexError
        If a volume has no value above 1.0 (nothing to crop to).
    """
    print('Loading 2D full-size candidates.')
    df = pd.read_csv('/home/w/DS_Projects/Kaggle/DS Bowl 2017/input_data/stage1_labels.csv')[start:end]
    df = df[df['id'].isin(dsb_pats)]
    t = time.time()
    # Collect the per-patient pieces and concatenate once at the end; growing
    # the big array inside the loop recopied everything on every iteration.
    # The empty float64 seeds keep the result shape/dtype of the old code,
    # including for an empty selection.
    mask_parts = [np.zeros((0, 1, 512, 512))]
    label_parts = [np.zeros(0)]
    for i in range(len(df)):
        patient_id = df.iloc[i, 0]
        cancer = df.iloc[i, 1]
        if cancer not in (0, 1):
            raise ValueError('Unexpected label {!r} for patient {}; expected 0 or 1.'
                             .format(cancer, patient_id))
        mask = load_zarr('{}'.format(patient_id))
        mask[mask <= 1.0] = 0.
        z_nonzero = np.unique(np.nonzero(mask)[0])
        # NOTE(review): the end index is exclusive, so the last non-empty slice
        # is dropped. Kept as is to stay in step with the other loaders in this
        # notebook - confirm whether that is intended.
        mask = mask[z_nonzero[0]:z_nonzero[-1], :, :]
        mask = mask/255.
        mask_parts.append(mask)
        label_parts.append(np.full(mask.shape[0], float(cancer)))
    masks = np.concatenate(mask_parts, 0)
    labels = np.concatenate(label_parts, 0)
    print('Data shape:', masks.shape)
    print('Time it took to load the data:', time.time() - t)
    return masks, df, labels


In [None]:
def save_features(start, end):
    """Compute per-patient ``m2`` features for a row range and store them in zarr.

    Rows ``start:end`` of ``stage1_labels_full.csv`` are read and restricted to
    the ids present in ``dsb_pats``. For each patient the mask volume is loaded
    with ``load_zarr``, values <= 1.0 are zeroed, the volume is cropped along
    axis 0 to the span of non-empty slices and divided by 255. ``m2.predict``
    is run on the result, the predictions are averaged over axis 0, flattened
    and written with ``save_zarr`` under the patient id.

    Parameters
    ----------
    start, end : int
        Slice bounds applied to the rows of the label CSV.

    Returns
    -------
    None
    """
    print('Loading 2D full-size candidates.')
    df = pd.read_csv('/home/w/DS_Projects/Kaggle/DS Bowl 2017/input_data/stage1_labels_full.csv')[start:end]
    df = df[df['id'].isin(dsb_pats)]
    t = time.time()
    # No in-memory feature matrix is kept here: each patient's vector goes
    # straight to zarr (the old preallocated array was never used).
    for i in range(len(df)):
        patient_id = df.iloc[i, 0]
        print('Predicting features for patient - index: {}, ID: {}'.format(i, patient_id))
        mask = load_zarr('{}'.format(patient_id))
        mask[mask <= 1.0] = 0.
        z_nonzero = np.unique(np.nonzero(mask)[0])
        # NOTE(review): exclusive end index drops the last non-empty slice;
        # kept unchanged so stored features match the other extractors.
        mask = mask[z_nonzero[0]:z_nonzero[-1], :, :]
        print('Nonzero mask shape:', mask.shape[0])
        mask = mask/255.
        preds = m2.predict(mask)
        preds_mean = np.mean(preds, axis=0).reshape(-1)
        save_zarr(patient_id, preds_mean)
    print('Time it took to load the data & predict:', time.time() - t)
    return


def get_features(start, end):
    """Extract one flattened ``m`` feature vector per stage-1 patient.

    Reads rows ``start:end`` of ``stage1_labels_full.csv``, keeps the ids found
    in ``dsb_pats`` and, for each patient, averages ``m.predict`` over the
    cropped and rescaled mask slices into a (64, 32, 32) block.

    Returns
    -------
    features : ndarray of shape (len(df), 64*32*32)
    df : DataFrame with the label rows that were used, in the same order.
    """
    print('Loading 2D full-size candidates.')
    labels_csv = '/home/w/DS_Projects/Kaggle/DS Bowl 2017/input_data/stage1_labels_full.csv'
    df = pd.read_csv(labels_csv)[start:end]
    df = df[df['id'].isin(dsb_pats)]
    t = time.time()
    n_patients = len(df)
    features = np.zeros((n_patients, 64, 32, 32))
    for row in range(n_patients):
        patient_id = df.iloc[row, 0]
        print('Predicting features for patient - index: {}, ID: {}'.format(row, patient_id))
        volume = load_zarr('{}'.format(patient_id))
        # Suppress everything at or below 1.0 before locating occupied slices.
        volume[volume <= 1.0] = 0.
        occupied_z = np.unique(np.nonzero(volume)[0])
        z_first, z_last = occupied_z[0], occupied_z[-1]
        # NOTE(review): z_last is used as an exclusive bound, so the final
        # occupied slice is not included - confirm this is intended.
        volume = volume[z_first:z_last, :, :]
        print('Nonzero mask shape:', volume.shape[0])
        preds = m.predict(volume / 255.)
        features[row, :] = np.mean(preds, axis=0)
    features = features.reshape((features.shape[0], -1))
    print('Time it took to load the data & predict:', time.time() - t)
    return features, df

In [None]:
# Root of the stage-2 zarr archive; the lung_mask path below is derived from
# it instead of repeating the literal.
zarr_dir2 = '/home/w/DS_Projects/Kaggle/DS Bowl 2017/input_data/stage2_zarr/DSBowl.zarr/'
zarr_store2 = zarr.DirectoryStore(zarr_dir2)
# Read-only handle; load_zarr2() reads the 'lung_mask' group through it.
zarr_load_group2 = zarr.hierarchy.open_group(store=zarr_store2, mode='r')
# Stage-2 patient ids: entries of the lung_mask directory whose name contains
# no '.' (skips dot-named entries). os.listdir gives no ordering guarantee.
dsb_pats_stg2 = [entry for entry in os.listdir(os.path.join(zarr_dir2, 'lung_mask/'))
                 if '.' not in entry]

def load_zarr2(patient_id):
    """Return the stage-2 ``lung_mask`` entry for ``patient_id`` as a float32 array.

    Same as ``load_zarr`` but reads through ``zarr_load_group2``.
    """
    stored = zarr_load_group2['lung_mask'][patient_id]
    in_memory = np.array(stored)
    return in_memory.astype(np.float32)

def get_features_test(dsb_pats):
    """Extract one flattened ``m`` feature vector per stage-2 patient.

    Parameters
    ----------
    dsb_pats : indexable sequence of patient ids to load with ``load_zarr2``.

    Returns
    -------
    ndarray of shape (len(dsb_pats), 64*32*32); row k belongs to ``dsb_pats[k]``.
    """
    print('Loading 2D full-size candidates.')
    t = time.time()
    n_patients = len(dsb_pats)
    features = np.zeros((n_patients, 64, 32, 32))
    for row in range(n_patients):
        patient_id = dsb_pats[row]
        print('Predicting features for patient - index: {}, ID: {}'.format(row, patient_id))
        volume = load_zarr2(patient_id)
        # Suppress everything at or below 1.0 before locating occupied slices.
        volume[volume <= 1.0] = 0.
        occupied_z = np.unique(np.nonzero(volume)[0])
        z_first, z_last = occupied_z[0], occupied_z[-1]
        # NOTE(review): z_last is used as an exclusive bound, so the final
        # occupied slice is not included - confirm this is intended.
        volume = volume[z_first:z_last, :, :]
        print('Nonzero mask shape:', volume.shape[0])
        preds = m.predict(volume / 255.)
        features[row, :] = np.mean(preds, axis=0)
    features = features.reshape((features.shape[0], -1))
    print('Time it took to load the data & predict:', time.time() - t)
    return features

In [None]:
# Directory that holds the trained CNN checkpoint files.
src_checks = '/home/w/DS_Projects/Kaggle/DS Bowl 2017/Scripts/LUNA/CNN/Checkpoints/'
# Restore the full 2D classifier from its .h5 checkpoint.
cnn = load_model(src_checks + '1stgentry_CNN2DClassifier.h5')
# Two truncated models that reuse the classifier's input and end at an
# intermediate layer, so predict() yields that layer's output:
#   m  - used by get_features() / get_features_test()
#   m2 - used by save_features()
# The layer offsets are tied to this particular checkpoint's architecture.
m = Model(input = cnn.input, output = cnn.layers[-12].output) # last MaxPool
m2 = Model(input = cnn.input, output = cnn.layers[-5].output) # last Dense(128)

In [None]:
# Training split: label rows 0-1499. get_features returns the feature matrix
# together with the label rows it actually used.
X_train, train_rows = get_features(0, 1500)
y_train = train_rows['cancer'].values

# Validation split: label rows 1500-1594.
X_test, valid_rows = get_features(1500, 1595)
y_test = valid_rows['cancer'].values

# Stage-2 features, one row per entry of dsb_pats_stg2.
stg2_features_masks = get_features_test(dsb_pats_stg2)

In [None]:
def xgb_train(X_train, X_test, y_train, y_test):
    """Fit an ``XGBRegressor`` on the training features with early stopping.

    ``(X_test, y_test)`` is used as the evaluation set; log loss is reported
    every 50 rounds and training stops after 50 rounds without improvement.

    Returns
    -------
    The fitted regressor.
    """
    # NOTE(review): a regressor is evaluated with 'logloss' here; confirm the
    # predictions are meant to be used as probabilities.
    regressor_kwargs = dict(
        max_depth=10,
        n_estimators=10000,
        min_child_weight=15,
        learning_rate=0.03,
        nthread=8,
        subsample=0.80,
        colsample_bytree=0.75,
        seed=1337,
    )
    clf = xgb.XGBRegressor(**regressor_kwargs)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        verbose=50,
    )
    return clf


def make_submit(clf):
    """Predict stage-2 scores with ``clf`` and write the submission CSV.

    Features are computed by ``get_features_test`` for the patients listed in
    ``dsb_pats_stg2``. That list comes from ``os.listdir``, which returns
    entries in arbitrary order, so predictions are matched to the rows of the
    sample submission by patient id (first CSV column) rather than by
    position. The result is written to ``stage2_sub_2DUNet_preds_masks.csv``
    and its first rows are printed.

    Parameters
    ----------
    clf : fitted model exposing ``predict(features)``.

    Raises
    ------
    ValueError
        If a patient id in the sample submission has no computed prediction.
    """
    df = pd.read_csv('/home/w/DS_Projects/Kaggle/DS Bowl 2017/input_data/stage2_sample_submission.csv')
    X_test2 = get_features_test(dsb_pats_stg2)
    pred = np.asarray(clf.predict(X_test2))

    # Row of `pred` that belongs to each patient id.
    pred_row = {str(pid): row for row, pid in enumerate(dsb_pats_stg2)}
    submission_ids = [str(pid) for pid in df.iloc[:, 0]]
    missing = [pid for pid in submission_ids if pid not in pred_row]
    if missing:
        raise ValueError('No prediction for {} submission id(s), e.g. {}'
                         .format(len(missing), missing[0]))

    # Reorder predictions into submission order before assigning.
    df['cancer'] = pred[[pred_row[pid] for pid in submission_ids]]
    df.to_csv('stage2_sub_2DUNet_preds_masks.csv', index=False)
    print(df.head())
    return

In [None]:
# Fit the XGBoost model on the training features; the validation split is
# passed as the evaluation set for early stopping.
clf = xgb_train(X_train, X_test, y_train, y_test)

In [None]:
# Predict the stage-2 patients with the fitted model and write the submission CSV.
make_submit(clf)

In [None]:
# Persist the train/validation feature matrices and label vectors as .npy
# files in the working directory, so they can be reloaded without re-running
# the CNN.
for _stem, _array in (
        ('2D_train0-1500_features_masks', X_train),
        ('2D_train0-1500_labels_masks', y_train),
        ('2D_valid1500-1595_features_masks', X_test),
        ('2D_valid1500-1595_labels_masks', y_test)):
    np.save(_stem, _array)