In [1]:
import numpy as np
import pandas as pd
import os
from PIL import Image
import pydicom
import matplotlib.pyplot as plt
import pylab
import cv2
from tensorflow.keras.utils import Sequence
#import gdcm
from tqdm import tqdm

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
import tensorflow.keras.regularizers as R
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [2]:
# Run configuration.
EPOCHS = 5
FOLDS = 5
BATCH_SIZE = 4

# Each CT volume is represented as NUM_IMAGES slices; the two trailing entries
# of IMAGE_DIM are the in-plane sizes every slice is resized to.
NUM_IMAGES = 140
IMAGE_DIM = (NUM_IMAGES, 55, 55)

# Competition input locations, all derived from one root (note the trailing slash).
COMP_DIR = '../input/osic-pulmonary-fibrosis-progression/'
TRAIN_PATH = COMP_DIR + 'train'
TEST_PATH = COMP_DIR + 'test'
SUB_PATH = COMP_DIR + 'sample_submission.csv'

In [3]:
comp_dir = '../input/osic-pulmonary-fibrosis-progression'

# Read the three competition tables in one pass (train, sample submission, test).
train_data, sub, test_data = (
    pd.read_csv(os.path.join(comp_dir, csv_name))
    for csv_name in ('train.csv', 'sample_submission.csv', 'test.csv')
)

# keep=False removes *every* row of a repeated (Patient, Weeks) pair,
# not just the later occurrences.
train_data = train_data.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

In [4]:
# One row per patient: drop_duplicates keeps the first row it sees for each
# Patient, and that row's measurements are renamed to serve as the baseline.
train_data_u = (
    train_data
    .drop_duplicates(subset=['Patient'])
    .rename(columns={'Weeks': 'Base_Week', 'FVC': 'Base_FVC', 'Percent': 'Base_Percent'})
)

# Base_FVC is Base_Percent % of this value.
train_data_u['Typical_FVC'] = train_data_u.Base_FVC.values / train_data_u.Base_Percent.values * 100

# Attach the per-patient baseline columns to every visit row; the demographic
# columns are dropped first because train_data already carries them.
train_data = train_data.merge(
    train_data_u.drop(columns=['Age', 'Sex', 'SmokingStatus']),
    on='Patient',
    how='left',
)

In [5]:
# Patient_Week is split on '_': the first piece is the patient id and the
# last piece is the (possibly negative) week number.
patient_week_parts = sub['Patient_Week'].str.split('_')
sub['Patient'] = patient_week_parts.str[0]
sub['Weeks'] = patient_week_parts.str[-1].astype('int64')

# Discard the placeholder Confidence column and keep only the key columns.
sub = sub.drop(columns=['Confidence'])
sub = sub[['Patient', 'Weeks', 'Patient_Week']]

In [6]:
# The measurements in test.csv are treated as each patient's baseline, using
# the same column names and Typical_FVC formula as the training frame.
test_data = (
    test_data
    .rename(columns={'Weeks': 'Base_Week', 'FVC': 'Base_FVC', 'Percent': 'Base_Percent'})
    .assign(Typical_FVC=lambda frame: frame.Base_FVC.values / frame.Base_Percent.values * 100)
)

# Give every requested (Patient, Weeks) row its patient's baseline features.
sub = pd.merge(sub, test_data, on='Patient', how='left')

In [7]:
# Label each frame so the rows can be separated again after they are stacked.
for frame, split_name in ((train_data, 'train'), (sub, 'test')):
    frame['Type'] = split_name

In [8]:
# Stack train rows on top of the submission rows. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
# equivalent. The original index labels are kept (no ignore_index), exactly
# as append did.
data = pd.concat([train_data, sub])

In [9]:
# Show the combined train/submission frame (row index labels are not reset, so they repeat).
data

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Base_Week,Base_FVC,Base_Percent,Typical_FVC,Type,Patient_Week
0,ID00007637202177411956430,-4,2315.0,58.253649,79,Male,Ex-smoker,-4,2315,58.253649,3974.0,train,
1,ID00007637202177411956430,5,2214.0,55.712129,79,Male,Ex-smoker,-4,2315,58.253649,3974.0,train,
2,ID00007637202177411956430,7,2061.0,51.862104,79,Male,Ex-smoker,-4,2315,58.253649,3974.0,train,
3,ID00007637202177411956430,9,2144.0,53.950679,79,Male,Ex-smoker,-4,2315,58.253649,3974.0,train,
4,ID00007637202177411956430,11,2069.0,52.063412,79,Male,Ex-smoker,-4,2315,58.253649,3974.0,train,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,ID00419637202311204720264,133,,,73,Male,Ex-smoker,6,3020,70.186855,4302.8,test,ID00419637202311204720264_133
726,ID00421637202311550012437,133,,,68,Male,Ex-smoker,15,2739,82.045291,3338.4,test,ID00421637202311550012437_133
727,ID00422637202311677017371,133,,,73,Male,Ex-smoker,6,1930,76.672493,2517.2,test,ID00422637202311677017371_133
728,ID00423637202312137826377,133,,,72,Male,Ex-smoker,17,3294,79.258903,4156.0,test,ID00423637202312137826377_133


In [10]:
# Target column.
prediction_col = ["FVC"]
# Numeric columns that get min-max scaled. (The misspelt name is kept because
# other cells refer to it.)
Continuos_cols = ["Weeks","Base_Week","Base_FVC","Typical_FVC","Age","Percent","Base_Percent"]
# Categorical columns. The frame's column is 'SmokingStatus'; the previous
# 'Smoking_status' entry did not match any column.
Categorical_cols = ['Sex','SmokingStatus']

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Min-max scale the continuous columns. The scaler is fitted on the combined
# frame, i.e. on train and submission rows together.
scaler = MinMaxScaler()
scaler.fit(data[Continuos_cols])
conti = scaler.transform(data[Continuos_cols])
data[Continuos_cols] = conti

In [13]:
# Mean baseline Percent per smoking status, then per sex (one line each,
# printed in the order listed here).
for column, levels in (
    ('SmokingStatus', ('Never smoked', 'Currently smokes', 'Ex-smoker')),
    ('Sex', ('Male', 'Female')),
):
    for level in levels:
        in_group = train_data_u[column] == level
        print(np.mean(train_data_u.loc[in_group, 'Base_Percent'].values))

79.50710799558858
98.51592571905941
79.08661822268049
78.86695444921291
85.19475419165079


In [14]:
# One-hot indicators for Sex and SmokingStatus, built as (n_rows, 1) float64
# arrays from element-wise comparisons instead of per-row loops.
sex_values = np.asarray(data['Sex'].values, dtype=object)
smoking_values = np.asarray(data['SmokingStatus'].values, dtype=object)

is_ex_smoker = smoking_values == 'Ex-smoker'
is_never_smoked = smoking_values == 'Never smoked'

sex_m = (sex_values == 'Male').astype(np.float64).reshape(-1, 1)
sex_f = (sex_values == 'Female').astype(np.float64).reshape(-1, 1)
sm_es = is_ex_smoker.astype(np.float64).reshape(-1, 1)
sm_ns = is_never_smoked.astype(np.float64).reshape(-1, 1)
# Catch-all: every status that is neither 'Ex-smoker' nor 'Never smoked'
# (including any unexpected or missing value) is flagged here.
sm_cs = (~(is_ex_smoker | is_never_smoked)).astype(np.float64).reshape(-1, 1)

data['sex_m'] = sex_m
data['sex_f'] = sex_f
data['sm_es'] = sm_es
data['sm_ns'] = sm_ns
data['sm_cs'] = sm_cs

In [15]:
# The nine model inputs: scaled numeric features followed by the sex and
# smoking-status indicator columns.
x_cols = [
    'Weeks', 'Base_Week', 'Base_FVC', 'Age',
    'sex_m', 'sex_f',
    'sm_es', 'sm_ns', 'sm_cs',
]

In [16]:
# Split the combined frame back into train / test arrays.
# np.float was only an alias for the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24. np.float64 is the same dtype and works on
# every NumPy version.
x_train = data[x_cols].loc[data['Type'] == "train"].values.astype(np.float64)
y_train = data[prediction_col].loc[data['Type'] == "train"].values.astype(np.float64)
x_test = data[x_cols].loc[data['Type'] == "test"].values.astype(np.float64)

In [17]:
# Show the combined frame again, now including the sex_* / sm_* indicator columns.
data

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Base_Week,Base_FVC,Base_Percent,Typical_FVC,Type,Patient_Week,sex_m,sex_f,sm_es,sm_ns,sm_cs
0,ID00007637202177411956430,0.055172,2315.0,0.236393,0.769231,Male,Ex-smoker,0.011905,0.241456,0.135886,0.714466,train,,1.0,0.0,1.0,0.0,0.0
1,ID00007637202177411956430,0.117241,2214.0,0.215941,0.769231,Male,Ex-smoker,0.011905,0.241456,0.135886,0.714466,train,,1.0,0.0,1.0,0.0,0.0
2,ID00007637202177411956430,0.131034,2061.0,0.184960,0.769231,Male,Ex-smoker,0.011905,0.241456,0.135886,0.714466,train,,1.0,0.0,1.0,0.0,0.0
3,ID00007637202177411956430,0.144828,2144.0,0.201767,0.769231,Male,Ex-smoker,0.011905,0.241456,0.135886,0.714466,train,,1.0,0.0,1.0,0.0,0.0
4,ID00007637202177411956430,0.158621,2069.0,0.186580,0.769231,Male,Ex-smoker,0.011905,0.241456,0.135886,0.714466,train,,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,ID00419637202311204720264,1.000000,,,0.615385,Male,Ex-smoker,0.130952,0.372400,0.244706,0.813374,test,ID00419637202311204720264_133,1.0,0.0,1.0,0.0,0.0
726,ID00421637202311550012437,1.000000,,,0.487179,Male,Ex-smoker,0.238095,0.320208,0.352843,0.523268,test,ID00421637202311550012437_133,1.0,0.0,1.0,0.0,0.0
727,ID00422637202311677017371,1.000000,,,0.615385,Male,Ex-smoker,0.130952,0.169948,0.303848,0.276239,test,ID00422637202311677017371_133,1.0,0.0,1.0,0.0,0.0
728,ID00423637202312137826377,1.000000,,,0.589744,Male,Ex-smoker,0.261905,0.423291,0.327434,0.769215,test,ID00423637202312137826377_133,1.0,0.0,1.0,0.0,0.0


In [18]:
# Down-cast all three arrays to float32 in one step.
x_train, y_train, x_test = (
    array.astype(np.float32) for array in (x_train, y_train, x_test)
)

In [19]:
# Sanity check: shapes of the train features, train target and test features.
x_train.shape,y_train.shape,x_test.shape

((1535, 9), (1535, 1), (730, 9))

In [20]:
# Sanity check: scalar type of a single feature value.
type(x_train[0][3])

numpy.float32

In [21]:
# Sanity check: container types of the three arrays.
type(x_train),type(y_train),type(x_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [22]:
class Data_Generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields ``[ct_volumes, tabular_rows]`` batches.

    For each sample the generator reads the DICOM files found in the
    directory named after the patient id (under ``TRAIN_PATH`` or
    ``TEST_PATH``), resizes every slice and stacks them into one volume of
    shape ``(dim[0], dim[1], dim[2], 1)``.

    Parameters
    ----------
    batch_size : int
        Number of samples per batch.
    patient_ids : sequence of str
        One directory name per sample.
    tab_data : indexable
        Tabular feature row for each sample, aligned with ``patient_ids``.
    dim : tuple
        ``(number_of_slices, height, width)`` of the produced volume.
    target : indexable, optional
        Target for each sample; read only when ``train`` is true.
    train : bool
        When true, volumes are read from ``TRAIN_PATH`` and batches are
        ``([X, tab_X], y)``; otherwise volumes are read from ``TEST_PATH``
        and batches are ``[X, tab_X]``.
    augment : bool
        When true, every loaded slice is passed through
        ``self.ImageAugment``. This class does not define that method (see
        the note above ``__getitem__``), so a subclass has to provide it.
    """

    def __init__(self,batch_size,patient_ids,tab_data,dim,target=None,train=True,augment=False):
        self.batch_size = batch_size
        self.image_ids = patient_ids
        self.augment = augment
        self.dim = dim
        self.target = target
        self.indices = range(len(self.image_ids))
        self.train = train
        self.tab_data = tab_data

    def _load_slice(self, dcm_path):
        """Read one DICOM file and return it as a ``(dim[1], dim[2], 1)`` array."""
        im = pydicom.dcmread(dcm_path)
        img = im.pixel_array/255
        # cv2.resize expects (width, height); the result then has shape
        # (dim[1], dim[2]) for non-square targets as well.
        img = cv2.resize(img, (self.dim[2], self.dim[1]))
        return np.reshape(img, (self.dim[1], self.dim[2], 1))

    def getimage(self,image_id):
        """Build the slice volume for one patient directory.

        At most ``dim[0]`` slices are used. When the directory holds more
        files, a centred window of the sorted file list is taken. Slices
        that cannot be read or decoded are skipped and stay zero-filled.

        Raises
        ------
        NotImplementedError
            If ``augment`` is set but no ``ImageAugment`` method exists.
        """
        num_slices = self.dim[0]
        X1 = np.zeros((num_slices, self.dim[1], self.dim[2], 1))

        if self.augment and not hasattr(self, 'ImageAugment'):
            raise NotImplementedError(
                "augment=True requires an ImageAugment(image) method, "
                "which Data_Generator does not define"
            )

        # Read from the directory that matches the mode. Previously the
        # files were always opened from TRAIN_PATH, even for test patients.
        patient_dir = os.path.join(TRAIN_PATH if self.train else TEST_PATH, image_id)

        # Sort so the slice order is deterministic (os.listdir order is
        # arbitrary). NOTE(review): this is a plain string sort, which is not
        # necessarily the anatomical slice order - confirm the file naming.
        dcm_files = sorted(os.listdir(patient_dir))
        start = max((len(dcm_files) - num_slices)//2, 0)

        for i, dcm_i in enumerate(dcm_files[start:start + num_slices]):
            try:
                img = self._load_slice(os.path.join(patient_dir, dcm_i))
            except Exception:
                # Best effort: an unreadable slice leaves its row as zeros.
                continue
            if self.augment:
                img = self.ImageAugment(img)
            X1[i,] = img

        return X1

    def on_epoch_end(self):
        """Return the sample indices unchanged (no shuffling between epochs)."""
        return self.indices

    def getdata(self, image_id_list):
        """Stack the volumes of ``image_id_list`` into one batch array."""
        # Sized by the actual number of ids so a short final batch carries
        # no uninitialised rows.
        X = np.empty((len(image_id_list), *self.dim, 1))
        for i, im_id in enumerate(image_id_list):
            X[i,] = self.getimage(im_id)

        return X

    # Disabled augmentation hook, kept for reference. ImageAugmentor is not
    # defined anywhere in the visible notebook.
    #
    # def ImageAugment(self,image):
    #     augmentor = ImageAugmentor(image,axis_point=[self.dim/2,self.dim/2])
    #     augmentor.cutmix()
    #     #augmentor.zoom()
    #     augmentor.flip()
    #     augmentor.rotate()
    #     return augmentor.get_image()

    def __getitem__(self,index):
        """Return batch ``index``: ``([X, tab_X], y)`` when training, else ``[X, tab_X]``."""
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]

        image_id_list = [self.image_ids[k] for k in indices]
        tab_X = np.array([self.tab_data[k] for k in indices]).astype(np.float32)
        X = self.getdata(image_id_list)
        if self.train:
            target_list = [self.target[k] for k in indices]
            y = np.array(target_list).astype(np.float32)
            return [X,tab_X],y
        return [X,tab_X]

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        # Rounding up keeps the trailing samples; rounding down dropped them,
        # which leaves predictions shorter than the input at inference time.
        return int(np.ceil(len(self.indices)/self.batch_size))
    

In [23]:
# C1: lower bound applied to the predicted spread; C2: upper bound on the absolute error.
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Mean of ``sqrt(2) * delta / sigma + log(sqrt(2) * sigma)`` over the batch.

    ``y_pred`` holds three columns per row: column 1 is the FVC estimate and
    ``column 2 - column 0`` is the spread ``sigma``. ``sigma`` is floored at
    ``C1``, and ``delta = |y_true[:, 0] - fvc|`` is capped at ``C2``.
    NOTE(review): this looks like the negated competition Laplace
    log-likelihood (lower is better) - confirm against the metric definition.
    """
    # The casts must be assigned: tf.cast returns a new tensor, so calling it
    # without using the result (as before) had no effect.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    sigma = y_pred[:,2]-y_pred[:,0]
    fvc_pred = y_pred[:,1]
    #sigma_clip = sigma + C1
    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.abs(y_true[:,0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt( tf.dtypes.cast(2, dtype=tf.float32) )
    metric = (delta / sigma_clip)*sq2 + tf.math.log(sigma_clip* sq2)
    return K.mean(metric)
#============================#
def qloss(y_true, y_pred):
    """Pinball (quantile) loss averaged over the 0.2 / 0.5 / 0.8 quantiles.

    ``y_pred`` has one column per quantile; ``y_true`` is broadcast against
    those columns by the subtraction.
    """
    # Align dtypes explicitly, as ``score`` does, so the subtraction also
    # works when the function is called directly with float64 targets.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    qs = [0.2,0.50,0.8]
    # Shape (1, 3): broadcasts across the batch dimension.
    q = tf.constant([qs], dtype=tf.float32)
    e = y_true - y_pred
    v = tf.maximum(q*e,(q-1)*e)
    return K.mean(v)
#=============================#
def mloss(_lambda):
    """Build a loss that blends ``qloss`` and ``score``.

    The returned function computes
    ``_lambda * qloss + (1 - _lambda) * score``. The inner function is
    deliberately named ``loss``: that is the key used for it in
    ``custom_objects`` when the saved model is loaded.
    """
    def loss(y_true, y_pred):
        pinball_term = _lambda * qloss(y_true, y_pred)
        score_term = (1 - _lambda)*score(y_true, y_pred)
        return pinball_term + score_term
    return loss
#=================

In [24]:
# These assignments replace the arrays built earlier in the notebook, so the
# float32 cast is re-applied here; without it the earlier cast was silently
# discarded.
x_train = data[x_cols].loc[data['Type'] == "train"].values.astype(np.float32)
y_train = data[prediction_col].loc[data['Type'] == "train"].values.astype(np.float32)
x_test = data[x_cols].loc[data['Type'] == "test"].values.astype(np.float32)

In [25]:
# The model is only used for prediction, so load it without restoring the
# compiled loss/metrics. The previous call mapped 'loss' to `mloss`, which is
# the factory taking `_lambda`, not a (y_true, y_pred) loss function, so the
# restored training configuration would have been unusable anyway.
model = M.load_model('../input/tab-data-osic/dense_model.h5', compile=False)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 9)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          1280        input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          16512       dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 64)           8256        dense_1[0][0]                    
______________________________________________________________________________________________

In [26]:
# Patient id for every submission row, in row order.
test_patient_ids = data.loc[data['Type'] == "test", 'Patient'].values

In [27]:
# Image + tabular generator over the submission rows, two samples per batch.
test_gen = Data_Generator(
    batch_size=2,
    patient_ids=test_patient_ids,
    tab_data=x_test,
    dim=IMAGE_DIM,
    train=False,
)

In [28]:
# Progress marker before prediction starts.
print("Inferencing")

Inferencing


In [29]:
# Number of submission rows to predict.
print(len(test_patient_ids))

730


In [30]:
# Predict from the tabular features alone (x_test is passed directly; test_gen is not used here).
# Columns 0, 1 and 2 of `pred` are consumed by the following cells.
pred = model.predict(x_test,verbose=1)



In [31]:
# Confidence is the spread between the outer prediction columns, floored at
# 70 (the same value `score` uses as its lower bound C1).
conf = np.maximum(pred[:,2] - pred[:,0], 70)

In [32]:
# Collect the point estimate (middle column) and its confidence in one frame.
pred_dict = dict(FVC=pred[:,1], Confidence=conf)
pred_df = pd.DataFrame(data=pred_dict)

In [33]:
# Copy the predictions onto the submission rows; assignment aligns on the
# row index of both frames.
for column in ('Confidence', 'FVC'):
    sub[column] = pred_df[column]

In [34]:
# Independent copy holding only the three columns written to the submission file.
subm = sub.loc[:, ['Patient_Week','FVC','Confidence']].copy()

In [35]:
# Write the submission file to the working directory, without the row index.
subm.to_csv("submission.csv", index=False)

In [36]:
# Show the final submission frame.
subm

Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,3086.141113,233.427734
1,ID00421637202311550012437_-12,2869.516846,198.660889
2,ID00422637202311677017371_-12,2018.793945,224.039673
3,ID00423637202312137826377_-12,3461.710938,178.082520
4,ID00426637202313170790466_-12,2937.440674,223.564453
...,...,...,...
725,ID00419637202311204720264_133,2735.941162,450.392578
726,ID00421637202311550012437_133,2461.265381,408.124756
727,ID00422637202311677017371_133,1827.992432,369.147461
728,ID00423637202312137826377_133,3019.620605,447.003174
