In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import seaborn as sns
import seaborn as sb
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import category_encoders as ce
from keras.optimizers import Adam

import cv2

import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

plt.style.use('seaborn-darkgrid')

In [None]:
import tensorflow as tf

# Stop the notebook early unless TensorFlow reports the first GPU.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
import torch

# Pick the torch device: CUDA when available, otherwise fall back to the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Load the competition's tabular train and test tables from the Kaggle input directory.
train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

In [None]:
# Split each Patient_Week key of the sample submission into its patient id and
# week number, then attach that patient's test-set columns (minus the test 'Weeks').
sub = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")
patient_week_parts = sub['Patient_Week'].str.split('_')
sub['Patient'] = patient_week_parts.str[0]
sub['Weeks'] = patient_week_parts.str[-1].astype(int)
sub = sub[['Patient','Weeks','Confidence','Patient_Week']].merge(
    test.drop('Weeks', axis=1), on="Patient"
)

In [None]:
# Display the expanded submission frame built in the previous cell.
sub

In [None]:
# Preview the first 100 rows of the raw training table.
train.head(100)

FVC : forced vital capacity, i.e. the volume of air exhaled

In [None]:
# Preview the first rows of the raw test table.
test.head()

In [None]:
# (rows, columns) of the raw train and test tables.
print(train.shape, test.shape)

In [None]:
# Per-column count of missing values, train first and then test.
print(train.isnull().sum(),'\n')
print(test.isnull().sum())

In [None]:
# Column summary of the training table as reported by DataFrame.info().
train.info()

In [None]:
# NOTE(review): this opens a 16x10 figure but nothing is drawn on it in this cell;
# presumably it was meant to size the next plot — confirm it has any effect.
plt.figure(figsize=(16,10))

In [None]:
# Number of training rows per sex.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sex_counts = train['Sex'].value_counts()
ax = sns.barplot(x=sex_counts.index, y=sex_counts.to_numpy())
ax.set(xlabel='Sex', ylabel='Count')
plt.show()

In [None]:
# Number of training rows per smoking status.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
smoking_counts = train['SmokingStatus'].value_counts()
ax = sns.barplot(x=smoking_counts.index, y=smoking_counts.to_numpy())
ax.set(xlabel='SmokingStatus', ylabel='Count')
plt.show()

In [None]:
# Cross-tabulate training rows by smoking status and sex and plot the counts.
# The previous title ('No of passengers survived') described a different dataset.
sa = pd.crosstab(train['SmokingStatus'],train['Sex'])
sa.plot(kind="bar",title='Number of records by smoking status and sex')
plt.show()

In [None]:
# Histogram of the Age column over all training rows (15 bins).
plt.rcParams['figure.figsize'] = (10, 5)
ax = train['Age'].hist(bins=15, alpha=0.9, color='green')
ax.set(
    xlabel='Age',
    ylabel='Count',
    title='Visualization of Ages',
)
plt.show()

In [None]:
# Histogram of the Weeks column over all training rows (15 bins).
# The title previously read 'Visualization of Ages', copied from the Age plot.
plt.rcParams['figure.figsize'] = 10,5
ax = train['Weeks'].hist(bins = 15,alpha = 0.9, color = 'green')
ax.set(xlabel = 'Weeks',ylabel = 'Count',title = 'Visualization of Weeks')
plt.show()

In [None]:
# FVC against Weeks for every training row; axes labelled so the figure stands alone.
plt.scatter(train['Weeks'],train['FVC'])
plt.xlabel('Weeks')
plt.ylabel('FVC')
plt.show()

In [None]:
# FVC against Age for every training row; axes labelled so the figure stands alone.
plt.scatter(train['Age'],train['FVC'])
plt.xlabel('Age')
plt.ylabel('FVC')
plt.show()

In [None]:
# Correlation heatmap of the numeric training columns.
# Only numeric columns are kept: DataFrame.corr() raises on string columns such as
# Patient / Sex / SmokingStatus in pandas >= 2.0 (older versions dropped them silently).
plt.rcParams['figure.figsize'] = 10,10
numeric_corr = train.select_dtypes(include='number').corr()
sb.heatmap(numeric_corr,annot = True,square = True,linewidths = 2,linecolor = 'black')

In [None]:
# Training row(s) whose FVC equals the largest FVC in the table.
train[train['FVC'] == train['FVC'].max()]

In [None]:
# Training row(s) whose FVC equals the smallest FVC in the table.
train[train['FVC'] == train['FVC'].min()]

CT slices of the patient with the maximum FVC

In [None]:
# Scan directories of the two patients referenced by the max-FVC and min-FVC cells.
imdir_max = "/kaggle/input/osic-pulmonary-fibrosis-progression/train/ID00219637202258203123958"
imdir_min = "/kaggle/input/osic-pulmonary-fibrosis-progression/train/ID00225637202259339837603"

# Lay out slices 1.dcm .. 20.dcm of imdir_max on a 5 x 4 grid.
columns, rows = 4, 5
fig = plt.figure(figsize=(12, 12))

imglist = os.listdir(imdir_max)

for i in range(1, rows * columns + 1):
    filename = f"{imdir_max}/{i}.dcm"
    ds = pydicom.dcmread(filename)
    # add_subplot makes the new axes current, so imshow draws into it.
    fig.add_subplot(rows, columns, i)
    plt.imshow(ds.pixel_array, cmap='jet')
plt.show()

CT slices of the patient with the minimum FVC

In [None]:
# Lay out slices 1.dcm .. 20.dcm of imdir_min on a 5 x 4 grid.
columns, rows = 4, 5
fig = plt.figure(figsize=(12, 12))

imglist = os.listdir(imdir_min)

for i in range(1, rows * columns + 1):
    filename = f"{imdir_min}/{i}.dcm"
    ds = pydicom.dcmread(filename)
    # add_subplot makes the new axes current, so imshow draws into it.
    fig.add_subplot(rows, columns, i)
    plt.imshow(ds.pixel_array, cmap='jet')
plt.show()

Submission File:
*     Patient_Week,FVC,Confidence
*     ID00002637202176704235138_1,2000,100
*     ID00002637202176704235138_2,2000,100
*     ID00002637202176704235138_3,2000,100

confidence = standard deviation σ

In [None]:
# Build the "<Patient>_<Weeks>" key used by the submission format.
train['Patient_Week'] = train['Patient'].astype(str).str.cat(train['Weeks'].astype(str), sep='_')
train.head()

In [None]:
# Build the "<Patient>_<Weeks>" key for the test table as well.
test['Patient_Week'] = test['Patient'].astype(str).str.cat(test['Weeks'].astype(str), sep='_')
test.head()

In [None]:
# Shapes after adding the Patient_Week column, followed by the training table itself.
print(train.shape)
print(test.shape)
train

In [None]:
# construct train input
#
# For every patient, each visit is used in turn as the "base" measurement and is
# paired with all of that patient's visits as prediction targets. The base visit
# contributes base_Week / base_FVC / base_Percent / base_Age plus Sex and
# SmokingStatus; the target visit contributes predict_Week, FVC and Patient_Week.
# Pairs where the two weeks coincide (Week_passed == 0) are dropped at the end.
#
# The per-pair frames are collected in a list and concatenated once, instead of
# growing a DataFrame with pd.concat inside the loops (quadratic copying).
# NOTE: this cell rebinds `train`, so it cannot be re-run without re-running the
# cells that load and key the raw table.

base_rename_cols = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Percent': 'base_Percent', 'Age': 'base_Age'}
target_drop_cols = ['Age', 'Sex', 'SmokingStatus', 'Percent']

pair_frames = []
gb = train.groupby('Patient')
tk0 = tqdm(gb, total=len(gb))
for _, usr_df in tk0:
    # Target side is the same for every base week of this patient: build it once.
    usr_targets = usr_df.drop(columns=target_drop_cols).rename(columns={'Weeks': 'predict_Week'})
    for week, tmp in usr_df.groupby('Weeks'):
        base_rows = tmp.drop(columns='Patient_Week').rename(columns=base_rename_cols)
        _usr_output = usr_targets.merge(base_rows, on='Patient')
        _usr_output['Week_passed'] = _usr_output['predict_Week'] - _usr_output['base_Week']
        pair_frames.append(_usr_output)

output = pd.concat(pair_frames)

train = output[output['Week_passed']!=0].reset_index(drop=True)

train.head()

In [None]:
# construct test input
#
# Re-read the raw test table as the "base" measurement of each patient and pair it
# with every Patient_Week requested by the sample submission.

base_rename_map = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Percent': 'base_Percent', 'Age': 'base_Age'}
test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv').rename(columns=base_rename_map)

submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
submission_key_parts = submission['Patient_Week'].str.split('_')
submission['Patient'] = submission_key_parts.str[0]
submission['predict_Week'] = submission_key_parts.str[1].astype(int)

test = submission.drop(columns=['FVC', 'Confidence']).merge(test, on='Patient')
test['Week_passed'] = test['predict_Week'] - test['base_Week']

test.head()

In [None]:
# Shapes of the sample submission and of the reconstructed test / train frames.
print(submission.shape)
print(test.shape)
print(train.shape)

In [None]:
# Per-column missing-value counts of the reconstructed train and test frames.
print(train.isnull().sum(),'\n')
print(test.isnull().sum())

Data Preparation

In [None]:
# Move Patient_Week out of the columns and into the index of both frames.
train = train.set_index('Patient_Week')

test = test.set_index('Patient_Week')

In [None]:
# Display the training frame, now indexed by Patient_Week.
train

In [None]:
# Target: the FVC at the predicted week. Features: every other column except the patient id.
y = train['FVC']
X = train.drop(columns=['FVC', 'Patient'])

#y_test = test['FVC']
# The test frame carries no FVC column at this point, so only the patient id is removed.
test_X = test.drop(columns=['Patient'])
#test_X = test.drop(['FVC'], axis=1)

In [None]:
# Display the training feature frame before encoding.
X

In [None]:
# Replace the two categorical columns of X with integer codes from LabelEncoder
# (label encoding, not one-hot/dummy columns). The single encoder object is
# re-fitted per column, so afterwards it only remembers the SmokingStatus labels.

enc = LabelEncoder()

for categorical_col in ('Sex', 'SmokingStatus'):
    X[categorical_col] = enc.fit_transform(X[categorical_col])

In [None]:
# Display the training feature frame after label encoding.
X

In [None]:
# Encode the test categoricals with the label -> code mapping of the *training*
# labels. Re-fitting the encoder on the test labels (as before) assigns codes from
# whatever labels happen to occur in the test set, so the same category could get a
# different integer in train and test.
# The encoders are fitted on train[...] (still holding the original strings) and
# applied to test[...] so that re-running this cell gives the same result.
# transform() raises ValueError if the test set contains a label absent from train.
for categorical_col in ('Sex', 'SmokingStatus'):
    train_label_encoder = LabelEncoder().fit(train[categorical_col])
    test_X[categorical_col] = train_label_encoder.transform(test[categorical_col])

In [None]:
# Display the encoded test feature frame.
test_X

In [None]:
# Display the training target (FVC).
y

In [None]:
#Normalizing
# NOTE(review): sklearn's `normalize` rescales each *row* (sample) to unit norm by
# default rather than scaling each feature column — confirm this is the intended
# preprocessing.
# X and test_X are rebound to the return values of normalize(); later cells index
# X with integer arrays (X[tr_idx]).

from sklearn.preprocessing import normalize

X = normalize(X)
test_X = normalize(test_X)

Modeling

1-DNN

In [None]:
# NOTE(review): this rebinds `sub`, which earlier held the expanded sample-submission frame.
sub = test_X 
# Accumulator for the test predictions: one row per test sample, 3 model outputs.
pe = np.zeros((test_X.shape[0], 3)) #for predict test
# pred = np.zeros((X.shape[0], 3)) #for predict val
# test_X was already passed through normalize() in the previous cell; this applies it again.
ze = normalize(sub)

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

# C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
# #=============================#
# def score(y_true, y_pred):
#     tf.dtypes.cast(y_true, tf.float32)
#     tf.dtypes.cast(y_pred, tf.float32)
#     sigma = y_pred[:, 2] - y_pred[:, 0]
#     fvc_pred = y_pred[:, 1]
    
#     #sigma_clip = sigma + C1
#     sigma_clip = tf.maximum(sigma, C1)
#     delta = tf.abs(y_true[:, 0] - fvc_pred)
#     delta = tf.minimum(delta, C2)
#     sq2 = tf.sqrt( tf.dtypes.cast(2, dtype=tf.float32) )
#     metric = (delta / sigma_clip)*sq2 + tf.math.log(sigma_clip* sq2)
#     return K.mean(metric)
# #============================#
# def qloss(y_true, y_pred):
#     # Pinball loss for multiple quantiles
#     qs = [0.2, 0.50, 0.8]
#     q = tf.constant(np.array([qs]), dtype=tf.float32)
#     e = y_true - y_pred
#     v = tf.maximum(q*e, (q-1)*e)
#     return K.mean(v)
# #=============================#
# def mloss(_lambda):
#     def loss(y_true, y_pred):
#         return _lambda * qloss(y_true, y_pred) + (1 - _lambda)*score(y_true, y_pred)
#     return loss
# #=================
def make_model():
    """Build and compile the fully connected FVC regressor.

    Architecture: an 8-feature input, four ReLU dense layers (1000, 1000, 500,
    500 units) and two 3-unit ReLU heads ``p1`` and ``p2`` that are combined as
    ``p1 + cumsum(p2, axis=1)`` into a 3-column output.

    The model is compiled with a regression objective (mean absolute error).
    The previous ``binary_crossentropy`` loss / ``accuracy`` metric are
    classification measures and are not meaningful for a continuous FVC target.
    The training loop passes a 1-D target, so each of the three output columns
    is fitted to the same FVC value (this relies on Keras broadcasting the
    target across the output axis — confirm with the installed version).

    Returns:
        A compiled Keras model named "CNN" (the name is kept for compatibility;
        the network contains no convolutional layers).
    """
    z = L.Input((8,), name="Patient")
    x = L.Dense(1000, activation="relu", name="d1")(z)
    x = L.Dense(1000, activation="relu", name="d2")(x)
    x = L.Dense(500, activation="relu", name="d3")(x)
    x = L.Dense(500, activation="relu", name="d4")(x)
    p1 = L.Dense(3, activation="relu", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    # p2 is non-negative (ReLU), so its cumulative sum is non-decreasing across columns.
    preds = L.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis=1),
                     name="preds")([p1, p2])

    model = M.Model(z, preds, name="CNN")
    #model.compile(loss=qloss, optimizer="adam", metrics=[score])
    opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01, amsgrad=False)
    model.compile(loss='mean_absolute_error', optimizer=opt, metrics=['mean_squared_error'])
    return model

 
# Build one model instance just to print its layer summary; the training loop creates a fresh one per fold.
net = make_model()
net.summary()

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import KFold

# NOTE(review): `scores` is not referenced again in the visible notebook.
scores = []

# Number of cross-validation folds; also the divisor used when averaging test predictions.
NFOLD = 5
# No shuffle argument is passed, so KFold's default ordering applies.
kf = KFold(n_splits=NFOLD)
# Mini-batch size used for both fit() and predict().
BATCH_SIZE = 8

In [None]:
%%time
cnt = 0
for tr_idx, val_idx in kf.split(X):  
    cnt += 1
    print(f"FOLD {cnt}")
    net = make_model()  
    net.fit(X[tr_idx], y[tr_idx], batch_size=BATCH_SIZE, epochs=400, 
            validation_data=(X[val_idx], y[val_idx]), verbose=0) #
#     print("train", net.evaluate(X[tr_idx], y[tr_idx], verbose=0, batch_size=BATCH_SIZE))
#     print("val", net.evaluate(X[val_idx], y[val_idx], verbose=0, batch_size=BATCH_SIZE))
#     print("predict val...")
#     pred[val_idx] = net.predict(X[val_idx], batch_size=BATCH_SIZE, verbose=0)    
    print("predict test...")   
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD 

2-xgboost

In [None]:
# #K Fold Cross Validation
  
# from sklearn.model_selection import KFold 
# import xgboost as xgb


# kf = KFold(n_splits=20, random_state=42, shuffle=True)

# for train_index, val_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", val_index)
#     X_train, X_val = X[train_index], X[val_index]
#     y_train, y_val = y[train_index], y[val_index]

In [None]:
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dvalid = xgb.DMatrix(X_val, label=y_val)
# watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# xgb_pars = {'min_child_weight': 10, 'eta': 0.04, 'colsample_bytree': 0.8, 'max_depth': 15,
#             'subsample': 0.75, 'lambda': 2, 'nthread': -1, 'booster' : 'gbtree', 'silent': 1, 'gamma' : 0,
#             'eval_metric': 'rmse', 'objective': 'reg:linear'}    

# model = xgb.train(xgb_pars, dtrain, 500, watchlist, early_stopping_rounds=250,
#                   maximize=False, verbose_eval=15) 

In [None]:
# ze = normalize(test_X)
# ze

In [None]:
# dtest = xgb.DMatrix(ze)

# pred = model.predict(dtest)

3-RNN

In [None]:
# #K Fold Cross Validation

# from sklearn.model_selection import KFold


# kf = KFold(n_splits=5, random_state=2020, shuffle=True)

# for train_index, val_index in kf.split(X):
#     print("TRAIN:", train_index, "TEST:", val_index)
#     X_train, X_val = X[train_index], X[val_index]
#     y_train, y_val = y[train_index], y[val_index]

In [None]:
# print(X_train.shape)
# print(y_train.shape)
# print(X_val.shape)
# print(y_val.shape)

# #reshape for rnn

# X_train = X_train.reshape(-1, 1, 8)
# X_val  = X_val.reshape(-1, 1, 8)
# y_train = y_train.values #convert pd to array
# y_train = y_train.reshape(-1, 1,)
# y_val = y_val.values #convert pd to array
# y_val = y_val.reshape(-1, 1,)

# print(X_train.shape)
# print(y_train.shape)
# print(X_val.shape)
# print(y_val.shape)

In [None]:
# from tensorflow.keras.layers import Conv2D,LSTM,LeakyReLU, MaxPooling2D,Concatenate,Input, Dropout, Flatten, Dense, GlobalAveragePooling2D,Activation, BatchNormalization
# from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
# from tensorflow.keras.models import Model


#   # create model
    

# #input 
# input_layer = Input(shape=(1,8))
# main_rnn_layer = LSTM(64, return_sequences=True, recurrent_dropout=0.2)(input_layer)

    
# #output
# rnn = LSTM(32)(main_rnn_layer)
# dense = Dense(128)(rnn)
# dropout_c = Dropout(0.3)(dense)
# dense = Dense(64)(dropout_c)
# dropout_c = Dropout(0.3)(dense)
# dense = Dense(32)(dropout_c)
# dropout_c = Dropout(0.3)(dense)

# classes = Dense(1, activation= LeakyReLU(alpha=0.1),name="class")(dropout_c)

# model = Model(input_layer, classes)

# # Compile model
# callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=4, verbose=1, factor=0.6),
#              EarlyStopping(monitor='val_loss', patience=20),
#              ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]
# model.compile(loss=[tf.keras.losses.MeanSquaredLogarithmicError(),tf.keras.losses.MeanSquaredLogarithmicError()], optimizer="adam")


# model.summary()
# # Fit the model
# history = model.fit(X_train, y_train, 
#           epochs = 250, 
#           batch_size = 8, 
#           validation_data=(X_val,  y_val), 
#           callbacks=callbacks)

In [None]:
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Loss over epochs')
# plt.ylabel('Loss')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Validation'], loc='best')
# plt.show()

In [None]:
# model.load_weights("best_model.h5")

# test_X = test_X.reshape(-1, 1,8)


# predictions = model.predict(test_X)

In [None]:
# Shape of the averaged test predictions (rows x 3 outputs).
print(pe.shape)

In [None]:
 # Shape of the first output column, which a later cell uses as FVC_pred.
 pe[:,0].shape

![](https://i.imgur.com/EV7xPrl.png)

In [None]:
# Display the reconstructed test frame (indexed by Patient_Week).
test

In [None]:
# Work on a copy of the test frame so prediction columns can be added without touching `test`.
submis = test.copy() #for Patient_Week
submis

In [None]:
import math

# Compute the baseline score with a constant Confidence of 100.
# The absolute error is taken between base_FVC and the first model output column.
sqrt2 = math.sqrt(2)

submis['FVC_pred'] = pe[:,0]

# baseline score
submis['Confidence'] = 100
submis['sigma_clipped'] = submis['Confidence'].clip(lower=70)   # sigma floor of 70
submis['diff'] = (submis['base_FVC'] - submis['FVC_pred']).abs()
submis['delta'] = submis['diff'].clip(upper=1000)               # error cap of 1000
submis['score'] = -sqrt2*submis['delta']/submis['sigma_clipped'] - np.log(sqrt2*submis['sigma_clipped'])
score = submis['score'].mean()
print(score)

In [None]:
import scipy as sp
from functools import partial

def loss_func(weight, row):
    """Return the negated score of one row for a candidate confidence.

    Args:
        weight: candidate confidence; values below 70 are treated as 70.
        row: mapping with 'base_FVC' and 'FVC_pred'; their absolute difference
            is capped at 1000.

    Returns:
        sqrt(2) * delta / sigma + log(sqrt(2) * sigma), i.e. the quantity to minimise.
    """
    root2 = math.sqrt(2)
    sigma = max(weight, 70)
    err = min(abs(row['base_FVC'] - row['FVC_pred']), 1000)
    return root2 * err / sigma + np.log(root2 * sigma)

# Import the submodule explicitly: `import scipy as sp` alone does not guarantee
# that `sp.optimize` has been loaded (it only works if another package imported it).
from scipy.optimize import minimize

# For every test row, search for the confidence that minimises loss_func,
# starting from 100 and without bounds.
results = []
tk0 = tqdm(submis.iterrows(), total=len(submis))
for _, row in tk0:
    loss_partial = partial(loss_func, row=row)
    weight = [100]
    #bounds = [(70, 100)]
    #result = minimize(loss_partial, weight, method='SLSQP', bounds=bounds)
    result = minimize(loss_partial, weight, method='SLSQP')
    x = result['x']
    results.append(x[0])

In [None]:
# optimized score
# Same score as the baseline cell, now using the per-row optimised confidences.
sqrt2 = math.sqrt(2)

submis['Confidence'] = results
submis['sigma_clipped'] = submis['Confidence'].clip(lower=70)   # sigma floor of 70
submis['diff'] = (submis['base_FVC'] - submis['FVC_pred']).abs()
submis['delta'] = submis['diff'].clip(upper=1000)               # error cap of 1000
submis['score'] = -sqrt2*submis['delta']/submis['sigma_clipped'] - np.log(sqrt2*submis['sigma_clipped'])
score = submis['score'].mean()
print(score)

In [None]:
# Turn the Patient_Week index back into a regular column so it can be selected for the submission.
submis=submis.reset_index()
submis

In [None]:
# Keep the three submission columns, rename the prediction column to FVC and
# write submission.csv. The previous no-op self-assignment of the FVC column and
# the intermediate copy are dropped; rename() already returns a new frame.
submis_final = (
    submis[['Patient_Week', 'FVC_pred', 'Confidence']]
    .rename(columns={"FVC_pred": "FVC"})
)
submis_final.to_csv('submission.csv', index=False)
submis_final