# Pulmonary Fibrosis EDA 🏥💊

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport

# 1. Data

In [None]:
# Load the three competition tables and preview the first rows of each one.
train=pd.read_csv('/kaggle/input/osic-pulmonary-fibrosis-progression/train.csv')
print('Train Data:',train.head(),sep='\n')

test=pd.read_csv('/kaggle/input/osic-pulmonary-fibrosis-progression/test.csv')
print('\n\nTest Data:',test.head(),sep='\n')

sub=pd.read_csv('/kaggle/input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
print('\n\nSubmission File:',sub.head(),sep='\n')

* Some patients had their FVC measured before the CT scan (these readings have negative values for Weeks).

In [None]:
# Build the pandas-profiling report for the training table (progress bar disabled);
# as the last expression of the cell it is shown as the cell output.
ProfileReport(train,progress_bar=False)

In [None]:
# Build the pandas-profiling report for the test table (progress bar disabled);
# as the last expression of the cell it is shown as the cell output.
ProfileReport(test,progress_bar=False)

In [None]:
# Build the pandas-profiling report for the sample submission (progress bar disabled);
# as the last expression of the cell it is shown as the cell output.
ProfileReport(sub,progress_bar=False)

# 2. EDA with Visualization

## # Patients Readings 😷

In [None]:
print('No of unique patients:',len(train.Patient.unique()))

# Number of non-null Weeks entries per patient = number of FVC readings per patient.
readings=train.groupby('Patient').Weeks.count()
print('Min no. of readings for a patient:', readings.min())
print('Max no. of readings for a patient:', readings.max())

fig=plt.figure(figsize=(15,5))
# x and y are passed by keyword: current seaborn releases no longer accept them positionally.
sns.barplot(x=readings.index,y=readings.values,color='#7AC8BE')
plt.title('Number of Readings per Patient',size=15)
plt.xlabel('Patient',size=12)
plt.ylabel('# Readings',size=12)
# One tick per patient would be unreadable, so the x ticks are hidden.
plt.xticks([])

## # Patients Details 👴

In [None]:
#Age
# Range and distribution of the Age column (one value per reading, not per patient).
age=train['Age']
print('Minimum aged patient:',min(age))
print('Maximum aged patient:',max(age))

fig,ax=plt.subplots(figsize=(10,5))
sns.distplot(age,ax=ax)
ax.set_title('Age Distribution',size=15)
ax.set_xlabel('Age',size=12)

In [None]:
#Sex
# Take one value per patient so that patients with many readings are not counted repeatedly.
sex=train.groupby('Patient').Sex.first()
sex_counts=sex.value_counts()
# Look the counts up by label: value_counts() is ordered by frequency, so positional
# access would silently swap the two numbers if female patients were the majority.
print('Male Patients:',sex_counts.get('Male',0))
print('Female Patients:',sex_counts.get('Female',0))

fig=plt.figure(figsize=(5,5))
# Pass the data by keyword; newer seaborn treats the first positional argument as `data`.
sns.countplot(x=sex)
plt.title('Sex Distribution',size=15)
plt.ylabel('# Patients',size=12)
plt.xlabel('Sex',size=12)

In [None]:
#Smoking status
# Take one value per patient so that patients with many readings are not counted repeatedly.
smoke=train.groupby('Patient').SmokingStatus.first()
smoke_counts=smoke.value_counts()
# Look the counts up by label: value_counts() is ordered by frequency, so positional
# access only prints the right number if the categories happen to be in the assumed order.
print('Ex-smokers:',smoke_counts.get('Ex-smoker',0))
print('Patients who never smoked:',smoke_counts.get('Never smoked',0))
print('Patients who currently smoke:',smoke_counts.get('Currently smokes',0))

fig=plt.figure(figsize=(5,5))
# Pass the data by keyword; newer seaborn treats the first positional argument as `data`.
sns.countplot(x=smoke)
plt.title('Smoking Status',size=15)
plt.ylabel('# Patients',size=12)
plt.xlabel('Status',size=12)

### * Forced vital capacity (FVC) is the amount of air that can be forcibly exhaled from your lungs after taking the deepest breath possible. The recorded lung capacity in ml.

In [None]:
#FVC value
# Range and distribution of the recorded FVC values.
fvc=train['FVC']
print('Maximum FVC value:',max(fvc))
print('Minimum FVC value:',min(fvc))

fig,ax=plt.subplots(figsize=(10,5))
sns.distplot(fvc,ax=ax)
ax.set_title('FVC Value Distribution',size=15)
ax.set_xlabel('FVC Value',size=12)

### * Percent - a computed field which approximates the patient's FVC as a percent of the typical FVC for a person of similar characteristics.

In [None]:
#Percent
# Range and distribution of the Percent column.
percent=train['Percent']
print('Maximum Percentage:',max(percent))
print('Minimum Percentage:',min(percent))

fig,ax=plt.subplots(figsize=(10,5))
sns.distplot(percent,ax=ax)
ax.set_title('Percentage Distribution',size=15)
ax.set_xlabel('Percent',size=12)

In [None]:
#Scatterplot to check correlations
# One panel per candidate feature, each plotted against FVC and coloured by Sex.
a=train[['Age','SmokingStatus','Percent']]
fig,axes=plt.subplots(1,3,figsize=(15,5))
for ax,column in zip(axes,a.columns):
    sns.scatterplot(x=a[column],y=train['FVC'],hue=train['Sex'],palette=['blue','red'],ax=ax)
plt.tight_layout()
plt.show()

* Age and Smoking Status have no correlation with the FVC value.
* Percent and FVC are highly correlated

# 3. Dicom Data 📁

-References: 
* https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial
* https://www.kaggle.com/allunia/pulmonary-dicom-preprocessing

Thanks to Guido Zuidhof (@gzuidhof) and Laura Fink (@allunia) for these really insightful notebooks. 

## # Reading the Metadata 🗃

In [None]:
import pydicom as dicom
import cv2

data_dir='../input/osic-pulmonary-fibrosis-progression/train'
# os.listdir returns entries in arbitrary order; sorting makes patients[0], patients[1:2], ...
# refer to the same patient folders on every run.
patients=sorted(os.listdir(data_dir))
# The first CSV column becomes the index so rows can be looked up with labels_df.loc[patient, ...].
labels_df=pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv',index_col=0)
labels_df.head()

In [None]:
#Viewing the metadata of the dicom file

for patient in patients[:1]:
    path=data_dir+'/'+patient
    # dcmread is the supported pydicom reader; read_file is a deprecated alias of it.
    slices=[dicom.dcmread(path + '/' + s) for s in os.listdir(path)]
    # Order the slices by the third component of ImagePositionPatient.
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    print('No. of scans:',len(slices))
    print('Height and width of the scan:',slices[0].pixel_array.shape)
    print('\nMetadata of the Dicom File:')
    print(slices[1])

###     From the above data we can see that:
* This patient has 258 images of his/her ct scan
* Each image is a 512x512 pixel image

In [None]:
#Viewing the ct scan size for 5 different patients
c=0
for patient in patients:
    # Folders without a row in the label table are skipped.
    if patient not in labels_df.index:
        continue
    try:
        path=data_dir+'/'+patient
        # dcmread is the supported pydicom reader; read_file is a deprecated alias of it.
        slices=[dicom.dcmread(path + '/' + s) for s in os.listdir(path)]
        slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
        print(len(slices),slices[0].pixel_array.shape)
    except Exception:
        # Best effort: a scan that cannot be read, sorted or decoded is skipped.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit through.
        continue
    c+=1
    if c==5:
        break

* Each patient has the same ct scan size of 512x512
* But the no. of scans for each patient is different
* So we need to resize each image to the same size to feed into our model

In [None]:
# Number of files in each patient folder; only the counts are needed, so no DICOM file is opened.
scan_counts=[len(os.listdir(data_dir+'/'+patient)) for patient in patients]
# min()/max() over the real counts replace the 9999 / 0 sentinels; 0 is reported if there are no folders.
min_s=min(scan_counts,default=0)
max_s=max(scan_counts,default=0)
print('Minimum number of scans for any patient:',min_s)
print('Maximum number of scans for any patient:',max_s)

## Visualization📷

In [None]:
#Single Frame 2D Visualization for a patient
import cv2

for patient in patients[1:2]:
    path=data_dir+'/'+patient
    # dcmread is the supported pydicom reader; read_file is a deprecated alias of it.
    slices=[dicom.dcmread(path + '/' + s) for s in os.listdir(path)]
    # Order the slices by the third component of ImagePositionPatient.
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))

    # Show the first slice of the sorted stack.
    fig=plt.figure(figsize=(5,5))
    plt.axis('off')
    plt.title('CT Scan',size=15)
    plt.imshow(slices[0].pixel_array,cmap='gray')
    plt.show()

In [None]:
#2D Visualization of all the scans for a patient
import cv2

img_px_size=150   # every slice is resized to img_px_size x img_px_size pixels
max_slices=56     # the 14 x 4 subplot grid below has room for 56 slices

# Plot all slices of the first patient whose scan fits into the grid, then stop.
for patient in patients:
    path=data_dir+'/'+patient
    # Cheap pre-check on the file count so oversized scans are skipped without reading any DICOM data.
    if len(os.listdir(path))>max_slices:
        continue

    try:
        # dcmread is the supported pydicom reader; read_file is a deprecated alias of it.
        slices=[dicom.dcmread(path + '/' + s) for s in os.listdir(path)]
        slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))

        fig=plt.figure(figsize=(20,40))
        for num,each_slice in enumerate(slices):
            fig.add_subplot(14,4,num+1)
            new_image=cv2.resize(np.array(each_slice.pixel_array),(img_px_size,img_px_size))
            plt.axis('off')
            plt.title(num+1,size=10)
            plt.imshow(new_image,cmap='gray')
        plt.show()
        break
    except Exception:
        # Best effort: if this scan cannot be read or drawn, try the next patient.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit through.
        continue

## # HU (Hounsfield Scale) Values

In [None]:
# Load the scans in given folder path
def load_scan(path):
    """Read every DICOM file in ``path`` and return the slices ordered along z.

    Parameters
    ----------
    path : str
        Folder that contains the DICOM files of one scan.

    Returns
    -------
    list
        The datasets returned by pydicom, sorted by the third component of
        ``ImagePositionPatient``. ``SliceThickness`` of every slice is overwritten
        with the distance between the first two slices.

    Raises
    ------
    IndexError
        If the folder holds fewer than two slices (the spacing cannot be computed).
    """
    # dcmread is the supported pydicom reader; read_file is a deprecated alias of it.
    slices = [dicom.dcmread(path + '/' + s) for s in os.listdir(path)]
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except Exception:
        # Fall back to SliceLocation when the position-based spacing cannot be computed.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit through.
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)

    for s in slices:
        s.SliceThickness = slice_thickness

    return slices

In [None]:
def get_pixels_hu(slices):
    """Stack the pixel arrays of ``slices`` and rescale them to Hounsfield units.

    Each element of ``slices`` must expose ``pixel_array``, ``RescaleSlope`` and
    ``RescaleIntercept``. Returns an int16 array with one entry per slice along
    the first axis.
    """
    # Work on a single int16 volume (values are expected to fit, i.e. stay below ~32k)
    volume = np.stack([scan.pixel_array for scan in slices]).astype(np.int16)

    # Pixels stored as -2000 are treated as outside-of-scan and reset to 0;
    # with the usual intercept of -1024 they end up close to the value of air
    volume[volume == -2000] = 0

    # Apply each slice's own linear rescale: HU = slope * raw + intercept
    for idx, scan in enumerate(slices):
        slope = scan.RescaleSlope

        if slope != 1:
            # Multiply in float64, then truncate back to int16
            scaled = slope * volume[idx].astype(np.float64)
            volume[idx] = scaled.astype(np.int16)

        volume[idx] += np.int16(scan.RescaleIntercept)

    return np.array(volume, dtype=np.int16)

In [None]:
# Convert the first patient's scan to Hounsfield units and look at the value distribution.
first_patient = load_scan(data_dir + '/' + patients[0])
first_patient_pixels = get_pixels_hu(first_patient)
plt.hist(first_patient_pixels.flatten(), bins=80, color='c')
plt.xlabel("Hounsfield Units (HU)")
plt.ylabel("Frequency")
plt.show()

# Show the slice in the middle of the stack. The index is derived from the stack size:
# a fixed index such as 80 raises IndexError for scans with fewer slices.
middle_slice = len(first_patient_pixels) // 2
plt.imshow(first_patient_pixels[middle_slice], cmap=plt.cm.gray)
plt.show()

# 4. Preprocessing 📝

### # Drop Duplicates 

In [None]:
#Drop duplicate values from the training dataset
# A row counts as a duplicate when a later row has the same (Patient, Weeks) pair.
drop=train.loc[train.duplicated(subset=['Patient','Weeks'],keep='last')]
print('No. of rows to be dropped:',len(drop))
train.drop_duplicates(subset=['Patient','Weeks'],keep='last',inplace=True)

* Not many duplicate values are present in the dataset.
* We keep the last value and drop all the previous iterations.

### # Splitting the Submission File and Concatenating

In [None]:
#Split Patient_Week Column from the submission file
# "<patient>_<week>" -> two string columns; assigning the names fails unless exactly two parts result.
patient_week=sub['Patient_Week'].str.split('_',expand=True)
patient_week.columns=['Patient','Weeks']
sub=pd.concat([patient_week,sub[['Confidence','Patient_Week']]],axis=1)
sub.head()

In [None]:
#Merging submission file and test file
# Attach the per-patient columns of the test table (everything except Weeks) to every submission row.
sub=pd.merge(sub,test.drop(columns='Weeks'),on='Patient')
sub.head()

In [None]:
#Introduce a column to indicate the source dataset for the data
#Merge train and test data
train['Dataset']='train'
sub['Dataset']='test'

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True gives the
# same stacked frame with a fresh 0..n-1 index.
data=pd.concat([train,sub],ignore_index=True)
data.head()

In [None]:
#Converting categorical data to numerical data and dropping the categorical columns
#Conversion
# dtype is pinned to uint8: recent pandas versions default to bool dummies, which would turn
# the feature matrix built from these columns into an object array.
data = pd.concat([
    data,
    pd.get_dummies(data.Sex,dtype='uint8'),
    pd.get_dummies(data.SmokingStatus,dtype='uint8')
],axis=1)

#Dropping
data=data.drop(['Sex','SmokingStatus'],axis=1)
# Weeks of the submission rows come from a string split, so the column is cast to integers here.
data['Weeks']=data['Weeks'].astype('int64')
data.head()

In [None]:
#Getting the baseline week, as every patient had their first reading at a different point in time w.r.t. their CT scan
def get_baseline(df):
    """Return a copy of ``df`` with ``min_week`` and ``baselined_week`` columns added.

    ``min_week`` is the smallest ``Weeks`` value of the row's patient and
    ``baselined_week`` is ``Weeks - min_week``. ``df`` itself is not modified.

    NOTE(review): the minimum is taken over all rows of a patient, including rows
    whose Dataset is 'test'. The previous version first set ``min_week`` to 0 for
    test rows, but that value was overwritten by the group-wise minimum right
    away, so the dead assignments were removed. Confirm whether test rows were
    meant to be excluded from the minimum.
    """
    _df=df.copy()
    _df['min_week']=_df.groupby('Patient')['Weeks'].transform('min')
    _df['baselined_week']=_df['Weeks']-_df['min_week']

    return _df


# Ensure Weeks is an integer column, then add min_week / baselined_week.
data=get_baseline(data.astype({'Weeks':'int64'}))
data.head()

In [None]:
def get_baseline_FVC(df):
    """Return a copy of ``df`` with a ``base_FVC`` column and without ``min_week``.

    ``base_FVC`` is the FVC of the patient's first row whose ``Weeks`` equals
    ``min_week``. Patients without such a row get NaN. ``df`` must contain the
    columns ``Patient``, ``Weeks``, ``min_week`` and ``FVC`` and is not modified.
    """
    _df = df.copy()

    # rows recorded in the patient's baseline week; keep only the first one per patient
    base = (
        _df.loc[_df.Weeks == _df.min_week, ['Patient', 'FVC']]
        .drop_duplicates(subset='Patient', keep='first')
        .rename(columns={'FVC': 'base_FVC'})
    )

    # merge the rows containing the base_FVC on the original _df
    _df = _df.merge(base, on = 'Patient', how = 'left')
    # drop() returns a new frame; the result has to be kept for min_week to actually disappear
    _df = _df.drop(['min_week'], axis = 1)

    return _df

# Attach each patient's baseline FVC to every row.
data=data.pipe(get_baseline_FVC)
data.head()

In [None]:
#Scaling Features
def scaling(series):
    """Min-max scale ``series`` to the range [0, 1].

    A series whose minimum equals its maximum is mapped to 0.0 instead of
    0/0 = NaN, so a constant column cannot inject NaNs into the features.
    """
    lo = series.min()
    span = series.max() - lo
    if span == 0:
        # constant input: every (non-missing) value sits at the lower bound
        return (series - lo).astype('float64')
    return (series - lo) / span

# Min-max scale the numeric model inputs; all other columns are left untouched.
data=data.assign(
    Age=scaling(data['Age']),
    Percent=scaling(data['Percent']),
    baselined_week=scaling(data['baselined_week']),
    base_FVC=scaling(data['base_FVC']),
)
data.head()

# 5. Model 

* Reference: https://www.kaggle.com/reighns/higher-lb-score-by-tuning-mloss-around-6-811

Thanks to Hongnan Gao (@reighns) for his notebook

In [None]:
import tensorflow as tf
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Lambda, Input
from tensorflow.keras.models import Sequential, Model

# create constants for the loss function
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")

# define competition metric
def score(y_true, y_pred):
    """Calculate the competition metric (lower is better).

    ``y_pred`` holds three columns per row; column 1 is used as the FVC prediction
    and ``column 2 - column 0`` as the uncertainty sigma. ``y_true`` is indexed as
    ``y_true[:, 0]``. sigma is clipped from below at C1 and the absolute error from
    above at C2; the batch mean of
    ``sqrt(2) * delta / sigma + log(sqrt(2) * sigma)`` is returned.
    """
    # tf.cast returns a new tensor, so the results must be kept (they used to be discarded)
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    # both operands are one value per row: the target column and the middle prediction
    delta = tf.abs(y_true[:, 0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt( tf.dtypes.cast(2, dtype = tf.float32) )
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    # tf.reduce_mean instead of K.mean: no dependency on a Keras backend import made in a later cell
    return tf.reduce_mean(metric)

# define pinball loss
def qloss(y_true, y_pred):
    """Calculate the pinball (quantile) loss averaged over all rows and quantiles.

    The three columns of ``y_pred`` are scored against the quantile levels in ``qs``;
    ``y_true`` is broadcast against them.
    """
    # IMPORTANT: define quantiles, feel free to change here!
    qs = [0.2, 0.50, 0.8]
    q = tf.constant(np.array([qs]), dtype = tf.float32)
    e = y_true - y_pred
    # q * e applies where e >= 0, (q - 1) * e where e < 0
    v = tf.maximum(q * e, (q-1) * e)
    # tf.reduce_mean instead of K.mean: no dependency on a Keras backend import made in a later cell
    return tf.reduce_mean(v)

# combine competition metric and pinball loss to a joint loss function
def mloss(_lambda):
    """Return a loss that blends qloss (weight ``_lambda``) with score (weight ``1 - _lambda``)."""
    def loss(y_true, y_pred):
        pinball_term = _lambda * qloss(y_true, y_pred)
        metric_term = (1 - _lambda) * score(y_true, y_pred)
        return pinball_term + metric_term
    return loss

In [None]:
def make_model(nh):
    """Build and compile the dense network used for quantile regression.

    Parameters
    ----------
    nh : int
        Number of input features.

    Returns
    -------
    A compiled Keras model with three outputs per row. The outputs are
    ``p1 + cumsum(p2)``: the first head gives a base value per column and the
    running sum of the second head is added on top of it.
    """
    z = Input((nh,), name="Patient")
    x = Dense(100, activation="elu", name="d1")(z)
    x = Dense(100, activation="elu", name="d3")(x)
    p1 = Dense(3, activation="linear", name="p1")(x)
    p2 = Dense(3, activation="elu", name="p2")(x)
    # `heads` is [p1, p2]; a separate argument name avoids shadowing the layer variable `x`
    preds = Lambda(lambda heads: heads[0] + tf.cumsum(heads[1], axis=1),
                     name="preds")([p1, p2])

    model = Model(z,preds,name="CNN")
    # `learning_rate` replaces the deprecated `lr` alias.
    # NOTE(review): `decay` and `epsilon=None` are kept unchanged; newer Keras optimizers may reject them.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)
    model.compile(loss=mloss(0.8), optimizer=optimizer, metrics=[score])
    return model

In [None]:
## GET TRAINING DATA AND TARGET VALUE

# get back original data split
features_list=['baselined_week', 'Percent', 'Age', 'base_FVC', 'Male', 'Female', 'Ex-smoker', 'Never smoked', 'Currently smokes']
# .copy() makes both frames independent of `data`, so adding columns to them later
# does not trigger SettingWithCopyWarning
train=data.loc[data.Dataset == 'train'].copy()
sub=data.loc[data.Dataset == 'test'].copy()

# get target value
y=train['FVC'].values.astype(float)

# get training & test data
# explicit float cast: guarantees a numeric matrix even if the dummy columns are stored as bool/uint8
X_train=train[features_list].values.astype(float)
X_test=sub[features_list].values.astype(float)
# despite its name, n_rows is the number of feature columns (the model's input width)
n_rows=X_train.shape[1]

# instantiate target arrays
train_preds=np.zeros((X_train.shape[0], 3))
test_preds=np.zeros((X_test.shape[0], 3))

In [None]:
model=make_model(n_rows)
# summary() prints the table itself and returns None; wrapping it in print() added a stray "None" line
model.summary()

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from keras import backend as K

# `min_delta` is the current name of the argument formerly called `epsilon`
reduce_lr_loss=tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.4,patience=150,verbose=0,min_delta=1e-4,mode='min')

NFOLD = 6
# NOTE(review): KFold splits individual rows, so readings of one patient can end up in both
# the training and the validation part of a fold. GroupKFold (already imported) grouped by
# Patient would prevent that - confirm which behaviour is intended.
kf = KFold(n_splits=NFOLD)
OOF_val_score=[]

# Start from empty buffers so that re-running this cell does not add to earlier predictions
train_preds=np.zeros((X_train.shape[0], 3))
test_preds=np.zeros((X_test.shape[0], 3))

BATCH_SIZE=128
EPOCHS = 800
for cnt, (tr_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    print(f"FOLD {cnt}")
    # a fresh model per fold; `history` keeps only the training curves of the last fold
    model=make_model(n_rows)
    history=model.fit(X_train[tr_idx], y[tr_idx], batch_size=BATCH_SIZE, epochs=EPOCHS,
            validation_data=(X_train[val_idx], y[val_idx]), verbose=0, callbacks=[reduce_lr_loss])
    print("train", model.evaluate(X_train[tr_idx], y[tr_idx], verbose=0, batch_size=BATCH_SIZE))
    # evaluate the validation part once and reuse the result for printing and for the OOF score
    val_metrics=model.evaluate(X_train[val_idx], y[val_idx], verbose=0, batch_size=BATCH_SIZE, return_dict=True)
    print("val", [val_metrics['loss'], val_metrics['score']])
    print("predict val...")
    train_preds[val_idx]=model.predict(X_train[val_idx],batch_size=BATCH_SIZE, verbose=0)

    # append OOF evaluation to calculate OOF_Score
    OOF_val_score.append(val_metrics['score'])

    print("predict test...")
    # test predictions are the average over the NFOLD fold models
    test_preds+=model.predict(X_test, batch_size=BATCH_SIZE, verbose=0)/NFOLD

# 6. Evaluation and Submission ✅🏁

In [None]:
# fetch results from history (`history` holds the curves of the last fit call)
# Distinct names are used on purpose: assigning to `score` would replace the metric
# function of the same name and break any later call to make_model().
train_score = history.history['score']
val_score = history.history['val_score']

train_loss = history.history['loss']
val_loss = history.history['val_loss']

# one x value per recorded epoch
epochs_range = range(len(train_loss))

# create subplots
plt.figure(figsize = (20,5))
plt.subplot(1, 2, 1)
# the curves show the competition metric returned by score(), not an accuracy
plt.plot(epochs_range, train_score, label = 'Training Score')
plt.plot(epochs_range, val_score, label = 'Validation Score')
# limit y-values for better zoom-scale. With sigma clipped at 70 the metric cannot drop
# below log(70 * sqrt(2)), i.e. roughly 4.6; lower is better
# plt.ylim(0.8 * np.mean(val_score), 1.2 * np.mean(val_score))
plt.legend(loc = 'lower right')
plt.title('Training and Validation Score')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_loss, label = 'Training Loss')
plt.plot(epochs_range, val_loss, label = 'Validation Loss')
# limit y-values for better zoom-scale
plt.ylim(0.3 * np.mean(val_loss), 1.8 * np.mean(val_loss))

plt.legend(loc = 'upper right')
plt.title('Training and Validation Loss')
plt.show()


In [None]:
# Mean out-of-fold validation score across all folds
np.asarray(OOF_val_score).mean()

In [None]:
## FIND OPTIMIZED STANDARD-DEVIATION
# sigma_opt: mean absolute error of the middle out-of-fold prediction
sigma_opt = mean_absolute_error(y_true=y, y_pred=train_preds[:,1])
# sigma_uncertain: spread between the upper and the lower out-of-fold prediction
sigma_uncertain = np.subtract(train_preds[:,2], train_preds[:,0])
sigma_mean = sigma_uncertain.mean()
print(sigma_opt, sigma_mean)

In [None]:
## PREPARE SUBMISSION FILE WITH OUR PREDICTIONS
# assign() returns a new frame, so no column is written into a slice of `data`
# (which raised SettingWithCopyWarning)
sub = sub.assign(
    FVC1=test_preds[:, 1],
    Confidence1=test_preds[:, 2] - test_preds[:, 0],
)

# get rid of unused data and show some non-empty data
submission = sub[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()
submission.loc[submission.FVC1.notnull()].head(10)

In [None]:
# Rows for which the model produced a prediction
has_pred = submission['FVC1'].notnull()
submission.loc[has_pred,'FVC'] = submission.loc[has_pred,'FVC1']

# A mean predicted spread below 70 is replaced by the single value sigma_opt for all rows;
# otherwise the per-row spread is used.
if sigma_mean < 70:
    submission['Confidence'] = sigma_opt
else:
    submission.loc[has_pred,'Confidence'] = submission.loc[has_pred,'Confidence1']

In [None]:
# Summary statistics of the numeric submission columns, one row per column
submission.describe().transpose()

In [None]:
org_test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

# For every (patient, week) listed in the test file the measured FVC is written into the
# submission, together with a fixed confidence of 70.
for patient_id, week, fvc in zip(org_test.Patient, org_test.Weeks, org_test.FVC):
    known = submission['Patient_Week']==patient_id+'_'+str(week)
    submission.loc[known, 'FVC'] = fvc
    submission.loc[known, 'Confidence'] = 70

In [None]:
# Write only the three columns of the submission format, without the index
submission.loc[:, ["Patient_Week","FVC","Confidence"]].to_csv("submission.csv", index=False)