# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, Subset

from sklearn.model_selection import GroupKFold

In [None]:
# --- Cross-validation / data loading --------------------------------------
num_kfolds = 5      # number of patient-grouped folds (one model is trained per fold)
batch_size = 32     # mini-batch size used by every DataLoader

# --- Optimisation ----------------------------------------------------------
learning_rate = 0.003   # step size handed to the Adam optimizer
num_epoch = 10          # passes over the training subset per fold
es_patience = 20        # NOTE(review): not referenced by the visible code

# --- Model output ----------------------------------------------------------
# Quantile levels predicted by the network, in output-column order.
quantiles = (0.2, 0.5, 0.8)
model_name = 'descartes'  # NOTE(review): not referenced by the visible code

# Create Dataset

In [None]:
class MedicalDataset(Dataset):
    """Tabular dataset built from the OSIC train/test/sample-submission CSVs.

    The three CSV files are stacked into a single frame and tagged in a
    ``WHERE`` column: rows of ``train.csv`` as ``'train'``, rows of
    ``test.csv`` as ``'val'`` and the rows expanded from
    ``sample_submission.csv`` as ``'test'``.  Features are engineered on the
    stacked frame, and only the rows whose tag equals ``mode`` are kept.

    Attributes:
        mode: the ``WHERE`` tag selected for this instance.
        transform: optional callable applied to every sample dict.
        FE: names of the feature columns, in the order they are emitted
            (one-hot columns for ``Sex`` and ``SmokingStatus`` followed by
            ``'age'``, ``'percent'``, ``'week'``, ``'BASE'``).
        raw: DataFrame holding the selected rows with a fresh integer index.
    """

    def __init__(self, mode, transform=None,
                 data_dir='../input/osic-pulmonary-fibrosis-progression'):
        """Load the CSV files, engineer the features and keep one split.

        Args:
            mode: ``WHERE`` tag of the rows to keep (``'train'``, ``'val'``
                or ``'test'`` are the tags assigned below).
            transform: optional callable taking and returning a sample dict.
            data_dir: directory containing ``train.csv``, ``test.csv`` and
                ``sample_submission.csv``.
        """
        self.transform = transform
        self.mode = mode

        train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

        # Some patients have more than one record for the same week.
        # keep=False discards *every* record of such a (Patient, Weeks) pair.
        train = train.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

        # 'Patient_Week' has the form '<patient id>_<week>'; split it back
        # into its two components.
        sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
        sub['Patient'] = sub['Patient_Week'].apply(lambda x: x.split('_')[0])
        sub['Weeks'] = sub['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
        sub = sub[['Patient', 'Weeks', 'Confidence', 'Patient_Week']]

        # Attach the per-patient columns of test.csv to every submission row;
        # the 'Weeks' column of test.csv is dropped so the submission week wins.
        chunk = pd.read_csv(os.path.join(data_dir, 'test.csv'))
        sub = sub.merge(chunk.drop('Weeks', axis=1), on='Patient')

        train['WHERE'] = 'train'
        chunk['WHERE'] = 'val'
        sub['WHERE'] = 'test'

        # Stack train, test and submission rows.  pd.concat replaces
        # DataFrame.append, which was removed in pandas 2.0.
        data = pd.concat([train, chunk, sub])

        # Earliest observed week per patient.  Submission rows are masked
        # with NaN first so that they do not take part in the minimum.  The
        # cast to float lets the column hold NaN without a dtype change.
        data['min_week'] = data['Weeks'].astype(float)
        data.loc[data.WHERE == 'test', 'min_week'] = np.nan
        data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

        # FVC recorded at that earliest week; the first matching row per
        # patient is used when several rows match.
        init_exam = (
            data.loc[data['Weeks'] == data['min_week'], ['Patient', 'FVC']]
            .drop_duplicates(subset='Patient', keep='first')
            .rename(columns={'FVC': 'min_FVC'})
        )

        # Weeks elapsed since the patient's earliest observed week.
        data = data.merge(init_exam, on='Patient', how='left')
        data['base_week'] = data['Weeks'] - data['min_week']

        # One-hot encode 'Sex' and 'SmokingStatus': one 0/1 column per
        # distinct value, named after the value itself.
        self.FE = []
        for col in ('Sex', 'SmokingStatus'):
            for mod in data[col].unique():
                self.FE.append(mod)
                data[mod] = (data[col] == mod).astype(int)

        # Min-max scale the numeric features over the whole stacked frame.
        data['age'] = self._min_max(data['Age'])
        data['BASE'] = self._min_max(data['min_FVC'])
        data['week'] = self._min_max(data['base_week'])
        data['percent'] = self._min_max(data['Percent'])

        self.FE += ['age', 'percent', 'week', 'BASE']
        self.raw = data.loc[data.WHERE == mode].reset_index()

    @staticmethod
    def _min_max(column):
        """Linearly rescale a Series so its minimum maps to 0 and maximum to 1."""
        lowest = column.min()
        return (column - lowest) / (column.max() - lowest)

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.raw)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict.

        The dict has the keys ``'patient_id'``, ``'features'`` (values of
        the ``FE`` columns) and ``'target'`` (the ``FVC`` column).  When a
        transform was supplied, its return value is returned instead.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = {
            'patient_id': self.raw['Patient'].iloc[idx],
            'features': self.raw[self.FE].iloc[idx].values,
            'target': self.raw['FVC'].iloc[idx]
        }
        if self.transform is not None:
            # Must go through self: a bare `transform` is an undefined name.
            sample = self.transform(sample)

        return sample

    def group_kfold(self, n_splits):
        """Yield ``(train_subset, val_subset)`` pairs, one per fold.

        Folds are produced by ``GroupKFold`` with the ``Patient`` column as
        the grouping key, and each side is wrapped in a ``Subset`` of this
        dataset.
        """
        gkf = GroupKFold(n_splits=n_splits)
        groups = self.raw['Patient']
        for train_idx, val_idx in gkf.split(self.raw, self.raw, groups):
            yield Subset(self, train_idx), Subset(self, val_idx)
        

# Create Model (Quantile Regression Neural Network)

In [None]:
class QrnnModel(nn.Module):
    """Small fully connected network emitting one output per quantile.

    Layout: Linear -> leaky ReLU -> BatchNorm1d -> Linear -> leaky ReLU ->
    Linear, with two hidden layers of width 100.

    Args:
        in_tabular_features: number of input features per sample.
        out_quantiles: number of output columns.
    """

    def __init__(self, in_tabular_features=9, out_quantiles=3):
        super().__init__()
        hidden_units = 100
        # Layers are created in the same order as they are applied, except
        # that bn1 sits between the two hidden linear layers.
        self.fc1 = nn.Linear(in_features=in_tabular_features, out_features=hidden_units)
        self.bn1 = nn.BatchNorm1d(hidden_units)
        self.fc2 = nn.Linear(in_features=hidden_units, out_features=hidden_units)
        self.fc3 = nn.Linear(in_features=hidden_units, out_features=out_quantiles)

    def forward(self, x):
        """Map a batch of feature rows to a batch of quantile predictions."""
        # Batch normalisation is applied *after* the first activation.
        hidden = self.bn1(F.leaky_relu(self.fc1(x)))
        hidden = F.leaky_relu(self.fc2(hidden))
        return self.fc3(hidden)
    

def quantile_loss(preds, target, quantiles):
    """Return the pinball (quantile) loss summed over quantiles, averaged over rows.

    Args:
        preds: 2-D tensor whose column ``i`` is the prediction for
            ``quantiles[i]``.
        target: tensor of true values that is subtracted from each column of
            ``preds`` (assumed to be 1-D with one entry per row — TODO confirm
            against callers).
        quantiles: iterable of quantile levels, in column order.

    Returns:
        A scalar tensor.
    """
    per_quantile = [
        # Under-prediction is weighted by tau, over-prediction by (1 - tau).
        torch.max((tau - 1) * (target - preds[:, col]), tau * (target - preds[:, col]))
        for col, tau in enumerate(quantiles)
    ]
    pinball = torch.stack(per_quantile, dim=1)
    return pinball.sum(dim=1).mean()

# Train Model

In [None]:
# Train one QrnnModel per patient-grouped fold and collect them in `models`.

# The device choice does not depend on the fold, so it is made once.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

models = []
data = MedicalDataset(mode='train')
folds = data.group_kfold(num_kfolds)

for fold, (trainset, valset) in enumerate(folds):

    # Only the training loader shuffles; validation order stays fixed.
    dataloaders = {
        phase: DataLoader(subset, batch_size=batch_size,
                          shuffle=(phase == 'train'), num_workers=2)
        for phase, subset in (('train', trainset), ('val', valset))
    }

    model = QrnnModel().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epoch):
        for phase in ('train', 'val'):
            is_training = phase == 'train'
            # train(True) enables training mode, train(False) is eval mode.
            model.train(is_training)

            for batch in dataloaders[phase]:
                inputs = batch['features'].float().to(device)
                targets = batch['target'].to(device)

                optimizer.zero_grad()

                # Gradients are tracked only while training.
                with torch.set_grad_enabled(is_training):
                    preds = model(inputs)
                    loss = quantile_loss(preds, targets, quantiles)

                    if is_training:
                        loss.backward()
                        optimizer.step()

    # The validation phase runs last, so the stored model is in eval mode
    # whenever num_epoch > 0.
    models.append(model)

# Predict Test Data

In [None]:
# Average the quantile predictions of all fold models over the 'test' rows
# and write them out as submission.csv.
data = MedicalDataset(mode='test')
avg_preds = np.zeros((len(data), len(quantiles)))

# The loader does not depend on the model and never shuffles, so a single
# instance is reused for every model.
dataloader = DataLoader(data, batch_size=batch_size, shuffle=False, num_workers=2)

for model in models:
    # Feed inputs on whatever device the model's parameters live on; a model
    # left on the GPU would otherwise be fed CPU tensors and raise.
    model_device = next(model.parameters()).device
    # Put batch normalisation into inference mode (running statistics)
    # explicitly instead of relying on the state the model was left in.
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['features'].float().to(model_device)
            # Bring results back to the CPU so they can be converted to NumPy.
            preds.append(model(inputs).cpu())

    avg_preds += torch.cat(preds, dim=0).numpy()

avg_preds /= len(models)

# Columns are temporarily named after the quantile levels themselves.
df = pd.DataFrame(data=avg_preds, columns=list(quantiles))
df['Patient_Week'] = data.raw['Patient_Week']
# The middle quantile is the point prediction; the spread between the outer
# quantiles is reported as the confidence value.
df['FVC'] = df[quantiles[1]]
df['Confidence'] = df[quantiles[2]] - df[quantiles[0]]
df = df.drop(columns=list(quantiles))
df.to_csv('submission.csv', index=False)

In [None]:
# Display the first rows of the submission frame as a quick visual check.
df.head()