# Setup

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Use the absolute mount path so the cell stays idempotent: with a relative
# path a second run fails because the working directory has already moved.
os.chdir('/content/drive/MyDrive/NLP')
DATADIR = "data/"
SAVEDIR = "data/working"

Mounted at /content/drive


In [None]:
# !pip install transformers=="4.2.2"
# !pip install pytorch-lightning=="1.5.8"

In [None]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
from torch.utils.data.dataset import Dataset

# Preprocessing

In [None]:
def erase_punc(string):
    """Expand the dash-delimited shorthands found in feature texts.

    The substitutions run in order: ``-I-`` becomes `` 1 ``, ``-OR-`` becomes
    `` or ``, and every remaining hyphen becomes a single space.
    """
    substitutions = (
        (r"-I-", " 1 "),
        (r"-OR-", " or "),
        (r"-", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string

def erase_mark(string):
    """Strip single quotes and turn every ``", "`` separator into ``";"``.

    Double quotes are intentionally left in place.
    """
    without_quotes = re.sub(r"'", "", string)
    return re.sub(r", ", ";", without_quotes)

def data_preparation(feature_path, pn_path, data_path, train):
    """Load the three CSV files and merge them into one frame.

    Args:
        feature_path: CSV with the feature definitions.
        pn_path: CSV with the patient notes.
        data_path: CSV with the annotated rows.
        train: when truthy, the stringified ``annotation`` and ``location``
            columns are parsed into Python lists.

    Returns:
        ``(full_dat, feature)``: the merged frame and the feature frame, both
        with ``feature_text`` passed through ``erase_punc``.
    """
    feature = pd.read_csv(feature_path)
    patient = pd.read_csv(pn_path)
    dat = pd.read_csv(data_path)

    # Inner joins: rows -> their patient note -> their feature description.
    with_notes = dat.merge(patient, how="inner", on=["case_num", "pn_num"])
    full_dat = with_notes.merge(feature, how="inner", on=["case_num", "feature_num"])

    for frame in (full_dat, feature):
        frame["feature_text"] = frame["feature_text"].apply(erase_punc)

    if not train:
        return full_dat, feature

    def parse_annotation(raw):
        # "['a', 'b']" -> "[a;b]" -> ["a", "b"]; an empty list string gives [].
        pieces = erase_mark(raw)[1:-1].split(";")
        return [] if pieces[0] == "" else pieces

    def parse_location(raw):
        # "['1 5', '7 9;12 15']" -> [[1, 5], [7, 9], [12, 15]]; "[]" gives [].
        spans = [
            [int(token) for token in chunk.split()]
            for chunk in erase_mark(raw)[1:-1].split(";")
        ]
        return [] if len(spans[0]) == 0 else spans

    full_dat["annotation"] = full_dat["annotation"].apply(parse_annotation)
    full_dat["location"] = full_dat["location"].apply(parse_location)

    return full_dat, feature

The three CSV files (train, patient notes, and features) are parsed and merged into a single pandas DataFrame. Here is the result.

# Training Setup

In [None]:
# DATADIR already ends with "/", so plain concatenation yields "data//train.csv";
# os.path.join produces a clean path regardless of the trailing separator.
trainpath = os.path.join(DATADIR, 'train.csv')
pnpath = os.path.join(DATADIR, 'patient_notes.csv')
featurepath = os.path.join(DATADIR, 'features.csv')
df_data, df_feature = data_preparation(featurepath, pnpath, trainpath, train=True)

In [None]:
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import numpy as np
import scipy
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
import argparse
import os
from pathlib import Path
from torch.optim import SGD, Adam
import pytorch_lightning as pl
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from transformers import RobertaTokenizerFast, RobertaModel
from transformers import pipeline
from torchmetrics import Accuracy
from datetime import datetime 
from pathlib import Path
from pytorch_lightning import loggers as pl_loggers
import time
from argparse import Namespace
import json
import shutil
logger = logging.getLogger(__name__)



In [None]:
class BaseModel(pl.LightningModule):
    """Shared LightningModule scaffolding for the token-level feature taggers.

    Subclasses provide ``_load_model`` (the wrapped network, whose forward
    returns ``(loss, predicted_labels)``), ``batch2input`` (batch -> forward
    kwargs, including a ``labels`` entry) and ``get_dataloader``.
    """

    def __init__(
        self,
        **config_kwargs
    ):
        """Store hyperparameters, build the wrapped model and the accuracy metric."""
        logger.info("Initilazing BaseModel")
        super().__init__()
        self.save_hyperparameters() #save hyperparameters to checkpoint
        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        self.model = self._load_model()

        self.accuracy = Accuracy()

    def _load_model(self):
        """Return the network to train; must be implemented by subclasses."""
        raise NotImplementedError

    def forward(self, **inputs):
        """Delegate to the wrapped model."""
        return self.model(**inputs)

    def batch2input(self, batch):
        """Convert a collated batch into forward kwargs; implemented by subclasses."""
        raise NotImplementedError

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        """Return the DataLoader for ``type_path``; implemented by subclasses."""
        raise NotImplementedError

    def _shared_step(self, batch):
        """Run the forward pass and compute the masked accuracy for one batch.

        Accuracy is computed only over entries where either the label or the
        prediction is positive; if the batch has no positive label at all it
        falls back to accuracy over every entry.

        Returns:
            ``(loss, acc)``.
        """
        inputs = self.batch2input(batch)
        labels = inputs['labels']
        loss, pred_labels = self(**inputs)
        labels_cls = (labels > 0.).int()
        valid_idx = (labels_cls == 1) | (pred_labels == 1)
        if len(labels_cls[labels_cls == 1]) > 0:
            acc = self.accuracy(pred_labels[valid_idx].view(-1), labels_cls[valid_idx].view(-1))
        else:
            acc = self.accuracy(pred_labels.view(-1), labels.view(-1).int())
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log ``train_loss``/``train_acc`` and return the loss for backprop."""
        loss, acc = self._shared_step(batch)

        self.log('train_loss', loss, prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)

        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss``/``val_acc`` for one validation batch."""
        loss, acc = self._shared_step(batch)
        self.log('val_loss', loss)
        self.log('val_acc', acc)

    def test_step(self, batch, batch_nb):
        """Log ``test_loss``/``test_acc`` for one test batch."""
        loss, acc = self._shared_step(batch)
        self.log('test_loss', loss)
        self.log('test_acc', acc)

    def configure_optimizers(self):
        """Create an Adam optimizer over the wrapped model's parameters."""
        model = self.model
        # optimizer = SGD(model.parameters(), lr=self.hparams.learning_rate)
        optimizer = Adam(model.parameters(), lr=self.hparams.learning_rate)

        self.opt = optimizer
        return [optimizer]

    def setup(self, stage):
        """Build the training loader once when fitting starts."""
        if stage == "fit":
            self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)

    def train_dataloader(self):
        """Return the loader created in ``setup``."""
        return self.train_loader

    def val_dataloader(self):
        """Return the (unshuffled) dev loader."""
        return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)

    def test_dataloader(self):
        """Return the (unshuffled) test loader."""
        return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)

    @staticmethod
    def add_generic_args(parser, root_dir) -> None:
        """Register the model-independent command-line options on ``parser``."""
        parser.add_argument(
            "--max_epochs",
            default=10,
            type=int,
            help="The number of epochs to train your model.",
        )
        ############################################################
        ## WARNING: set --gpus 0 if you do not have access to GPUS #
        ############################################################
        parser.add_argument(
            "--gpus",
            default=1,
            type=int,
            help="The number of GPUs allocated for this, it is by default 1. Set to 0 for no GPU.",
        )
        parser.add_argument(
            "--output_dir",
            default=None,
            type=str,
            required=True,
            help="The output directory where the model predictions and checkpoints will be written.",
        )
        # NOTE: store_true combined with default=True means this flag can never be switched off.
        parser.add_argument("--do_train", action="store_true", default=True, help="Whether to run training.")
        parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
        parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
        parser.add_argument(
            "--data_dir",
            default="./",
            type=str,
            help="The input data dir. Should contain the training files.",
        )
        parser.add_argument("--learning_rate", default=1e-2, type=float, help="The initial learning rate for training.")
        parser.add_argument("--num_workers", default=16, type=int, help="kwarg passed to DataLoader")
        # Alias of --max_epochs (same dest). argparse applies the default of the
        # first action registered for a dest, so the effective default is 10, not 3.
        parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
        parser.add_argument("--train_batch_size", default=32, type=int)
        parser.add_argument("--eval_batch_size", default=32, type=int)
    
def generic_train(
    model: BaseModel,
    args: argparse.Namespace,
    early_stopping_callback=False,
    extra_callbacks=None,
    checkpoint_callback=None,
    logging_callback=None,
    **extra_train_kwargs
):
    """Fit ``model`` with a Lightning ``Trainer`` and optionally evaluate it.

    Args:
        model: module to train; ``model.hparams.output_dir`` is where logs and
            the best checkpoint are written.
        args: parsed options (reads ``output_dir``, ``max_epochs``, ``gpus``,
            ``do_train``, ``do_predict``; the namespace is also handed to
            ``Trainer.from_argparse_args``).
        early_stopping_callback: optional callback appended to the trainer's
            callbacks when provided (``False``/``None`` means none).
        extra_callbacks: optional list of additional callbacks.
        checkpoint_callback: optional checkpoint callback; by default the best
            ``val_acc`` checkpoint is kept.
        logging_callback: optional callback appended when provided.
        **extra_train_kwargs: forwarded to the ``Trainer`` constructor.

    Returns:
        The result of ``trainer.test`` when ``args.do_predict`` is set,
        otherwise the trainer.
    """
    # init model
    odir = Path(model.hparams.output_dir)
    # parents=True: output_dir is typically nested (e.g. "data/working/BioC").
    odir.mkdir(parents=True, exist_ok=True)
    log_dir = Path(os.path.join(model.hparams.output_dir, 'logs'))
    log_dir.mkdir(exist_ok=True)

    # Tensorboard logger
    pl_logger = pl_loggers.TensorBoardLogger(
        save_dir=log_dir,
        version="version_" + datetime.now().strftime("%d-%m-%Y--%H-%M-%S"),
        name="",
        default_hp_metric=True
    )

    # add custom checkpoints
    ckpt_path = os.path.join(
        args.output_dir, pl_logger.version, "checkpoints",
    )
    if checkpoint_callback is None:
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath=ckpt_path, filename="{epoch}-{val_acc:.2f}", monitor="val_acc", mode="max", save_top_k=1, verbose=True
        )

    # Copy the caller's list so it is never mutated, then add the optional
    # callbacks that were previously accepted but silently dropped.
    callbacks = [checkpoint_callback] + list(extra_callbacks or [])
    for optional_callback in (early_stopping_callback, logging_callback):
        if optional_callback is not None and optional_callback is not False:
            callbacks.append(optional_callback)

    train_params = {"max_epochs": args.max_epochs}

    if args.gpus > 1:
        # `distributed_backend` is deprecated since Lightning 1.5 in favour of `strategy`.
        train_params["strategy"] = "ddp"

    train_params.update(extra_train_kwargs)

    trainer = pl.Trainer.from_argparse_args(
        args,
        enable_model_summary=False,
        callbacks=callbacks,
        logger=pl_logger,
        **train_params,
    )

    # The best checkpoint lives in the output directory created above instead
    # of depending on whatever the global SAVEDIR currently holds.
    best_model_target = os.path.join(str(odir), 'best_model.ckpt')

    if args.do_train:
        trainer.fit(model)
        # track model performance under differnt hparams settings in "Hparams" of TensorBoard
        best_score = checkpoint_callback.best_model_score
        if best_score is not None:
            pl_logger.log_hyperparams(params=model.hparams, metrics={'hp_metric': best_score.item()})
        pl_logger.save()

        # save best model to `best_model.ckpt`
        if checkpoint_callback.best_model_path:
            logger.info(f"Copy best model from {checkpoint_callback.best_model_path} to {best_model_target}.")
            shutil.copy(checkpoint_callback.best_model_path, best_model_target)
        else:
            logger.warning("No best checkpoint was recorded; best_model.ckpt was not updated.")

    # Optionally, predict on test set and write to output_dir
    if args.do_predict:
        model = model.load_from_checkpoint(best_model_target)
        return trainer.test(model)

    return trainer

## Model 

In [None]:
class LSTM(torch.nn.Module):
    """
    Token-level multi-label tagger: pretrained encoder -> BiLSTM -> dense ->
    per-token sigmoid over ``feature_num + 1`` classes (the extra class is
    "no feature").
    """
    def __init__(self, model_name, feature_num, hidden_size=256, num_layer=2, dropout=0.3):
        """
        Args:
            model_name: name/path passed to ``AutoModel.from_pretrained``.
            feature_num: number of features (the output has one extra unit).
            hidden_size: hidden size of the BiLSTM and the dense layer.
            num_layer: number of stacked LSTM layers.
            dropout: dropout probability applied before the output layer.
        """
        super(LSTM, self).__init__()
        self.model_name = model_name
        self.bert = AutoModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768 so
        # encoders with a different hidden size work as well.
        encoder_dim = self.bert.config.hidden_size
        self.bilstm = torch.nn.LSTM(encoder_dim, hidden_size,
                       num_layers = num_layer,
                       batch_first=True, bidirectional=True)
        self.dense_layer = torch.nn.Linear(2*hidden_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)
        self.output_layer = torch.nn.Linear(hidden_size, feature_num + 1)
        self.criterion = nn.BCELoss()

    def forward(self, encodings, labels):
        """Return ``(loss, predicted_labels)`` for a batch.

        The loss is binary cross-entropy restricted to entries where the label
        is positive or the model currently predicts positive (prob > 0.5).
        ``predicted_labels`` is an int tensor with the same shape as ``labels``.
        """
        output = self.bert(**encodings)
        output, _ = self.bilstm(output['last_hidden_state'])
        output = self.dense_layer(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.output_layer(output)
        probs = torch.sigmoid(output)
        valid_idx = (labels > 0.) | (probs > 0.5)
        if valid_idx.any():
            loss = self.criterion(probs[valid_idx], labels[valid_idx])
        else:
            # BCELoss over an empty selection is NaN; return a zero that is
            # still attached to the graph so backward() keeps working.
            loss = probs.sum() * 0.0
        predicted_labels = (probs > 0.5).int()

        return loss, predicted_labels

## Dataset

In [None]:
class SST2Dataset(Dataset):
    """
    Patient-note dataset: one item per unique ``pn_num``; notes are tokenized
    and turned into token-level label tensors on the fly in ``collate_fn``.
    """
    def __init__(self, model_name, df_data, df_feature, test=False):
        """
        Args:
            model_name: name/path passed to ``AutoTokenizer.from_pretrained``.
            df_data: merged frame with ``pn_num``, ``pn_history``,
                ``feature_num`` and (unless ``test``) ``location`` columns.
            df_feature: feature frame; its length fixes the label width and
                its index gives each feature's label column.
            test: when True no locations are read and labels are all zero.
        """
        self.df_data = df_data
        self.df_feature = df_feature
        self.pn_num = df_data['pn_num'].unique()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trim_offsets=False)
        self.test = test

    def __getitem__(self, index):
        """Return ``(pn_history, feature_nums, locations)`` for one note.

        Rows are sorted by ``feature_num``; ``locations`` is ``None`` in test mode.
        """
        pn_num = self.pn_num[index]
        data = self.df_data[self.df_data['pn_num'] == pn_num]
        data = data.sort_values(by=['feature_num'])

        pn_history = data.iloc[0]['pn_history']
        feature_num = list(data['feature_num'])
        if not self.test:
            location = list(data['location'])
        else:
            location = None
        return pn_history, feature_num, location

    @staticmethod
    def _find_token_span(token_ids, query_ids, taken_spans):
        """Find the first occurrence of ``query_ids`` inside ``token_ids``.

        Candidates overlapping any inclusive ``(start, end)`` pair in
        ``taken_spans`` are skipped. Every start position is tried, so a failed
        partial match never hides a later real match.

        Returns:
            Inclusive ``(start, end)`` token indices, or ``None`` when there is
            no match (including when ``query_ids`` is empty).
        """
        query_len = len(query_ids)
        if query_len == 0:
            return None
        for start in range(len(token_ids) - query_len + 1):
            if token_ids[start:start + query_len] != query_ids:
                continue
            end = start + query_len - 1
            if all(end < taken_start or start > taken_end for taken_start, taken_end in taken_spans):
                return start, end
        return None

    def collate_fn(self, batch_data):
        """Tokenize a batch of notes and build the token-level label tensor.

        Labels have shape ``(batch, num_tokens, num_features + 1)``. A token
        covered by a located feature gets 1 in that feature's column; every
        other token keeps 1 in the last ("no feature") column. In test mode the
        labels are all zero.

        Returns:
            ``(encodings, labels)`` where ``labels`` is a ``FloatTensor``.
        """
        pn_histories, feature_nums, locations = list(zip(*batch_data))
        encodings = self.tokenizer(list(pn_histories), padding=True, return_tensors="pt")
        num_batch = len(feature_nums)
        labels = np.zeros((num_batch, encodings['input_ids'].size()[1], len(self.df_feature) + 1))
        if self.test:
            return encodings, torch.FloatTensor(labels)
        labels[:, :, -1] = 1. # Start with every token marked as "no feature".
        for i_batch in range(num_batch):
            note_text = pn_histories[i_batch]
            # Plain Python ints make the subsequence comparison cheap and exact.
            token_ids = encodings['input_ids'][i_batch].tolist()
            for i_data, loc_list in enumerate(locations[i_batch]):
                # Token spans already assigned for this row, so a repeated
                # phrase is mapped to its next occurrence instead of the same one.
                taken_spans = []
                for loc in loc_list:
                    if len(loc) == 0:
                        continue
                    # Tokenize just the annotated text; [1:-1] drops the two
                    # special tokens the tokenizer adds around it.
                    # NOTE(review): the slice treats loc[1] as inclusive — confirm
                    # against the dataset's location convention.
                    query_ids = self.tokenizer(
                        note_text[loc[0]:loc[1]+1], return_tensors="pt"
                    )['input_ids'][0][1:-1].tolist()
                    span = self._find_token_span(token_ids, query_ids, taken_spans)
                    if span is None:
                        continue
                    x_start, x_end = span
                    idx_feature = self.df_feature.index[
                        self.df_feature['feature_num'] == feature_nums[i_batch][i_data]
                    ].item()
                    taken_spans.append((x_start, x_end))
                    labels[i_batch, x_start:x_end+1, idx_feature] = 1.
                    labels[i_batch, x_start:x_end+1, -1] = 0. # These tokens do carry a feature.
        return encodings, torch.FloatTensor(labels)

    def __len__(self):
        """Number of unique patient notes."""
        return len(self.pn_num)

class LSTM_PL(BaseModel):
    """Lightning wrapper that trains the ``LSTM`` tagger on the patient-note data."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _load_model(self):
        """Build the tagger from the stored hyperparameters."""
        return LSTM(self.hparams.model_name, self.hparams.feature_num, self.hparams.hidden_size,
               self.hparams.num_layer, self.hparams.dropout)

    def _load_splits(self):
        """Load the data once and compute one fixed train/dev/test split.

        The permutation is drawn from a generator seeded with the ``seed``
        hyperparameter (42 if absent). Previously a fresh ``torch.randperm`` was
        drawn on every ``get_dataloader`` call, so the train and dev loaders
        came from different permutations and shared patient notes.

        Returns:
            ``(df_data, df_feature, pn_num, indices)`` where ``indices`` maps
            ``"train"``/``"dev"``/``"test"`` to index tensors into ``pn_num``.
        """
        cached = getattr(self, "_split_cache", None)
        if cached is not None:
            return cached

        # dataset path (change if necessary)
        trainpath = DATADIR + '/train.csv'
        pnpath = DATADIR + '/patient_notes.csv'
        featurepath = DATADIR + '/features.csv'
        logger.info(f"Loading data from {trainpath} and pn from {pnpath} and feature from {featurepath}")
        df_data, df_feature = data_preparation(featurepath, pnpath, trainpath, train=True)
        pn_num = df_data['pn_num'].unique()

        generator = torch.Generator()
        generator.manual_seed(int(self.hparams.get("seed", 42)))
        # The last two notes are held out as the test set; the rest is shuffled
        # and cut into 800 training notes and up to 198 dev notes.
        idx_test = torch.Tensor([len(pn_num) - 2, len(pn_num) - 1]).int()
        random_idx = torch.randperm(len(pn_num) - 2, generator=generator)
        indices = {
            "train": random_idx[:800],
            "dev": random_idx[800:998],
            "test": idx_test,
        }
        self._split_cache = (df_data, df_feature, pn_num, indices)
        return self._split_cache

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        """Return the DataLoader for the ``"train"``, ``"dev"`` or ``"test"`` split.

        Raises:
            ValueError: if ``type_path`` is not one of the three split names.
        """
        df_data, df_feature, pn_num, indices = self._load_splits()
        if type_path not in indices:
            raise ValueError(f"Unknown split {type_path!r}; expected one of {sorted(indices)}")
        self.idx_dev = indices["dev"]

        selected_notes = pn_num[indices[type_path].numpy()]
        dataset = SST2Dataset(self.hparams.model_name,
                    df_data[df_data['pn_num'].isin(selected_notes)], df_feature)
        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=self.hparams.num_workers,
            collate_fn=dataset.collate_fn
        )

    def configure_optimizers(self):
        """Create the optimizer selected by ``--optimizer``.

        ``sgd`` (case-insensitive) selects SGD; any other value keeps the
        previous behaviour and uses Adam.
        """
        model = self.model
        optimizer_name = str(self.hparams.get("optimizer", "adam")).lower()
        if optimizer_name == "sgd":
            optimizer = SGD(model.parameters(), lr=self.hparams.learning_rate)
        else:
            optimizer = Adam(model.parameters(), lr=self.hparams.learning_rate)
        self.opt = optimizer
        return [optimizer]

    def batch2input(self, batch):
        """Map the ``(encodings, labels)`` batch onto the model's forward kwargs."""
        return {"encodings": batch[0], "labels": batch[1]}

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        """Register the model options on ``parser`` and return it.

        ``--model_name`` and ``--optimizer`` are no longer marked required, so
        their declared defaults actually apply when the flags are omitted.
        """
        parser.add_argument(
            "--model_name",
            default='emilyalsentzer/Bio_ClinicalBERT',
            type=str,
            help="Name of Pre-trained BERT",
        )
        parser.add_argument(
            "--optimizer",
            default="adam",
            type=str,
            help="Whether to use SGD or not",
        )
        parser.add_argument(
            "--hidden_size",
            default=256,
            type=int,
            help="Dimension of hidden size",
        )
        parser.add_argument(
            "--num_layer",
            default=2,
            type=int,
            help="Dimension of LSTM layers",
        )
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            help="Dropout rate",
        )
        parser.add_argument(
            "--feature_num",
            default=143,
            type=int,
            help="Size of label class",
        )
        return parser

# Bio-Clinical BERT

In [None]:
SAVEDIR = "data/working/BioC"
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
logger = logging.getLogger(__name__)

def main():
    """Train and evaluate the Bio_ClinicalBERT tagger with fixed mock arguments."""
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    # An explicit argument list (rather than a split string) keeps paths that
    # contain whitespace intact.
    mock_args = [
        "--data_dir", DATADIR,
        "--output_dir", SAVEDIR,
        "--optimizer", "adam",
        "--model_name", "emilyalsentzer/Bio_ClinicalBERT",  # change model_name here
        "--learning_rate", "1e-4",
        "--max_epochs", "60",
        "--dropout", "0.1",
        "--num_layer", "1",
        "--train_batch_size", "16",
        "--do_predict",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        # No `task` option is ever registered, so the run folder is named after
        # the model instead (slashes replaced to keep it a single directory).
        run_name = args.model_name.replace("/", "_")
        args.output_dir = os.path.join(
            "./results",
            f"{run_name}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir, exist_ok=True)
    dict_args = vars(args)
    model = LSTM_PL(**dict_args)
    # With --do_predict this is the test result, otherwise the trainer.
    result = generic_train(model, args)


if __name__ == "__main__":
    main()

# Training Simpler Model

In [None]:
class LSTM(torch.nn.Module):
    """
    Simplified tagger: pretrained encoder -> dropout -> linear layer with a
    per-token sigmoid over ``feature_num + 1`` classes. The name and signature
    match the BiLSTM variant so it can be swapped in unchanged.
    """
    def __init__(self, model_name, feature_num, hidden_size=256, num_layer=2, dropout=0.3):
        """
        Args:
            model_name: name/path passed to ``AutoModel.from_pretrained``.
            feature_num: number of features (the output has one extra unit).
            hidden_size: unused here; kept for signature compatibility.
            num_layer: unused here; kept for signature compatibility.
            dropout: dropout probability applied to the encoder output.
        """
        super(LSTM, self).__init__()
        self.model_name = model_name
        self.bert = AutoModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768.
        encoder_dim = self.bert.config.hidden_size
        self.dropout = torch.nn.Dropout(dropout)
        self.output_layer = torch.nn.Linear(encoder_dim, feature_num + 1)
        self.criterion = nn.BCELoss()

    def forward(self, encodings, labels):
        """Return ``(loss, predicted_labels)`` for a batch.

        The loss is binary cross-entropy restricted to entries where the label
        is positive or the model currently predicts positive (prob > 0.5).
        """
        output = self.bert(**encodings)
        output = output['last_hidden_state']
        output = self.dropout(output)
        output = self.output_layer(output)
        probs = torch.sigmoid(output)
        valid_idx = (labels > 0.) | (probs > 0.5)
        if valid_idx.any():
            loss = self.criterion(probs[valid_idx], labels[valid_idx])
        else:
            # BCELoss over an empty selection is NaN; return a zero that is
            # still attached to the graph so backward() keeps working.
            loss = probs.sum() * 0.0
        predicted_labels = (probs > 0.5).int()

        return loss, predicted_labels

In [None]:
SAVEDIR = "data/working/BioC_small"
import logging
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
    )
import time
import argparse
import glob
import os
logger = logging.getLogger(__name__)

def main():
    """Train and evaluate the simplified Bio_ClinicalBERT tagger with fixed mock arguments."""
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    # An explicit argument list (rather than a split string) keeps paths that
    # contain whitespace intact.
    mock_args = [
        "--data_dir", DATADIR,
        "--output_dir", SAVEDIR,
        "--optimizer", "adam",
        "--model_name", "emilyalsentzer/Bio_ClinicalBERT",  # change model_name here
        "--learning_rate", "1e-4",
        "--max_epochs", "25",
        "--dropout", "0.5",
        "--num_layer", "1",
        "--train_batch_size", "16",
        "--do_predict",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        # No `task` option is ever registered, so the run folder is named after
        # the model instead (slashes replaced to keep it a single directory).
        run_name = args.model_name.replace("/", "_")
        args.output_dir = os.path.join(
            "./results",
            f"{run_name}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir, exist_ok=True)
    dict_args = vars(args)
    model = LSTM_PL(**dict_args)
    # With --do_predict this is the test result, otherwise the trainer.
    result = generic_train(model, args)


if __name__ == "__main__":
    main()

03/15/2022 00:36:08 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
03/15/2022 00:36:08 - INFO - __main__ -   Initilazing BaseModel


Namespace(data_dir='data/', do_predict=True, do_train=True, dropout=0.5, eval_batch_size=32, feature_num=143, gpus=1, hidden_size=256, learning_rate=0.0001, max_epochs=25, model_name='emilyalsentzer/Bio_ClinicalBERT', num_layer=1, num_workers=16, optimizer='adam', output_dir='data/working/BioC_small', seed=42, train_batch_size=16)


03/15/2022 00:36:11 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
03/15/2022 00:36:11 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
03/15/2022 00:36:11 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
03/15/2022 00:36:11 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv
  cpuset_checked))
03/15/2022 00:36:12 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

03/15/2022 00:36:13 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv
03/15/2022 00:36:18 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

03/15/2022 00:36:52 - INFO - pytorch_lightning.utilities.distributed -   Epoch 0, global step 49: val_acc reached 0.70359 (best 0.70359), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=0-val_acc=0.70.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:37:37 - INFO - pytorch_lightning.utilities.distributed -   Epoch 1, global step 99: val_acc reached 0.72153 (best 0.72153), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=1-val_acc=0.72.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:38:20 - INFO - pytorch_lightning.utilities.distributed -   Epoch 2, global step 149: val_acc reached 0.75710 (best 0.75710), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=2-val_acc=0.76.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:39:02 - INFO - pytorch_lightning.utilities.distributed -   Epoch 3, global step 199: val_acc reached 0.77817 (best 0.77817), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=3-val_acc=0.78.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:39:45 - INFO - pytorch_lightning.utilities.distributed -   Epoch 4, global step 249: val_acc reached 0.79178 (best 0.79178), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=4-val_acc=0.79.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:40:27 - INFO - pytorch_lightning.utilities.distributed -   Epoch 5, global step 299: val_acc reached 0.81979 (best 0.81979), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=5-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:41:08 - INFO - pytorch_lightning.utilities.distributed -   Epoch 6, global step 349: val_acc reached 0.83683 (best 0.83683), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=6-val_acc=0.84.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:41:51 - INFO - pytorch_lightning.utilities.distributed -   Epoch 7, global step 399: val_acc reached 0.85567 (best 0.85567), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=7-val_acc=0.86.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:42:33 - INFO - pytorch_lightning.utilities.distributed -   Epoch 8, global step 449: val_acc reached 0.88444 (best 0.88444), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=8-val_acc=0.88.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:43:15 - INFO - pytorch_lightning.utilities.distributed -   Epoch 9, global step 499: val_acc reached 0.89340 (best 0.89340), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=9-val_acc=0.89.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:43:57 - INFO - pytorch_lightning.utilities.distributed -   Epoch 10, global step 549: val_acc reached 0.91111 (best 0.91111), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=10-val_acc=0.91.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:44:40 - INFO - pytorch_lightning.utilities.distributed -   Epoch 11, global step 599: val_acc reached 0.92313 (best 0.92313), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=11-val_acc=0.92.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:45:21 - INFO - pytorch_lightning.utilities.distributed -   Epoch 12, global step 649: val_acc reached 0.93557 (best 0.93557), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=12-val_acc=0.94.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:46:04 - INFO - pytorch_lightning.utilities.distributed -   Epoch 13, global step 699: val_acc reached 0.94456 (best 0.94456), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=13-val_acc=0.94.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:46:47 - INFO - pytorch_lightning.utilities.distributed -   Epoch 14, global step 749: val_acc reached 0.94600 (best 0.94600), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=14-val_acc=0.95.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:47:30 - INFO - pytorch_lightning.utilities.distributed -   Epoch 15, global step 799: val_acc reached 0.95300 (best 0.95300), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=15-val_acc=0.95.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:48:12 - INFO - pytorch_lightning.utilities.distributed -   Epoch 16, global step 849: val_acc reached 0.95656 (best 0.95656), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=16-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:48:54 - INFO - pytorch_lightning.utilities.distributed -   Epoch 17, global step 899: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:49:27 - INFO - pytorch_lightning.utilities.distributed -   Epoch 18, global step 949: val_acc reached 0.96000 (best 0.96000), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=18-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:50:09 - INFO - pytorch_lightning.utilities.distributed -   Epoch 19, global step 999: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:50:42 - INFO - pytorch_lightning.utilities.distributed -   Epoch 20, global step 1049: val_acc reached 0.96327 (best 0.96327), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=20-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:51:25 - INFO - pytorch_lightning.utilities.distributed -   Epoch 21, global step 1099: val_acc reached 0.96715 (best 0.96715), saving model to "/content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=21-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:52:07 - INFO - pytorch_lightning.utilities.distributed -   Epoch 22, global step 1149: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:52:40 - INFO - pytorch_lightning.utilities.distributed -   Epoch 23, global step 1199: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 00:53:13 - INFO - pytorch_lightning.utilities.distributed -   Epoch 24, global step 1249: val_acc was not in top 1
03/15/2022 00:53:14 - INFO - __main__ -   Copy best model from /content/drive/MyDrive/NLP/data/working/BioC_small/version_15-03-2022--00-36-11/checkpoints/epoch=21-val_acc=0.97.ckpt to data/working/BioC_small/best_model.ckpt.
03/15/2022 00:53:23 - INFO - __main__ -   Initilazing BaseModel
03/15/2022 00:53:28 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
03/15/2022 00:53:30 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9300225973129272, 'test_loss': 0.17484189569950104}
--------------------------------------------------------------------------------


# Training distilbert

In [None]:
SAVEDIR = "data/working/distilbert"
def main():
    """Train and evaluate the tagger on top of distilbert-base-uncased with fixed mock arguments."""
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    # An explicit argument list (rather than a split string) keeps paths that
    # contain whitespace intact.
    mock_args = [
        "--data_dir", DATADIR,
        "--output_dir", SAVEDIR,
        "--optimizer", "adam",
        "--model_name", "distilbert-base-uncased",  # change model_name here
        "--learning_rate", "1e-4",
        "--max_epochs", "50",
        "--dropout", "0.5",
        "--num_layer", "1",
        "--train_batch_size", "16",
        "--do_predict",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        # No `task` option is ever registered, so the run folder is named after
        # the model instead (slashes replaced to keep it a single directory).
        run_name = args.model_name.replace("/", "_")
        args.output_dir = os.path.join(
            "./results",
            f"{run_name}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir, exist_ok=True)
    dict_args = vars(args)
    model = LSTM_PL(**dict_args)
    # With --do_predict this is the test result, otherwise the trainer.
    result = generic_train(model, args)


if __name__ == "__main__":
    main()

03/15/2022 06:17:56 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
03/15/2022 06:17:56 - INFO - __main__ -   Initilazing BaseModel


Namespace(data_dir='data/', do_predict=True, do_train=True, dropout=0.5, eval_batch_size=32, feature_num=143, gpus=1, hidden_size=256, learning_rate=0.0001, max_epochs=50, model_name='distilbert-base-uncased', num_layer=1, num_workers=16, optimizer='adam', output_dir='data/working/distilbert', seed=42, train_batch_size=16)


03/15/2022 06:17:57 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
03/15/2022 06:17:57 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
03/15/2022 06:17:57 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
03/15/2022 06:17:57 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv
  cpuset_checked))
03/15/2022 06:17:59 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

03/15/2022 06:18:00 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv
03/15/2022 06:18:05 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

03/15/2022 06:18:30 - INFO - pytorch_lightning.utilities.distributed -   Epoch 0, global step 49: val_acc reached 0.68320 (best 0.68320), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=0-val_acc=0.68.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:19:02 - INFO - pytorch_lightning.utilities.distributed -   Epoch 1, global step 99: val_acc reached 0.71198 (best 0.71198), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=1-val_acc=0.71.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:19:30 - INFO - pytorch_lightning.utilities.distributed -   Epoch 2, global step 149: val_acc reached 0.73319 (best 0.73319), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=2-val_acc=0.73.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:19:59 - INFO - pytorch_lightning.utilities.distributed -   Epoch 3, global step 199: val_acc reached 0.74634 (best 0.74634), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=3-val_acc=0.75.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:20:27 - INFO - pytorch_lightning.utilities.distributed -   Epoch 4, global step 249: val_acc reached 0.76057 (best 0.76057), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=4-val_acc=0.76.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:20:55 - INFO - pytorch_lightning.utilities.distributed -   Epoch 5, global step 299: val_acc reached 0.76514 (best 0.76514), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=5-val_acc=0.77.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:21:23 - INFO - pytorch_lightning.utilities.distributed -   Epoch 6, global step 349: val_acc reached 0.77202 (best 0.77202), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=6-val_acc=0.77.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:21:51 - INFO - pytorch_lightning.utilities.distributed -   Epoch 7, global step 399: val_acc reached 0.77657 (best 0.77657), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=7-val_acc=0.78.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:22:19 - INFO - pytorch_lightning.utilities.distributed -   Epoch 8, global step 449: val_acc reached 0.77849 (best 0.77849), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=8-val_acc=0.78.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:22:47 - INFO - pytorch_lightning.utilities.distributed -   Epoch 9, global step 499: val_acc reached 0.78492 (best 0.78492), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=9-val_acc=0.78.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:23:15 - INFO - pytorch_lightning.utilities.distributed -   Epoch 10, global step 549: val_acc reached 0.78497 (best 0.78497), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=10-val_acc=0.78.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:23:43 - INFO - pytorch_lightning.utilities.distributed -   Epoch 11, global step 599: val_acc reached 0.79462 (best 0.79462), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=11-val_acc=0.79.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:24:11 - INFO - pytorch_lightning.utilities.distributed -   Epoch 12, global step 649: val_acc reached 0.79794 (best 0.79794), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=12-val_acc=0.80.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:24:39 - INFO - pytorch_lightning.utilities.distributed -   Epoch 13, global step 699: val_acc reached 0.81136 (best 0.81136), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=13-val_acc=0.81.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:25:08 - INFO - pytorch_lightning.utilities.distributed -   Epoch 14, global step 749: val_acc reached 0.81215 (best 0.81215), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=14-val_acc=0.81.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:25:36 - INFO - pytorch_lightning.utilities.distributed -   Epoch 15, global step 799: val_acc reached 0.82000 (best 0.82000), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=15-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:26:04 - INFO - pytorch_lightning.utilities.distributed -   Epoch 16, global step 849: val_acc reached 0.82460 (best 0.82460), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=16-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:26:32 - INFO - pytorch_lightning.utilities.distributed -   Epoch 17, global step 899: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:26:56 - INFO - pytorch_lightning.utilities.distributed -   Epoch 18, global step 949: val_acc reached 0.82795 (best 0.82795), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=18-val_acc=0.83.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:27:24 - INFO - pytorch_lightning.utilities.distributed -   Epoch 19, global step 999: val_acc reached 0.83767 (best 0.83767), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=19-val_acc=0.84.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:27:52 - INFO - pytorch_lightning.utilities.distributed -   Epoch 20, global step 1049: val_acc reached 0.84636 (best 0.84636), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=20-val_acc=0.85.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:28:19 - INFO - pytorch_lightning.utilities.distributed -   Epoch 21, global step 1099: val_acc reached 0.87556 (best 0.87556), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=21-val_acc=0.88.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:28:50 - INFO - pytorch_lightning.utilities.distributed -   Epoch 22, global step 1149: val_acc reached 0.90271 (best 0.90271), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=22-val_acc=0.90.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:29:18 - INFO - pytorch_lightning.utilities.distributed -   Epoch 23, global step 1199: val_acc reached 0.92388 (best 0.92388), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=23-val_acc=0.92.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:29:46 - INFO - pytorch_lightning.utilities.distributed -   Epoch 24, global step 1249: val_acc reached 0.92836 (best 0.92836), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=24-val_acc=0.93.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:30:15 - INFO - pytorch_lightning.utilities.distributed -   Epoch 25, global step 1299: val_acc reached 0.93958 (best 0.93958), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=25-val_acc=0.94.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:30:44 - INFO - pytorch_lightning.utilities.distributed -   Epoch 26, global step 1349: val_acc reached 0.94711 (best 0.94711), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=26-val_acc=0.95.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:31:12 - INFO - pytorch_lightning.utilities.distributed -   Epoch 27, global step 1399: val_acc reached 0.94919 (best 0.94919), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=27-val_acc=0.95.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:31:40 - INFO - pytorch_lightning.utilities.distributed -   Epoch 28, global step 1449: val_acc reached 0.95797 (best 0.95797), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=28-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:32:10 - INFO - pytorch_lightning.utilities.distributed -   Epoch 29, global step 1499: val_acc reached 0.95810 (best 0.95810), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=29-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:32:39 - INFO - pytorch_lightning.utilities.distributed -   Epoch 30, global step 1549: val_acc reached 0.95818 (best 0.95818), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=30-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:33:10 - INFO - pytorch_lightning.utilities.distributed -   Epoch 31, global step 1599: val_acc reached 0.96141 (best 0.96141), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=31-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:33:39 - INFO - pytorch_lightning.utilities.distributed -   Epoch 32, global step 1649: val_acc reached 0.96245 (best 0.96245), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=32-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:34:08 - INFO - pytorch_lightning.utilities.distributed -   Epoch 33, global step 1699: val_acc reached 0.96354 (best 0.96354), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=33-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:34:36 - INFO - pytorch_lightning.utilities.distributed -   Epoch 34, global step 1749: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:34:59 - INFO - pytorch_lightning.utilities.distributed -   Epoch 35, global step 1799: val_acc reached 0.96670 (best 0.96670), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=35-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:35:28 - INFO - pytorch_lightning.utilities.distributed -   Epoch 36, global step 1849: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:35:52 - INFO - pytorch_lightning.utilities.distributed -   Epoch 37, global step 1899: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:36:15 - INFO - pytorch_lightning.utilities.distributed -   Epoch 38, global step 1949: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:36:39 - INFO - pytorch_lightning.utilities.distributed -   Epoch 39, global step 1999: val_acc reached 0.96780 (best 0.96780), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=39-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:37:07 - INFO - pytorch_lightning.utilities.distributed -   Epoch 40, global step 2049: val_acc reached 0.96878 (best 0.96878), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=40-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:37:36 - INFO - pytorch_lightning.utilities.distributed -   Epoch 41, global step 2099: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:38:00 - INFO - pytorch_lightning.utilities.distributed -   Epoch 42, global step 2149: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:38:23 - INFO - pytorch_lightning.utilities.distributed -   Epoch 43, global step 2199: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:38:47 - INFO - pytorch_lightning.utilities.distributed -   Epoch 44, global step 2249: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:39:10 - INFO - pytorch_lightning.utilities.distributed -   Epoch 45, global step 2299: val_acc reached 0.96958 (best 0.96958), saving model to "/content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=45-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:39:39 - INFO - pytorch_lightning.utilities.distributed -   Epoch 46, global step 2349: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:40:02 - INFO - pytorch_lightning.utilities.distributed -   Epoch 47, global step 2399: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:40:25 - INFO - pytorch_lightning.utilities.distributed -   Epoch 48, global step 2449: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/15/2022 06:40:49 - INFO - pytorch_lightning.utilities.distributed -   Epoch 49, global step 2499: val_acc was not in top 1
03/15/2022 06:40:50 - INFO - __main__ -   Copy best model from /content/drive/MyDrive/NLP/data/working/distilbert/version_15-03-2022--06-17-57/checkpoints/epoch=45-val_acc=0.97.ckpt to data/working/distilbert/best_model.ckpt.
03/15/2022 06:40:55 - INFO - __main__ -   Initilazing BaseModel
03/15/2022 06:40:57 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
03/15/2022 06:40:57 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8767442107200623, 'test_loss': 0.37659215927124023}
--------------------------------------------------------------------------------


# Training RoBERTa

In [None]:
# Output directory for this run. main() below reads the notebook-level SAVEDIR
# when it assembles --output_dir, so it is rebound here before the call.
SAVEDIR = "data/working/roberta"
def main():
    """Run the LSTM_PL pipeline with ``roberta-large`` as ``--model_name``.

    Assembles a CLI-style argument list, parses it with the options registered
    by ``BaseModel.add_generic_args`` and ``LSTM_PL.add_model_specific_args``,
    seeds the RNGs through ``pl.seed_everything``, builds ``LSTM_PL`` from the
    parsed arguments and hands model and arguments to ``generic_train``.

    Reads the notebook-level ``DATADIR`` and ``SAVEDIR``. Returns ``None``.
    """
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    # Kept as a token list instead of one string passed through ``.split()``:
    # a data/output path containing whitespace would otherwise be torn into
    # several bogus arguments.
    mock_args = [
        "--data_dir", str(DATADIR),
        "--output_dir", str(SAVEDIR),
        "--optimizer", "adam",
        "--model_name", "roberta-large",  # change model_name here
        "--learning_rate", "1e-4",
        "--max_epochs", "10",
        "--dropout", "0.5",
        "--num_layer", "1",
        "--train_batch_size", "5",
        "--eval_batch_size", "5",
        "--do_predict",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        # The parsed namespace is not guaranteed to carry a ``task`` attribute;
        # reading ``args.task`` unconditionally would then raise
        # AttributeError. Fall back to the model name; "/" is replaced so that
        # a hub id such as "org/model" stays a single path component.
        if hasattr(args, "task"):
            run_name = args.task
        else:
            run_name = args.model_name.replace("/", "_")
        args.output_dir = os.path.join(
            "./results",
            f"{run_name}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir)
    dict_args = vars(args)
    model = LSTM_PL(**dict_args)
    # The returned trainer was never used, so it is no longer bound to a local.
    generic_train(model, args)


if __name__ == "__main__":
    main()

# Training Bio-Discharge BERT

In [None]:
# Output directory for this run. main() below reads the notebook-level SAVEDIR
# when it assembles --output_dir, so it is rebound here before the call.
SAVEDIR = "data/working/bioD"
def main():
    """Run the LSTM_PL pipeline with ``emilyalsentzer/Bio_Discharge_Summary_BERT``.

    Assembles a CLI-style argument list, parses it with the options registered
    by ``BaseModel.add_generic_args`` and ``LSTM_PL.add_model_specific_args``,
    seeds the RNGs through ``pl.seed_everything``, builds ``LSTM_PL`` from the
    parsed arguments and hands model and arguments to ``generic_train``.

    Reads the notebook-level ``DATADIR`` and ``SAVEDIR``. Returns ``None``.
    """
    ########################################################
    ## TODO: change args if needed according to your files #
    ########################################################
    # Kept as a token list instead of one string passed through ``.split()``:
    # a data/output path containing whitespace would otherwise be torn into
    # several bogus arguments.
    mock_args = [
        "--data_dir", str(DATADIR),
        "--output_dir", str(SAVEDIR),
        "--optimizer", "adam",
        "--model_name", "emilyalsentzer/Bio_Discharge_Summary_BERT",  # change model_name here
        "--learning_rate", "1e-4",
        "--max_epochs", "60",
        "--dropout", "0.1",
        "--num_layer", "1",
        "--train_batch_size", "16",
        "--do_predict",
    ]

    # load hyperparameters
    parser = argparse.ArgumentParser()
    BaseModel.add_generic_args(parser, os.getcwd())
    parser = LSTM_PL.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args(mock_args)
    print(args)
    # fix random seed to make sure the result is reproducible
    pl.seed_everything(args.seed)

    # If output_dir not provided, a folder will be generated in pwd
    if args.output_dir is None:
        # The namespace printed by this cell carries no ``task`` attribute, so
        # reading ``args.task`` unconditionally would raise AttributeError.
        # Fall back to the model name; "/" is replaced because this model id
        # contains one and must stay a single path component.
        if hasattr(args, "task"):
            run_name = args.task
        else:
            run_name = args.model_name.replace("/", "_")
        args.output_dir = os.path.join(
            "./results",
            f"{run_name}_{time.strftime('%Y%m%d_%H%M%S')}",
        )
        os.makedirs(args.output_dir)
    dict_args = vars(args)
    model = LSTM_PL(**dict_args)
    # The returned trainer was never used, so it is no longer bound to a local.
    generic_train(model, args)


if __name__ == "__main__":
    main()

03/14/2022 19:24:42 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42
03/14/2022 19:24:42 - INFO - __main__ -   Initilazing BaseModel


Namespace(data_dir='data/', do_predict=True, do_train=True, dropout=0.1, eval_batch_size=32, feature_num=143, gpus=1, hidden_size=256, learning_rate=0.0001, max_epochs=60, model_name='emilyalsentzer/Bio_Discharge_Summary_BERT', num_layer=1, num_workers=16, optimizer='adam', output_dir='data/working/bioD', seed=42, train_batch_size=16)


Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

03/14/2022 19:25:09 - INFO - pytorch_lightning.utilities.distributed -   GPU available: True, used: True
03/14/2022 19:25:09 - INFO - pytorch_lightning.utilities.distributed -   TPU available: False, using: 0 TPU cores
03/14/2022 19:25:09 - INFO - pytorch_lightning.utilities.distributed -   IPU available: False, using: 0 IPUs
03/14/2022 19:25:09 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

  cpuset_checked))
03/14/2022 19:25:16 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation sanity check: 0it [00:00, ?it/s]

03/14/2022 19:25:17 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv
03/14/2022 19:25:26 - INFO - pytorch_lightning.utilities.seed -   Global seed set to 42


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

03/14/2022 19:26:03 - INFO - pytorch_lightning.utilities.distributed -   Epoch 0, global step 49: val_acc reached 0.67265 (best 0.67265), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=0-val_acc=0.67.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:26:52 - INFO - pytorch_lightning.utilities.distributed -   Epoch 1, global step 99: val_acc reached 0.72345 (best 0.72345), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=1-val_acc=0.72.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:27:39 - INFO - pytorch_lightning.utilities.distributed -   Epoch 2, global step 149: val_acc reached 0.74678 (best 0.74678), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=2-val_acc=0.75.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:28:25 - INFO - pytorch_lightning.utilities.distributed -   Epoch 3, global step 199: val_acc reached 0.76037 (best 0.76037), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=3-val_acc=0.76.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:29:12 - INFO - pytorch_lightning.utilities.distributed -   Epoch 4, global step 249: val_acc reached 0.76802 (best 0.76802), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=4-val_acc=0.77.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:29:58 - INFO - pytorch_lightning.utilities.distributed -   Epoch 5, global step 299: val_acc reached 0.78232 (best 0.78232), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=5-val_acc=0.78.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:30:44 - INFO - pytorch_lightning.utilities.distributed -   Epoch 6, global step 349: val_acc reached 0.78827 (best 0.78827), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=6-val_acc=0.79.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:31:31 - INFO - pytorch_lightning.utilities.distributed -   Epoch 7, global step 399: val_acc reached 0.80015 (best 0.80015), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=7-val_acc=0.80.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:32:17 - INFO - pytorch_lightning.utilities.distributed -   Epoch 8, global step 449: val_acc reached 0.80405 (best 0.80405), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=8-val_acc=0.80.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:33:05 - INFO - pytorch_lightning.utilities.distributed -   Epoch 9, global step 499: val_acc reached 0.80820 (best 0.80820), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=9-val_acc=0.81.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:33:51 - INFO - pytorch_lightning.utilities.distributed -   Epoch 10, global step 549: val_acc reached 0.81613 (best 0.81613), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=10-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:34:37 - INFO - pytorch_lightning.utilities.distributed -   Epoch 11, global step 599: val_acc reached 0.81687 (best 0.81687), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=11-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:35:24 - INFO - pytorch_lightning.utilities.distributed -   Epoch 12, global step 649: val_acc reached 0.82001 (best 0.82001), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=12-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:36:11 - INFO - pytorch_lightning.utilities.distributed -   Epoch 13, global step 699: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:36:47 - INFO - pytorch_lightning.utilities.distributed -   Epoch 14, global step 749: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:37:22 - INFO - pytorch_lightning.utilities.distributed -   Epoch 15, global step 799: val_acc reached 0.82463 (best 0.82463), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=15-val_acc=0.82.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:38:09 - INFO - pytorch_lightning.utilities.distributed -   Epoch 16, global step 849: val_acc reached 0.83007 (best 0.83007), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=16-val_acc=0.83.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:38:56 - INFO - pytorch_lightning.utilities.distributed -   Epoch 17, global step 899: val_acc reached 0.85143 (best 0.85143), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=17-val_acc=0.85.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:39:42 - INFO - pytorch_lightning.utilities.distributed -   Epoch 18, global step 949: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:40:18 - INFO - pytorch_lightning.utilities.distributed -   Epoch 19, global step 999: val_acc reached 0.86308 (best 0.86308), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=19-val_acc=0.86.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:41:05 - INFO - pytorch_lightning.utilities.distributed -   Epoch 20, global step 1049: val_acc reached 0.87434 (best 0.87434), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=20-val_acc=0.87.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:41:52 - INFO - pytorch_lightning.utilities.distributed -   Epoch 21, global step 1099: val_acc reached 0.89040 (best 0.89040), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=21-val_acc=0.89.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:42:39 - INFO - pytorch_lightning.utilities.distributed -   Epoch 22, global step 1149: val_acc reached 0.89802 (best 0.89802), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=22-val_acc=0.90.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:43:26 - INFO - pytorch_lightning.utilities.distributed -   Epoch 23, global step 1199: val_acc reached 0.92265 (best 0.92265), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=23-val_acc=0.92.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:44:14 - INFO - pytorch_lightning.utilities.distributed -   Epoch 24, global step 1249: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:44:49 - INFO - pytorch_lightning.utilities.distributed -   Epoch 25, global step 1299: val_acc reached 0.93285 (best 0.93285), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=25-val_acc=0.93.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:45:35 - INFO - pytorch_lightning.utilities.distributed -   Epoch 26, global step 1349: val_acc reached 0.94428 (best 0.94428), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=26-val_acc=0.94.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:46:22 - INFO - pytorch_lightning.utilities.distributed -   Epoch 27, global step 1399: val_acc reached 0.94619 (best 0.94619), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=27-val_acc=0.95.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:47:13 - INFO - pytorch_lightning.utilities.distributed -   Epoch 28, global step 1449: val_acc reached 0.95192 (best 0.95192), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=28-val_acc=0.95.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:47:59 - INFO - pytorch_lightning.utilities.distributed -   Epoch 29, global step 1499: val_acc reached 0.95768 (best 0.95768), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=29-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:48:47 - INFO - pytorch_lightning.utilities.distributed -   Epoch 30, global step 1549: val_acc reached 0.95821 (best 0.95821), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=30-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:49:33 - INFO - pytorch_lightning.utilities.distributed -   Epoch 31, global step 1599: val_acc reached 0.96015 (best 0.96015), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=31-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:50:19 - INFO - pytorch_lightning.utilities.distributed -   Epoch 32, global step 1649: val_acc reached 0.96249 (best 0.96249), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=32-val_acc=0.96.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:51:05 - INFO - pytorch_lightning.utilities.distributed -   Epoch 33, global step 1699: val_acc reached 0.96661 (best 0.96661), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=33-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:51:52 - INFO - pytorch_lightning.utilities.distributed -   Epoch 34, global step 1749: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:52:29 - INFO - pytorch_lightning.utilities.distributed -   Epoch 35, global step 1799: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:53:05 - INFO - pytorch_lightning.utilities.distributed -   Epoch 36, global step 1849: val_acc reached 0.96743 (best 0.96743), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=36-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:53:52 - INFO - pytorch_lightning.utilities.distributed -   Epoch 37, global step 1899: val_acc reached 0.96917 (best 0.96917), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=37-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:54:38 - INFO - pytorch_lightning.utilities.distributed -   Epoch 38, global step 1949: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:55:14 - INFO - pytorch_lightning.utilities.distributed -   Epoch 39, global step 1999: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:55:49 - INFO - pytorch_lightning.utilities.distributed -   Epoch 40, global step 2049: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:56:25 - INFO - pytorch_lightning.utilities.distributed -   Epoch 41, global step 2099: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:57:00 - INFO - pytorch_lightning.utilities.distributed -   Epoch 42, global step 2149: val_acc reached 0.97180 (best 0.97180), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=42-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:57:48 - INFO - pytorch_lightning.utilities.distributed -   Epoch 43, global step 2199: val_acc reached 0.97236 (best 0.97236), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=43-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:58:35 - INFO - pytorch_lightning.utilities.distributed -   Epoch 44, global step 2249: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:59:11 - INFO - pytorch_lightning.utilities.distributed -   Epoch 45, global step 2299: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 19:59:46 - INFO - pytorch_lightning.utilities.distributed -   Epoch 46, global step 2349: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:00:22 - INFO - pytorch_lightning.utilities.distributed -   Epoch 47, global step 2399: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:00:57 - INFO - pytorch_lightning.utilities.distributed -   Epoch 48, global step 2449: val_acc reached 0.97276 (best 0.97276), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=48-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:01:44 - INFO - pytorch_lightning.utilities.distributed -   Epoch 49, global step 2499: val_acc reached 0.97397 (best 0.97397), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=49-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:02:30 - INFO - pytorch_lightning.utilities.distributed -   Epoch 50, global step 2549: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:03:06 - INFO - pytorch_lightning.utilities.distributed -   Epoch 51, global step 2599: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:03:42 - INFO - pytorch_lightning.utilities.distributed -   Epoch 52, global step 2649: val_acc reached 0.97463 (best 0.97463), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=52-val_acc=0.97.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:04:30 - INFO - pytorch_lightning.utilities.distributed -   Epoch 53, global step 2699: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:05:06 - INFO - pytorch_lightning.utilities.distributed -   Epoch 54, global step 2749: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:05:43 - INFO - pytorch_lightning.utilities.distributed -   Epoch 55, global step 2799: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:06:20 - INFO - pytorch_lightning.utilities.distributed -   Epoch 56, global step 2849: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:06:57 - INFO - pytorch_lightning.utilities.distributed -   Epoch 57, global step 2899: val_acc was not in top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:07:33 - INFO - pytorch_lightning.utilities.distributed -   Epoch 58, global step 2949: val_acc reached 0.97569 (best 0.97569), saving model to "/content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=58-val_acc=0.98.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

03/14/2022 20:08:20 - INFO - pytorch_lightning.utilities.distributed -   Epoch 59, global step 2999: val_acc was not in top 1
03/14/2022 20:08:21 - INFO - __main__ -   Copy best model from /content/drive/MyDrive/NLP/data/working/bioD/version_14-03-2022--19-25-09/checkpoints/epoch=58-val_acc=0.98.ckpt to data/working/bioD/best_model.ckpt.
03/14/2022 20:08:45 - INFO - __main__ -   Initilazing BaseModel
03/14/2022 20:08:52 - INFO - pytorch_lightning.accelerators.gpu -   LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
03/14/2022 20:08:54 - INFO - __main__ -   Loading data from data//train.csv and pn from data//patient_notes.csv and feature from data//features.csv


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9454545378684998, 'test_loss': 0.19716587662696838}
--------------------------------------------------------------------------------


# Prediction

In [None]:
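# One-off export (kept for reference): writes the dev-split rows to validation.csv, which the "Write Results" section reads via `valpath`.
# val = df_data['pn_num'].unique()[model.idx_dev]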
# val_idx = [pn in list(val) for pn in df_data['pn_num']]
# val_df = df_data.loc[val_idx, :]
# val_df.to_csv(DATADIR + 'validation.csv')

In [None]:
class LocPredictor:
    """Predict per-feature character spans for patient notes from a checkpoint.

    The predictor loads an ``LSTM_PL`` checkpoint once, then answers
    ``get_loc(pn_num)`` queries. Results are cached per ``pn_num`` in
    ``self.prediction`` so each note is only run through the model once.

    Args:
        model_path: Path of the Lightning checkpoint to load.
        df_data: DataFrame with a ``pn_num`` column; rows for one note are
            selected from it and handed to ``SST2Dataset``.
        df_feature: Feature DataFrame handed to ``SST2Dataset`` unchanged.
    """

    # Label id that is skipped when collecting spans.
    # NOTE(review): presumably the extra "no feature" class (== len(feat_lst));
    # confirm against the model's label space.
    NO_FEATURE_LABEL = 143

    def __init__(self, model_path, df_data, df_feature):
        # Prefer the GPU (original behaviour) but stay usable on CPU-only hosts.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = LSTM_PL.load_from_checkpoint(model_path).to(self.device)
        # load_from_checkpoint does not switch the module to eval mode; without
        # this, dropout would stay active and predictions would be noisy.
        self.model.eval()
        self.df_data = df_data
        self.df_feature = df_feature
        self.prediction = {}
        self.pn_processed = set()
        self.tokenizer = AutoTokenizer.from_pretrained(self.model.hparams.model_name, trim_offsets=False)

    @staticmethod
    def _merge_offsets(offsets):
        """Merge consecutive (begin, end) offsets into ``"begin end"`` strings.

        Two neighbouring offsets are joined when the next one starts 0 or 1
        characters after the current span ends; anything else starts a new span.
        Returns an empty list for empty input.
        """
        if not offsets:
            return []
        spans = []
        start, stop = offsets[0]
        for begin, end in offsets[1:]:
            if 0 <= (begin - stop) <= 1:
                stop = end
            else:
                spans.append(f'{start} {stop}')
                start, stop = begin, end
        spans.append(f'{start} {stop}')
        return spans

    def get_loc(self, pn_num):
        """Return ``{feature_num: "b e;b e;..."}`` for the note ``pn_num``.

        Features without any predicted token map to the empty string. The
        result is cached, so repeated calls with the same ``pn_num`` return
        the stored dictionary without running the model again.

        Relies on the notebook-level ``feat_lst`` to translate a predicted
        label id into a feature number.
        """
        if pn_num in self.pn_processed:
            return self.prediction[pn_num]

        note_rows = self.df_data[self.df_data['pn_num'].astype(int) == pn_num]
        test_dataset = SST2Dataset(self.model.hparams.model_name, note_rows, self.df_feature)
        pn_history, feature_num, location = test_dataset[0]
        batch = test_dataset.collate_fn([(pn_history, feature_num, location)])
        batch = self.model.transfer_batch_to_device(batch, self.device, 0)
        model_input = self.model.batch2input(batch)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            _, predicted_labels = self.model(**model_input)

        labels = predicted_labels.argmax(dim=2)
        labels_dct = {k: '' for k in map(int, feature_num)}
        offset_mapping = self.tokenizer(pn_history,
                                        return_offsets_mapping=True)['offset_mapping']
        for label_id in labels.unique().tolist():
            if label_id == self.NO_FEATURE_LABEL:
                continue
            # Second index of torch.where = token positions carrying this label.
            token_positions = torch.where(labels == label_id)[1].tolist()
            offsets = [offset_mapping[pos] for pos in token_positions]
            lab = feat_lst[label_id]
            # Only keep features that belong to this note's case.
            if lab in labels_dct:
                labels_dct[lab] = ";".join(self._merge_offsets(offsets))

        self.prediction[pn_num] = labels_dct
        self.pn_processed.add(pn_num)
        return labels_dct

## Saving Model File for Kaggle

In [None]:
# Fetch the pretrained "roberta-base" weights and write them to SAVEDIR.
# NOTE: this rebinds the notebook-wide SAVEDIR (set to "data/working" in Setup)
# and the name `model`; every cell executed after this one sees the new values.
SAVEDIR = "data/working/roberta"
model = RobertaModel.from_pretrained("roberta-base")
model.save_pretrained(SAVEDIR)

In [None]:
# Fast "roberta-base" tokenizer; trim_offsets=False is the same setting LocPredictor
# passes to its tokenizer. The next cell pickles this object.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base", trim_offsets=False)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
import pickle

# Serialize the `tokenizer` object to SAVEDIR/tokenizer.pickle using the highest pickle protocol.
# NOTE(review): a pickle must only be loaded from a trusted source, and presumably
# needs a compatible library version on the loading side — confirm for the target environment.
with open(SAVEDIR + '/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# model_path = os.path.join(SAVEDIR, 'best_model.ckpt')
# model = LSTM_PL.load_from_checkpoint(model_path).to('cuda')
# SAVE_BEST = os.path.join(SAVEDIR, 'model.pt')
# with open(SAVE_BEST, 'wb') as f:
#     torch.save(model.state_dict(), f)
# with open(SAVE_BEST, 'rb') as f:
#     model.load_state_dict(torch.load(f))
#     print(model.model.model_name)

03/15/2022 19:00:15 - INFO - __main__ -   Initilazing BaseModel


distilbert-base-uncased


## Write Results

In [None]:
# Validation split read by write_output. os.path.join avoids the doubled
# separator that DATADIR + '/validation.csv' produced (DATADIR already ends in "/").
valpath = os.path.join(DATADIR, 'validation.csv')
feature = pd.read_csv(featurepath)
# feature_num values in file order; LocPredictor.get_loc indexes this list
# with the predicted label id to recover the feature number.
feat_lst = feature.feature_num.tolist()

In [None]:
def write_output(model_dir):
    """Predict spans for every validation note and save them as a CSV.

    Loads ``best_model.ckpt`` from ``model_dir``, runs it over each unique
    ``pn_num`` found in the CSV at the notebook-level ``valpath``, and writes
    ``prediction.csv`` into ``model_dir`` with the columns ``id`` and
    ``location``. Each ``id`` is the note number zero-padded to 5 digits, an
    underscore, and the feature number zero-padded to 3 digits.

    Uses the notebook-level ``df_data`` and ``df_feature`` frames.
    """
    checkpoint_path = os.path.join(model_dir, 'best_model.ckpt')
    predictor = LocPredictor(checkpoint_path, df_data, df_feature)
    output_path = os.path.join(model_dir, 'prediction.csv')

    note_ids = pd.read_csv(valpath).pn_num.unique()
    row_ids, spans = [], []

    # One progress tick per note; each note yields one row per feature.
    for note_id in tqdm(note_ids, total=len(note_ids), leave=True):
        for feat, pred in loc_items(predictor, note_id):
            row_ids.append(f'{str(note_id).zfill(5)}_{str(feat).zfill(3)}')
            spans.append(pred)

    submission = pd.DataFrame(data={'id': row_ids, 'location': spans})
    submission.to_csv(output_path, index=False)


def loc_items(predictor, note_id):
    """Return the (feature, prediction) pairs ``predictor`` yields for ``note_id``."""
    return predictor.get_loc(note_id).items()

In [None]:
# Produce prediction.csv for each trained model directory, in this order.
for model_dir in (
    "data/working/BioC",
    "data/working/bioD",
    "data/working/distilbert",
    "data/working/BioC_small",
):
    write_output(model_dir)


03/15/2022 18:46:27 - INFO - __main__ -   Initilazing BaseModel


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]


  0%|          | 0/198 [00:00<?, ?it/s][A
  1%|          | 1/198 [00:01<03:44,  1.14s/it][A
  1%|          | 2/198 [00:02<03:41,  1.13s/it][A
  2%|▏         | 3/198 [00:03<03:39,  1.13s/it][A
  2%|▏         | 4/198 [00:04<03:38,  1.13s/it][A
  3%|▎         | 5/198 [00:05<03:38,  1.13s/it][A
  3%|▎         | 6/198 [00:06<03:37,  1.13s/it][A
  4%|▎         | 7/198 [00:07<03:36,  1.13s/it][A
  4%|▍         | 8/198 [00:09<03:35,  1.13s/it][A
  5%|▍         | 9/198 [00:10<03:34,  1.13s/it][A
  5%|▌         | 10/198 [00:11<03:34,  1.14s/it][A
  6%|▌         | 11/198 [00:12<03:32,  1.14s/it][A
  6%|▌         | 12/198 [00:13<03:31,  1.14s/it][A
  7%|▋         | 13/198 [00:14<03:29,  1.13s/it][A
  7%|▋         | 14/198 [00:15<03:28,  1.13s/it][A
  8%|▊         | 15/198 [00:16<03:27,  1.13s/it][A
  8%|▊         | 16/198 [00:18<03:26,  1.13s/it][A
  9%|▊         | 17/198 [00:19<03:25,  1.13s/it][A
  9%|▉         | 18/198 [00:20<03:24,  1.13s/it][A
 10%|▉         | 19/198 [00:2