# Training Notebook

In [1]:
!pip install ImageBind/.
!pip install git+https://github.com/huggingface/transformers.git
!pip install "sagemaker>=2.140.0" "transformers==4.26.1" "datasets[s3]==2.10.1" --upgrade
!pip install accelerate==0.20.3

Processing ./ImageBind
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pytorchvideo@ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Using cached pytorchvideo-0.1.5-py3-none-any.whl
Building wheels for collected packages: imagebind
  Building wheel for imagebind (setup.py) ... [?25ldone
[?25h  Created wheel for imagebind: filename=imagebind-0.1.0-py3-none-any.whl size=27969 sha256=ce8877ee6f6638063217f6eb5dc7e46f778b0f855909676d32eba4c46fb1a607
  Stored in directory: /tmp/pip-ephem-wheel-cache-zik8ykv5/wheels/15/d5/57/9f60b1256b436b67dceac96201d8bc2eebd9fd320f633e190f
Successfully built imagebind
Installing collected packages: imagebind
  Attempting uninstall: imagebind
    Found existing installation: imagebind 0.1.0
    Uninstalling imagebind-0.1.0:
      Successfully uninstalled imagebind-0.1.0
Successfully installed imagebind-0.1.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is av

In [2]:
from imagebind import data
import torch
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

import numpy as np
from sagemaker import get_execution_role
import boto3
import pandas as pd
from io import StringIO # Python 3.
from datasets import load_dataset,Dataset,DatasetDict,concatenate_datasets

from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# print bucket contents to see which one we are working with
s3 = boto3.client('s3')
bucket = 'chianglab-dataderivatives'
subfolder = 'mimic-iv-clinical-database-2.2'

response = s3.list_objects_v2(Bucket=bucket)
folders = []
for obj in response['Contents']:
    if obj['Key'].endswith('/'):
        folders.append(obj['Key'])
print(folders)

['mimic-iv-2.2/', 'mimic-iv-clinical-database-demo-2.2/', 'mimic-iv-ed-2.2/', 'mimic-iv-ed-demo-2.2/']


# Load in Data

In [4]:
s3 = boto3.resource('s3')
bucket_name = 'chianglab-dataderivatives'
file_path = "mimic-iv-ed-2.2/text_repr.json"

# loading in raw data
content_object = s3.Object(bucket_name, file_path)
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
df = pd.DataFrame(json_content).T
print("length of dataframe: "+ str(len(df)))
df.head(5)

length of dataframe: 400019


Unnamed: 0,arrival,eddischarge,admission,discharge,eddischarge_category,triage,medrecon,vitals,codes,pyxis
33258284,"Patient 10000032, a 52 year old white female, ...",The ED disposition was admitted at 2180-05-06 ...,The patient was admitted at 2180-05-06 22:23:00.,The patient's discharge disposition was: home ...,a d m i t t e d,"At triage: temperature was 98.4, pulse was 70....",The patient was previously taking the followin...,The patient had the following vitals: At 2180-...,The patient received the following diagnostic ...,
38112554,"Patient 10000032, a 52 year old white female, ...",The ED disposition was admitted at 2180-06-26 ...,The patient was admitted at 2180-06-26 18:27:00.,The patient's discharge disposition was: home ...,a d m i t t e d,"At triage: temperature was 98.9, pulse was 88....",The patient was previously taking the followin...,The patient had the following vitals: At 2180-...,The patient received the following diagnostic ...,The patient received the following medications...
35968195,"Patient 10000032, a 52 year old white female, ...",The ED disposition was admitted at 2180-08-06 ...,The patient was admitted at 2180-08-05 23:44:00.,The patient's discharge disposition was: hospi...,a d m i t t e d,"At triage: temperature was 99.4, pulse was 105...",The patient was previously taking the followin...,The patient had the following vitals: At 2180-...,The patient received the following diagnostic ...,The patient received the following medications...
32952584,"Patient 10000032, a 52 year old white female, ...",The ED disposition was home at 2180-07-23 05:5...,The patient was admitted at 2180-07-23 12:35:00.,The patient's discharge disposition was: home ...,h o m e,"At triage: temperature was 97.8, pulse was 87....",The patient was previously taking the followin...,The patient had the following vitals: At 2180-...,The patient received the following diagnostic ...,The patient received the following medications...
39399961,"Patient 10000032, a 52 year old white female, ...",The ED disposition was admitted at 2180-07-23 ...,The patient was admitted at 2180-07-23 12:35:00.,The patient's discharge disposition was: home ...,a d m i t t e d,"At triage: temperature was 98.7, pulse was 77....",The patient was previously taking the followin...,The patient had the following vitals: At 2180-...,The patient received the following diagnostic ...,The patient received the following medications...


### fill in missing entries 

In [5]:
# check which columns have nans
df.columns[df.isna().any()].tolist()

['medrecon', 'vitals', 'codes', 'pyxis']

In [6]:
# fix missing entries
df['medrecon'] = df['medrecon'].fillna("The patient was previously not taking any medications.")
df['pyxis'] = df['pyxis'].fillna("The patient did not receive any medications.")
df['vitals'] = df['vitals'].fillna("The patient had no vitals recorded")
df['codes'] = df['codes'].fillna("The patient received no diagnostic codes")

# Split Test & Training and write out Patients IDs for Reproducibility

In [7]:
# split the train and test split with seed so it is always the same
def train_validate_test_split(df, train_percent=.7, validate_percent=.15, seed=7):
    np.random.seed(seed) # set seed for reproducibility sake
    df = df.reset_index()
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    train = train.set_index('index')
    validate = validate.set_index('index')
    test = test.set_index('index')
    return train, validate, test

train, validate, test = train_validate_test_split(df, seed=7)
train2, validate2, test2 = train_validate_test_split(df, seed=7)
print("70% Train:",len(train), "\n30% Test:",len(validate+test))

70% Train: 280013 
30% Test: 120006


In [8]:
import string
# extracts the patient ID's from the arrival column 
train_patients = train.arrival.astype(str).str.split().str[1].to_list()
train_patients2 = train2.arrival.astype(str).str.split().str[1].to_list()
test_patients = test.arrival.astype(str).str.split().str[1].to_list()
validate_patients = validate.arrival.astype(str).str.split().str[1].to_list()
test_patients = (test_patients+validate_patients)

# Sanity Check: checking if seed works by seeing if training sets are equal when called two separate times for future reproducibility
train_patients.sort()
train_patients2.sort()

train_patients = [''.join(char for char in item if char not in string.punctuation) for item in train_patients]
train_patients2 = [''.join(char for char in item if char not in string.punctuation) for item in train_patients2]
test_patients = [''.join(char for char in item if char not in string.punctuation) for item in test_patients]
 
# using == to check if lists are equal
if train_patients == train_patients2:
    print("The lists are identical")
else:
    print("The lists are not identical")

# free up memory by deleting
del train2
del validate2
del test2

# remove duplicates    
train_patients = set(train_patients)
test_patients = set(test_patients)

# write patient ID's into txt files for lookup purposes later in case there are dependency issues in the future that modify seeding
file = open('./models/data/train_patients.txt','w')
for patient in train_patients:
	file.write(patient+"\n")
file.close()
print("stored patient IDs into: ./models/data/train_patients.txt")

file = open('./models/data/test_patients.txt','w')
for patient in test_patients:
	file.write(patient+"\n")
file.close()
print("stored patient IDs into: ./models/data/test_patients.txt")


The lists are identical
stored patient IDs into: ./models/data/train_patients.txt
stored patient IDs into: ./models/data/test_patients.txt


# Tokenize our Corpus

We will use the biobert tokenizer to ensure that it covers some biomedical terminology found in both MIMIC and PubMed databases

In [9]:
# feed it into a custom tokenizer but first need to make a Dataset Object for transformers
disposition_train = train.eddischarge_category
train = train.drop("eddischarge_category",axis=1)
stacked_train = train.stack().to_frame("headline")
print("Train Stacked")
display(stacked_train.head(10))

disposition_test = test.eddischarge_category
test = test.drop("eddischarge_category",axis=1)
stacked_test = test.stack().to_frame("headline")
print("Test Stacked")
display(stacked_test.head(10))

Train Stacked


Unnamed: 0_level_0,Unnamed: 1_level_0,headline
index,Unnamed: 1_level_1,Unnamed: 2_level_1
36706089,arrival,"Patient 17100154, a 19 year old white female, ..."
36706089,eddischarge,The ED disposition was admitted at 2130-11-06 ...
36706089,admission,The patient was admitted at 2130-11-06 04:56:00.
36706089,discharge,The patient's discharge disposition was: unkno...
36706089,triage,"At triage: temperature was 97.6, pulse was 93...."
36706089,medrecon,The patient was previously taking the followin...
36706089,vitals,The patient had the following vitals: At 2130-...
36706089,codes,The patient received the following diagnostic ...
36706089,pyxis,The patient received the following medications...
38867343,arrival,"Patient 17533742, a 42 year old black/african ..."


Test Stacked


Unnamed: 0_level_0,Unnamed: 1_level_0,headline
index,Unnamed: 1_level_1,Unnamed: 2_level_1
39515874,arrival,"Patient 10264068, a 55 year old hispanic or la..."
39515874,eddischarge,The ED disposition was home at 2124-10-31 14:1...
39515874,admission,The patient was admitted at 2124-10-31 03:40:00.
39515874,discharge,The patient's discharge disposition was: unkno...
39515874,triage,"At triage: temperature was 98.5, pulse was 71...."
39515874,medrecon,The patient was previously not taking any medi...
39515874,vitals,The patient had no vitals recorded
39515874,codes,The patient received the following diagnostic ...
39515874,pyxis,The patient received the following medications...
34432494,arrival,"Patient 18457617, a 31 year old white female, ..."


In [10]:
training_data_corpus = Dataset.from_pandas(stacked_train)
testing_data_corpus = Dataset.from_pandas(stacked_test)
training_data_corpus

Dataset({
    features: ['headline', 'index', '__index_level_1__'],
    num_rows: 2520117
})

### If we want to use an exisiting tokenizer

In [11]:
# Load tokenizer trained from biobert + pubmed200kRCT dataset
run = False

if run:
    tokenizer = AutoTokenizer.from_pretrained("pritamdeka/BioBert-PubMed200kRCT")
    def encode_with_truncation(examples):
      """Mapping function to tokenize the sentences passed with truncation"""
      return tokenizer(examples["headline"], truncation=True, padding="max_length",
                        max_length=512, return_special_tokens_mask=True)

    def encode_without_truncation(examples):
      """Mapping function to tokenize the sentences passed without truncation"""
      return tokenizer(examples["headline"], return_special_tokens_mask=True)

In [12]:
# tokenizing input
#
# Implementation: I tokenized without truncation so then we aren't cutting off our input from parts of the EHR for training
if run:
    truncate = False
    if truncate:
        print("Tokenizing with truncation")
        train_data_tokenized = training_data_corpus.map(encode_without_truncation, batched=True)
        train_data_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
        print("Tokenizing Complete")
    else:
        print("Tokenizing without truncation")
        train_data_tokenized = training_data_corpus.map(encode_without_truncation, batched=True)
        train_data_tokenized.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
        print("Tokenizing Complete")

In [13]:
from itertools import chain

if run:
    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
    # max_seq_length.
    # grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= max_length:
            total_length = (total_length // max_length) * max_length
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
            for k, t in concatenated_examples.items()
        }
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
    # might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    max_length = 512
    if not truncate:
        train_dataset = train_data_tokenized.map(group_texts, batched=True, desc=f"Grouping texts in chunks of {max_length}")
        # convert them from lists to torch tensors
        train_dataset.set_format("torch")

    print("data preprocessing is finished")


# Creating our own Tokenizer for RoBERTa Model

Why Pretrain a RoBERTa over a traditional BERT:

*"This way, in BERT, the masking is performed only once at data preparation time, and they basically take each sentence and mask it in 10 different ways. Therefore, at training time, the model will only see those 10 variations of each sentence.*

*On the other hand, in RoBERTa, the masking is done during training. Therefore, each time a sentence is incorporated in a minibatch, it gets its masking done, and therefore the number of potentially different masked versions of each sentence is not bounded like in BERT."*

In [15]:
txt_files_dir = "./models/data/text_split/"
!rm -rf {txt_files_dir}
!mkdir {txt_files_dir}

In [16]:
import os
from tqdm import tqdm
# if you want to train the tokenizer from scratch (especially if you have custom
# dataset loaded as datasets object), then run this cell to save it as files
# but if you already have your custom data as text files, there is no point using this
def column_to_files(column, txt_files_dir,output_filename="train.txt"):
    # The prefix is a unique ID to avoid to overwrite a text file
    i=1
    counter = 0
    #For every value in the df, with just one column
    for row in tqdm(column.to_list()):
      # Create the filename using the prefix ID
        if i % 1000 == 1:
            file_name = os.path.join(txt_files_dir, str(counter)+'.txt')
            f = open(file_name, 'wb')
        try:
            f.write(row.encode('utf-8'))
            if i % 1000 == 0:
                f.close()
                counter += 1
        except Exception as e:  #catch exceptions(for eg. empty rows)
            print(row, e) 
        i+=1
    # Return the last ID
    return counter

In [17]:
# Get the training data
training_data = stacked_train["headline"]
# Removing the end of line character \n
training_data = training_data.replace("\n"," ")
# Create a file for every description value
train_num_files = column_to_files(training_data, txt_files_dir, output_filename="train.txt")
print("Turned train dataset into txt file")

# # Get the test data
# test_data = stacked_test["headline"]
# # Removing the end of line character \n
# test_data = test_data.replace("\n"," ")
# # Create a file for every description value
# test_num_files = column_to_files(test_data, txt_files_dir, output_filename="test.txt")
# print("Turned test dataset into txt file")

100%|██████████| 2520117/2520117 [01:11<00:00, 35050.53it/s]


Turned train dataset into txt file


In [18]:
tokenizer_dir = "./models/data/TokenizerRoBERTa"
!rm -rf {tokenizer_dir}
!mkdir {tokenizer_dir}

In [19]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path

paths = [str(x) for x in Path(".").glob("./models/data/text_split/*.txt")]
print("Loaded Dataset:", str(len(paths)))
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)

print("Train")
# Customize training

# we choose a vocab_size of 30,522 to reduce the OOV tokens which may commonly be found in Medical Terminology
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,
                show_progress=True,
                special_tokens=[
                                "<s>",
                                "<pad>",
                                "</s>",
                                "<unk>",
                                "<mask>",
])
#Save the Tokenizer to disk
tokenizer.save_model(tokenizer_dir)

Loaded Dataset: 2521
Train


['./models/data/TokenizerRoBERTa/vocab.json',
 './models/data/TokenizerRoBERTa/merges.txt']

In [20]:
# remove the batched dataset because we don't want to accidently upload all that data
!rm -rf {txt_files_dir}

Making sure Tokenizer works

In [21]:
# Create the tokenizer using vocab.json and mrege.txt files
tokenizer = ByteLevelBPETokenizer(
    os.path.abspath(os.path.join(tokenizer_dir,'vocab.json')),
    os.path.abspath(os.path.join(tokenizer_dir,'merges.txt'))
)
# Prepare the tokenizer
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
# Test the tokenizer
tokenizer.encode("The patient was previously not taking any medications.")
# Show the tokens created
tokenizer.encode("The patient was previously not taking any medications.").tokens

['<s>',
 'T',
 'he',
 'Ġpatient',
 'Ġwas',
 'Ġpreviously',
 'Ġnot',
 'Ġtaking',
 'Ġany',
 'Ġmedications',
 '.',
 '</s>']

# Training the model

In [22]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

# Set a configuration for our RoBERTa model
config = RobertaConfig(
    vocab_size=30522,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize the model from a configuration without pretrained weights
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  66987834


In [24]:
from transformers import RobertaTokenizerFast
# Create the tokenizer from our trained one
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_dir, max_len=512)

In [25]:
tokenizer

RobertaTokenizerFast(name_or_path='./models/data/TokenizerRoBERTa', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [26]:
def encode_with_truncation(examples):
  """Mapping function to tokenize the sentences passed with truncation"""
  return tokenizer(examples["headline"], truncation=True, padding="max_length",
                    max_length=512, return_special_tokens_mask=True)

def encode_without_truncation(examples):
  """Mapping function to tokenize the sentences passed without truncation"""
  return tokenizer(examples["headline"], return_special_tokens_mask=True)

In [27]:
truncate = False
if truncate:
    print("Tokenizing with truncation")
    train_data_tokenized = training_data_corpus.map(encode_without_truncation, batched=True)
    train_data_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
    print("Tokenizing Complete")
else:
    print("Tokenizing without truncation")
    train_data_tokenized = training_data_corpus.map(encode_without_truncation, batched=True)
    train_data_tokenized.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
    print("Training Tokenizing Complete")

Tokenizing without truncation


Map:   0%|          | 0/2520117 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (778 > 512). Running this sequence through the model will result in indexing errors
                                                                        

Training Tokenizing Complete




In [28]:
truncate = False
if truncate:
    print("Tokenizing with truncation")
    train_data_tokenized = training_data_corpus.map(encode_without_truncation, batched=True)
    train_data_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
    print("Tokenizing Complete")
else:
    print("Tokenizing without truncation")
    test_data_tokenized = testing_data_corpus.map(encode_without_truncation, batched=True)
    test_data_tokenized.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
    print("Testing Tokenizing Complete")

Tokenizing without truncation


                                                                      

Testing Tokenizing Complete




In [29]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

max_length = 512
if not truncate:
    train_dataset = train_data_tokenized.map(group_texts, batched=True, desc=f"Grouping texts in chunks of {max_length}")
    # convert them from lists to torch tensors
    train_dataset.set_format("torch")
    test_dataset = test_data_tokenized.map(group_texts, batched=True, desc=f"Grouping texts in chunks of {max_length}")
    # convert them from lists to torch tensors
    test_dataset.set_format("torch")

print("data preprocessing is finished")

                                                                                                   

data preprocessing is finished




In [36]:
eval_dataset = test_dataset

In [37]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer):
        # or use the RobertaTokenizer from `transformers` directly.
        self.examples = []
        # For every value in the dataframe 
        for example in tqdm(df.values):
            # 
            x=tokenizer.encode_plus(example,max_length = 512, truncation=True, padding=True)
            self.examples += [x.input_ids]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [38]:
# Create the train and evaluation dataset
if run:
    train_dataset = CustomDataset(stacked_train['headline'], tokenizer)
    eval_dataset = CustomDataset(stacked_test['headline'], tokenizer)

# Defining the data collator

In [39]:
from transformers import DataCollatorForLanguageModeling

# Define the Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Intialize and train the Trainer 

In [44]:
TRAIN_BATCH_SIZE = 32    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 32    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 5        # number of epochs to train (default: 10)
LEARNING_RATE = 1e-4    # learning rate (default: 0.001)
WEIGHT_DECAY = 0.01
SEED = 42               # random seed (default: 42)
MAX_LEN = 128
SUMMARY_LEN = 7

In [45]:
model_dir = "./models/EHR-RoBERTa"
!rm -rf {model_dir}
!mkdir {model_dir}

In [46]:
from transformers import Trainer, TrainingArguments
print(model_dir)

# Define the training arguments
training_args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True,
    evaluation_strategy = 'steps',
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    logging_steps=5000,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=5000,
    #eval_steps=4096,
    save_total_limit=1,
)
# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #prediction_loss_only=True,
)

using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


./models/EHR-RoBERTa


#### Train the model

In [47]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask, __index_level_1__, index, headline. If special_tokens_mask, __index_level_1__, index, headline are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1201829
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 375580
  Number of trainable parameters = 66987834


Step,Training Loss,Validation Loss


In [None]:
trainer.save_model(model_folder)

#### Perplexity, AUROC, F1

In [None]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")