In [1]:
from transformers import AutoModelWithLMHead,BertForSequenceClassification, AutoTokenizer, AutoModel,AutoModelForMaskedLM,AutoModelForSequenceClassification
import torch
from torch import nn
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from torch.utils.data import DataLoader,TensorDataset
from transformers import Trainer, TrainingArguments
import pickle
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score,roc_curve
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
new_tokens = ["interstitial", "fibrosis", "tubular", "atrophy","antibody","T-cell"]
tokenizer.add_tokens(new_tokens)

6

In [3]:
model = AutoModelForMaskedLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(29002, 768)

In [4]:
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29002, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
           

In [5]:
data = pd.read_csv("data.csv")
inputs = data["Raw Case Text"].tolist()
input_encoding = tokenizer(inputs,padding="max_length", truncation=True, 
                            return_tensors="pt",max_length=512)
input_encoding['labels'] = input_encoding.input_ids.detach().clone()
input_encoding.keys()
rand = torch.rand(input_encoding.input_ids.shape)
mask_arr = (rand < 0.15) * (input_encoding.input_ids != 101) * \
           (input_encoding.input_ids != 102) * (input_encoding.input_ids != 0)

mask_pos = [torch.flatten(mask_arr[i].nonzero()).tolist() for i in range(input_encoding.input_ids.shape[0])]

for i in range(input_encoding.input_ids.shape[0]):
    input_encoding.input_ids[i, mask_pos[i]] = 103
    
class MaskedDataset(torch.utils.data.Dataset):
    def __init__(self, encoding):
        self.encoding = encoding
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}
    def __len__(self):
        return len(self.encoding.input_ids)

masked_dataset = MaskedDataset(input_encoding)


In [6]:
class MyTrainer(Trainer):
    def create_optimizer(self):
        """
        Setup the optimizer.
        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method in a subclass.
        """
        opt_model = self.model
        if self.optimizer is None:
            decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
            decay_parameters = [name for name in decay_parameters if "bias" not in name]
            optimizer_grouped_parameters = [
                {
                    "params": [p for n, p in opt_model.named_parameters() if n in decay_parameters],
                    "weight_decay": self.args.weight_decay,
                },
                {
                    "params": [p for n, p in opt_model.named_parameters() if n not in decay_parameters],
                    "weight_decay": 0.0,
                },
                {
                    "params": opt_model.bert.embeddings.word_embeddings.weight[-6:],
                    "lr": 1e-3,
                },
            ]
            
            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)

        return self.optimizer


In [7]:

training_args = TrainingArguments(
    output_dir='./mlm_results_largeData_extended_tokenizer_lr',          
    num_train_epochs=4,              
    per_device_train_batch_size=8,  
    #per_device_eval_batch_size=64,   
    #warmup_steps=50,                
    #weight_decay=0.01,                          
    logging_steps=100,
    #evaluation_strateg="steps",
    #eval_steps=100,
    #load_best_model_at_end=True,
    save_steps = 100,
    save_total_limit = 5,
    seed = 0
)


trainer = Trainer(
    model=model,                         
    args=training_args,                 
    train_dataset=masked_dataset,         
)

trainer.train()


***** Running training *****
  Num examples = 3429
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1716
  return {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}


Step,Training Loss
100,0.1793
200,0.0719
300,0.0545
400,0.047
500,0.0336
600,0.0305
700,0.0277
800,0.0311
900,0.0251
1000,0.0193


Saving model checkpoint to ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-100
Configuration saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-100\config.json
Model weights saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-100\pytorch_model.bin
  return {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}
Saving model checkpoint to ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-200
Configuration saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-200\config.json
Model weights saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-200\pytorch_model.bin
  return {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}
Saving model checkpoint to ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-300
Configuration saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-300\config.json
Model weights saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-300\pytorc

Deleting older checkpoint [mlm_results_largeData_extended_tokenizer_lr\checkpoint-800] due to args.save_total_limit
  return {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}
Saving model checkpoint to ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-1400
Configuration saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-1400\config.json
Model weights saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-1400\pytorch_model.bin
Deleting older checkpoint [mlm_results_largeData_extended_tokenizer_lr\checkpoint-900] due to args.save_total_limit
  return {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}
Saving model checkpoint to ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-1500
Configuration saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-1500\config.json
Model weights saved in ./mlm_results_largeData_extended_tokenizer_lr\checkpoint-1500\pytorch_model.bin
Deleting older checkpoint [mlm_resul

KeyboardInterrupt: 