In [1]:
from transformers import AutoModelWithLMHead,BertForSequenceClassification, AutoTokenizer,AutoModelForQuestionAnswering, AutoModel,AutoModelForMaskedLM,AutoModelForSequenceClassification
import torch
from torch import nn
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from torch.utils.data import DataLoader,TensorDataset
from transformers import Trainer, TrainingArguments
import pickle
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score,roc_curve
import pandas as pd
import matplotlib.pyplot as plt

from transformers import AdamW,get_scheduler


In [2]:
# Tokenizer for the BioBERT checkpoint. This cell and the two cells after it
# all rebind the same global name `tokenizer`, so the tokenizer seen by later
# cells is whichever of these cells was executed last.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

In [2]:
# Alternative tokenizer (vanilla cased BERT). Rebinds the global `tokenizer`
# that the neighbouring tokenizer cells also assign; run exactly one of them
# before defining `gen_datasets`, whose default argument captures the value.
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# qa_kidneyBert.to("cuda")

In [2]:
# Bio_ClinicalBERT tokenizer extended with six domain terms. Rebinds the global
# `tokenizer` assigned by the two preceding tokenizer cells.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Renal-pathology vocabulary to register as whole tokens.
new_tokens = ["interstitial", "fibrosis", "tubular", "atrophy","antibody","T-cell"]
# The recorded cell output (6) is the value returned by add_tokens.
# NOTE(review): a model used with this extended tokenizer presumably needs a
# matching (resized) embedding matrix -- confirm for every checkpoint below.
tokenizer.add_tokens(new_tokens)

6

In [3]:
def gen_datasets(q,train_text,test_text,tokenizer=tokenizer,train_labels=None,test_labels=None):
    """Tokenize (question, report) pairs and wrap them as QA datasets.

    Parameters
    ----------
    q : str
        The question; the same string is paired with every report.
    train_text, test_text : list of str
        Report texts for the two splits.
    tokenizer : callable
        Tokenizer used for encoding. The default is the global ``tokenizer``
        as bound when this function was defined.
    train_labels, test_labels : sequence, optional
        Answer labels passed to ``RenalDataset`` for each split. When omitted,
        the notebook-level globals ``train_labels`` / ``test_labels`` are used,
        which is what the original implementation always did.

    Returns
    -------
    (RenalDataset, RenalDataset)
        The train and test datasets, in that order.
    """
    # Backward-compatible fallback to the notebook globals; passing the labels
    # explicitly avoids depending on hidden kernel state.
    if train_labels is None:
        train_labels = globals()["train_labels"]
    if test_labels is None:
        test_labels = globals()["test_labels"]

    def _encode(texts):
        # Every sequence is padded/truncated to 512 tokens; offsets are kept so
        # that character-level answer spans can be mapped onto tokens later.
        return tokenizer([q] * len(texts), texts, padding="max_length", truncation=True,
                         return_tensors="pt", max_length=512, return_offsets_mapping=True)

    train_dataset = RenalDataset(_encode(train_text), train_labels)
    test_dataset = RenalDataset(_encode(test_text), test_labels)
    return train_dataset,test_dataset

In [4]:
class RenalDataset(torch.utils.data.Dataset):
    """Extractive-QA dataset that turns character-level answer spans into
    token-level ``start_positions`` / ``end_positions``.

    Parameters
    ----------
    encodings : mapping
        Tokenizer output; must provide ``input_ids``, ``token_type_ids`` and
        ``offset_mapping`` (indexable per example), plus any other model inputs.
    labels : sequence
        One ``(start_char, end_char)`` pair per example. ``end_char == 0``
        marks an example without an answer.
    task_name : optional
        Stored on the instance; not used by this class.
    cls_token_id : int, optional
        Id of the [CLS] token. When omitted, the notebook-level global
        ``tokenizer.cls_token_id`` is looked up at access time (original
        behaviour).
    """

    def __init__(self, encodings, labels,task_name=None,cls_token_id=None):
        self.encodings = encodings
        self.answers = labels
        self.task_name = task_name
        self.cls_token_id = cls_token_id

    def __getitem__(self, idx):
        """Return the model inputs for example ``idx``.

        ``offset_mapping`` is removed from the returned dict and
        ``start_positions`` / ``end_positions`` are added as 0-d tensors.
        Examples without an answer, or whose answer is not fully inside the
        (possibly truncated) context, point both positions at [CLS].
        """
        inputs = {key: val[idx] for key, val in self.encodings.items()}
        answer = self.answers[idx]
        offsets = inputs.pop("offset_mapping")
        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]

        cls_id = self.cls_token_id if self.cls_token_id is not None else tokenizer.cls_token_id
        cls_index = list(input_ids).index(cls_id)

        # Default: "no answer" is encoded as the [CLS] position.
        start_position = end_position = cls_index

        if answer[1] != 0:
            start_char = answer[0]
            end_char = answer[1]

            # First context token: first position whose segment id is 1.
            context_start = 0
            while token_type_ids[context_start] != 1:
                context_start += 1

            # Last context token: skip trailing padding / [SEP], whose offset
            # end is 0.
            context_end = len(input_ids) - 1
            while offsets[context_end][1] == 0:
                context_end -= 1

            if offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char:
                # Walk forward to the last context token starting at or before
                # start_char. The scan is bounded by context_end: [SEP] and
                # padding have offset (0, 0) and would otherwise satisfy the
                # condition, dragging the start to the end of the sequence
                # whenever the answer begins in the last context token.
                token_start_index = context_start
                while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_position = token_start_index - 1

                # Walk backward to the first token ending at or after end_char.
                token_end_index = context_end
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_position = token_end_index + 1

        inputs["start_positions"] = torch.tensor(start_position)
        inputs["end_positions"] = torch.tensor(end_position)
        return inputs

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.answers)
    

def compute_metrics(p):
    """Exact-match accuracy of the predicted answer spans.

    ``p`` unpacks into ``(predictions, labels)``, where ``predictions`` is the
    pair of start/end score arrays. Predictions are compared against the
    notebook-level ``test_ans`` (reference strings) and ``test_ids`` (token
    ids), decoded with the global ``tokenizer``. Returns ``{"accuracy": ...}``.
    """
    pred, labels = p
    start_scores, end_scores = pred

    # Highest-scoring start token, and one past the highest-scoring end token.
    span_starts = np.argmax(start_scores, axis=1)
    span_ends = np.argmax(end_scores, axis=1) + 1

    n_seen = 0
    n_right = 0
    for begin, stop, truth, token_ids in zip(span_starts, span_ends, test_ans, test_ids):
        n_seen += 1
        decoded = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(token_ids[begin:stop])
        )
        # A span covering only position 0 is the "no answer" prediction.
        if (begin == 0 and stop == 1 and truth == "") or decoded.lower() == truth.lower():
            n_right += 1

    return {"accuracy": n_right / n_seen}
    
#     accuracy = accuracy_score(y_true=labels, y_pred=pred)
#     recall = recall_score(y_true=labels, y_pred=pred,average="micro")
#     precision = precision_score(y_true=labels, y_pred=pred,average="micro")
#     f1 = f1_score(y_true=labels, y_pred=pred,average="micro")
#     print("accuracy: {}, precision: {}, recall: {}, f1: {}".format(accuracy,precision,recall,f1))
#     return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1} 

## Load ABMR data

In [5]:
batch_size = 12


# Reports and their ABMR answer spans / class labels.
data = pd.read_csv("data.csv")
inputs1 = data["train_report_qa"].tolist()
label1 = data["abmr_pos_qa"].tolist()
label_class_help = data["abmr_class"].tolist()

# Rows without a report stringify to "nan"; drop them from every parallel list.
has_report = [str(report) != "nan" for report in inputs1]
# NOTE(security): eval() executes whatever expression the CSV cell contains.
# Only run this on a trusted data.csv; if the column holds plain literals,
# ast.literal_eval would be the safe replacement -- confirm before switching.
label = [eval(span) for span, keep in zip(label1, has_report) if keep]
inputs = [report for report, keep in zip(inputs1, has_report) if keep]
label_class = [klass for klass, keep in zip(label_class_help, has_report) if keep]


# Split stratified on the class label; the span labels travel with the texts.
train_text, test_text, train_labels, test_labels = train_test_split(
    inputs, label,random_state = 1,stratify=label_class,test_size=0.2)


q_abmr = "How is the antibody-mediated rejection?"
# gen_datasets reads the global train_labels / test_labels assigned just above.
train_dataset,test_dataset = gen_datasets(q_abmr,train_text,test_text)
abmr_train_loader = torch.utils.data.DataLoader(train_dataset,batch_size = batch_size,shuffle=True)
abmr_test_loader = torch.utils.data.DataLoader(test_dataset,batch_size = batch_size)

# Reference answer strings, sliced out of each test report by character span.
test_ans = [report[span[0]:span[1]] for report, span in zip(test_text, test_labels)]

# Token ids of the whole test set in loader order. Concatenating once keeps
# the integer dtype (seeding with an empty float tensor promoted the ids to
# float) and avoids re-copying the growing tensor for every batch.
test_ids = torch.cat([batch["input_ids"] for batch in abmr_test_loader], 0)
    

In [6]:
import difflib

def get_overlap_ratio(s1, s2):
    """Coverage of reference ``s2`` by its longest contiguous match with ``s1``.

    Parameters
    ----------
    s1 : str
        Candidate (predicted) string.
    s2 : str
        Reference string; both ratios are normalised by it.

    Returns
    -------
    (float, float)
        ``(char_ratio, word_ratio)``: length of the longest common substring
        divided by ``len(s2)``, and the number of whitespace-separated words
        in that substring divided by the number of words in ``s2``. A ratio is
        ``0.0`` when its denominator would be zero (empty or whitespace-only
        reference) instead of raising ``ZeroDivisionError``.
    """
    # autojunk=False: for sequences of 200+ items the default heuristic treats
    # frequent characters as junk and ignores them, which silently shrinks the
    # longest match for long reference answers.
    matcher = difflib.SequenceMatcher(None, s1, s2, autojunk=False)
    start_a, _, size = matcher.find_longest_match(0, len(s1), 0, len(s2))
    shared = s1[start_a:start_a + size]

    reference_words = s2.split()
    char_ratio = size / len(s2) if s2 else 0.0
    word_ratio = len(shared.split()) / len(reference_words) if reference_words else 0.0
    return char_ratio, word_ratio

def compute_metrics(p):
    """Span-level evaluation metrics for the QA models.

    ``p`` unpacks into ``(predictions, labels)``; ``predictions`` is the pair
    of start/end score arrays and ``labels`` is not used. Predictions are
    compared with the notebook-level ``test_ans`` (reference strings) and
    ``test_ids`` (token ids), decoded with the global ``tokenizer``.

    Returns a dict with:
      * ``accuracy``: share of examples that are either correctly predicted
        as "no answer" or match the reference exactly;
      * ``accuracy_info``: share of examples with an exact, non-empty match;
      * ``overlap_ratio_char`` / ``overlap_ratio_word``: mean of the
        ``get_overlap_ratio`` values over examples where both the prediction
        and the reference contain an answer; NaN when there are none.
    """
    pred, _labels = p

    answer_start_scores, answer_end_scores = pred
    # Most likely start token, and one past the most likely end token.
    answer_start = np.argmax(answer_start_scores, axis=1)
    answer_end = np.argmax(answer_end_scores, axis=1)+1

    def _normalise(text):
        # Same normalisation on both sides: the decoded prediction never
        # contains line breaks, so a reference that keeps its '\n' could
        # never compare equal to it.
        return text.lower().replace('\n', ' ')

    def _mean_or_nan(values):
        # np.mean([]) yields NaN but also emits "Mean of empty slice"
        # RuntimeWarnings; return the same NaN without the noise.
        return float(np.mean(values)) if values else float("nan")

    total = 0
    correct,correct_with_info = 0,0
    overlap_ratio_char = []
    overlap_ratio_word = []
    for s,e,t,ids in zip(answer_start,answer_end,test_ans,test_ids):
        total += 1
        # A span covering only position 0 is the "no answer" prediction.
        predicted_no_answer = s == 0 and e == 1
        if predicted_no_answer:
            if t == "":
                correct += 1
            continue
        if t == "":
            continue

        # Decode only when both sides carry an answer.
        pred_ans = _normalise(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(ids[s:e])))
        reference = _normalise(t)
        if pred_ans == reference:
            correct_with_info += 1
        char_ratio,word_ratio = get_overlap_ratio(pred_ans,reference)
        overlap_ratio_char.append(char_ratio)
        overlap_ratio_word.append(word_ratio)

    return {"accuracy": (correct+correct_with_info)/total,"accuracy_info": correct_with_info/total,
            "overlap_ratio_char":_mean_or_nan(overlap_ratio_char),"overlap_ratio_word":_mean_or_nan(overlap_ratio_word)}

In [7]:
# qa_abmr_kidneyBert = AutoModelForQuestionAnswering.from_pretrained("./qaabmr_fine_mlm_largeData_pos/checkpoint-600")
# qa_abmr_kidneyBert_trainer = Trainer(qa_abmr_kidneyBert) 
# raw_pred,_,_=qa_abmr_kidneyBert_trainer.predict(test_dataset) 

In [8]:
# Fine-tune BioBERT for extractive QA on the train/test datasets currently in scope.
qa_bioBert = AutoModelForQuestionAnswering.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1"
)

# Single cadence (in optimizer steps) for logging, evaluation and checkpointing.
steps = 50

training_args = TrainingArguments(
    output_dir='./qaabmr_fine_biobert_pos',
    seed=0,
    # optimisation
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=1e-2,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / evaluation / checkpointing
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=1,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=qa_bioBert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized f

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
50,3.7024,0.079623,0.967883,0.0,,
100,0.0492,0.049821,0.967883,0.0,0.0,0.0
150,0.0651,0.036073,0.966423,0.0,0.112437,0.100682
200,0.055,0.018535,0.967883,0.0,0.295811,0.377348
250,0.0309,0.012556,0.967883,0.0,0.366086,0.480758
300,0.0175,0.017407,0.967883,0.0,0.36342,0.460758
350,0.0144,0.021029,0.967883,0.0,0.36342,0.460758
400,0.0137,0.018386,0.967883,0.0,0.36342,0.460758
450,0.0022,0.019221,0.967883,0.0,0.518578,0.666667
500,0.0078,0.022519,0.966423,0.0,0.36342,0.460758


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qaabmr_fine_biobert_pos\checkpoint-50
Configuration saved in ./qaabmr_fine_biobert_pos\checkpoint-50\config.json
Model weights saved in ./qaabmr_fine_biobert_pos\checkpoint-50\pytorch_model.bin
Deleting older checkpoint [qaabmr_fine_biobert_pos\checkpoint-650] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_biobert_pos\checkpoint-100
Configuration saved in ./qaabmr_fine_biobert_pos\checkpoint-100\config.json
Model weights saved in ./qaabmr_fine_biobert_pos\checkpoint-100\pytorch_model.bin
Deleting older checkpoint [qaabmr_fine_biobert_pos\checkpoint-1150] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_biobert_pos\checkpo

KeyboardInterrupt: 

In [8]:
# Fine-tune the locally pre-trained extended-tokenizer checkpoint for extractive QA
# on the train/test datasets currently in scope.
qa_exkidBert = AutoModelForQuestionAnswering.from_pretrained(
    "./mlm_results_largeData_extended_tokenizer/checkpoint-1100"
)

# Single cadence (in optimizer steps) for logging, evaluation and checkpointing.
steps = 50

training_args = TrainingArguments(
    output_dir='./qaabmr_fine_exkidbert_pos',
    seed=0,
    # optimisation
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=1e-2,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / evaluation / checkpointing
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=1,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=qa_exkidBert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of the model checkpoint at ./mlm_results_largeData_extended_tokenizer/checkpoint-1100 were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./mlm_

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
50,3.7304,0.095306,0.967883,0.0,,
100,0.0713,0.075608,0.967883,0.0,,
150,0.1351,0.051657,0.967883,0.0,,
200,0.0615,0.025861,0.966423,0.0,0.366086,0.480758
250,0.0426,0.025303,0.967883,0.0,0.604167,0.833333
300,0.0332,0.017794,0.967883,0.0,0.332753,0.447424
350,0.0256,0.014829,0.967883,0.0,0.332753,0.447424
400,0.0225,0.01473,0.966423,0.0,0.229154,0.281591
450,0.0146,0.010389,0.967883,0.0,0.36342,0.460758
500,0.0115,0.017335,0.967883,0.0,0.263811,0.317348


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qaabmr_fine_exkidbert_pos\checkpoint-50
Configuration saved in ./qaabmr_fine_exkidbert_pos\checkpoint-50\config.json
Model weights saved in ./qaabmr_fine_exkidbert_pos\checkpoint-50\pytorch_model.bin
Deleting older checkpoint [qaabmr_fine_exkidbert_pos\checkpoint-700] due to args.save_total_limit
Deleting older checkpoint [qaabmr_fine_exkidbert_pos\checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qaabmr_fine_exkidbert_pos\checkpoint-100
Configuration saved in ./qaabmr_fine_exkidbert_pos\checkpoint-100\config.json
Model weights saved in ./qaabmr_fine_exkidbert_pos\checkpoint-100\pytorch_model.bin
Deleting older checkpoint 

KeyboardInterrupt: 

In [8]:
# Fine-tune vanilla cased BERT for extractive QA on the train/test datasets
# currently in scope.
qa_vanBert = AutoModelForQuestionAnswering.from_pretrained(
    "bert-base-cased"
)

# Single cadence (in optimizer steps) for logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaabmr_fine_vanbert_pos',
    seed=0,
    # optimisation
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=1e-2,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / evaluation / checkpointing
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=3,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=qa_vanBert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,1.4137,0.076403,0.967883,0.0,,
200,0.085,0.069766,0.967883,0.0,,
300,0.0638,0.033845,0.967883,0.0,,
400,0.0414,0.023165,0.967883,0.0,0.0,0.0
500,0.0348,0.031906,0.963504,0.0,0.324936,0.395139
600,0.0348,0.026727,0.967883,0.0,0.0,0.0
700,0.0174,0.021674,0.967883,0.0,0.442083,0.616667
800,0.0117,0.011577,0.967883,0.0,0.36342,0.460758
900,0.018,0.010515,0.967883,0.0,0.283221,0.375758
1000,0.0047,0.008912,0.967883,0.0,0.366086,0.480758


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qaabmr_fine_vanbert_pos\checkpoint-100
Configuration saved in ./qaabmr_fine_vanbert_pos\checkpoint-100\config.json
Model weights saved in ./qaabmr_fine_vanbert_pos\checkpoint-100\pytorch_model.bin
Deleting older checkpoint [qaabmr_fine_vanbert_pos\checkpoint-1300] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qaabmr_fine_vanbert_pos\checkpoint-200
Configuration saved in ./qaabmr_fine_vanbert_pos\checkpoint-200\config.json
Model weights saved in ./qaabmr_fine_vanbert_pos\checkpoint-200\pytorch_model.bin
Deleting older checkpoint [qaabmr_fine_vanbert_pos\checkpoint-1400] due to args.save_total_limit
***** Running Evaluation *****
  Num e

KeyboardInterrupt: 

In [9]:
# Continue fine-tuning the local BioASQ QA checkpoint on the train/test datasets
# currently in scope.
qa_bioasq_kidneyBert = AutoModelForQuestionAnswering.from_pretrained(
    "qa_bioasq_fine_kidneybert/checkpoint-150"
)

# Single cadence (in optimizer steps) for logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaabmr_fine_bioasq_kidneyBert',
    seed=0,
    # optimisation
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=1e-2,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / evaluation / checkpointing
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=qa_bioasq_kidneyBert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

***** Running training *****
  Num examples = 2738
  Num Epochs = 15
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 3435


Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,0.0994,0.037542,0.967883,0.0,0.041667,0.166667
200,0.0317,0.019869,0.966423,0.0,0.242586,0.294091
300,0.022,0.015032,0.966423,0.0,0.36342,0.460758
400,0.0139,0.011778,0.966423,0.0,0.36342,0.460758
500,0.0021,0.01336,0.967883,0.0,0.36342,0.460758
600,0.0009,0.018384,0.967883,0.0,0.36342,0.460758


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_bioasq_kidneyBert\checkpoint-100
Configuration saved in ./qaabmr_fine_bioasq_kidneyBert\checkpoint-100\config.json
Model weights saved in ./qaabmr_fine_bioasq_kidneyBert\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_bioasq_kidneyBert\checkpoint-200
Configuration saved in ./qaabmr_fine_bioasq_kidneyBert\checkpoint-200\config.json
Model weights saved in ./qaabmr_fine_bioasq_kidneyBert\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_bioasq_kidneyBert\checkpoint-300
Configuration saved in ./qaabmr_fine_bioasq_kidneyBert\checkpoint-300\config.json
Model weights saved in ./qaabmr_fine_bioasq_kidneyBert\checkpoint-300\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batc

KeyboardInterrupt: 

In [7]:
# Fine-tune Bio_ClinicalBERT for extractive QA on the train/test datasets
# currently in scope. (The variable name's spelling is kept as-is because
# other cells may refer to it.)
qa_cinicalBert = AutoModelForQuestionAnswering.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT"
)

# Single cadence (in optimizer steps) for logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaabmr_fine_clinicalbert_pos',
    seed=0,
    # optimisation
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=1e-2,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / evaluation / checkpointing
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=qa_cinicalBert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at emily

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,1.5995,0.067645,0.967883,0.0,,
200,0.0612,0.026709,0.967883,0.0,0.06,0.05
300,0.04,0.029099,0.966423,0.0,0.19577,0.234015
400,0.0173,0.019804,0.966423,0.0,0.36342,0.460758
500,0.0099,0.018984,0.966423,0.0,0.36342,0.460758
600,0.005,0.020145,0.966423,0.0,0.36342,0.460758
700,0.0124,0.016772,0.966423,0.0,0.36342,0.460758
800,0.003,0.018907,0.966423,0.0,0.36342,0.460758
900,0.0093,0.012994,0.967883,0.0,0.36342,0.460758
1000,0.0051,0.015775,0.967883,0.0,0.36342,0.460758


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qaabmr_fine_clinicalbert_pos\checkpoint-100
Configuration saved in ./qaabmr_fine_clinicalbert_pos\checkpoint-100\config.json
Model weights saved in ./qaabmr_fine_clinicalbert_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_clinicalbert_pos\checkpoint-200
Configuration saved in ./qaabmr_fine_clinicalbert_pos\checkpoint-200\config.json
Model weights saved in ./qaabmr_fine_clinicalbert_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_clinicalbert_pos\checkpoint-300
Configuration saved in ./qaabmr_fine_clinicalbert_pos\checkpoint-300\config.json
Model weights saved in ./qaabmr_fine_clinicalbert_pos\checkpoint-3

KeyboardInterrupt: 

In [7]:
# Fine-tune the locally pre-trained MLM checkpoint for extractive QA on the
# train/test datasets currently in scope.
qa_kidneyBert = AutoModelForQuestionAnswering.from_pretrained(
    "./mlm_results_largeData/checkpoint-1100"
)

# Single cadence (in optimizer steps) for logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaabmr_fine_mlm_largeData_pos',
    seed=0,
    # optimisation
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=1e-2,
    warmup_steps=50,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging / evaluation / checkpointing
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=qa_kidneyBert,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of the model checkpoint at ./mlm_results_largeData/checkpoint-1100 were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./mlm_results_largeData/c

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,1.7115,0.072496,0.967883,0.0,,
200,0.0744,0.037608,0.966423,0.0,0.112437,0.100682
300,0.0317,0.022305,0.967883,0.0,0.14177,0.140682
400,0.0197,0.026483,0.964964,0.0,0.336554,0.435758
500,0.0137,0.021258,0.967883,0.0,0.242586,0.294091
600,0.0053,0.015662,0.967883,0.0,0.36342,0.460758
700,0.0106,0.011931,0.967883,0.0,0.336554,0.435758
800,0.0034,0.010857,0.967883,0.0,0.349987,0.448258
900,0.0104,0.014431,0.967883,0.0,0.36342,0.460758
1000,0.0004,0.014759,0.967883,0.0,0.36342,0.460758


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qaabmr_fine_mlm_largeData_pos\checkpoint-100
Configuration saved in ./qaabmr_fine_mlm_largeData_pos\checkpoint-100\config.json
Model weights saved in ./qaabmr_fine_mlm_largeData_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_mlm_largeData_pos\checkpoint-200
Configuration saved in ./qaabmr_fine_mlm_largeData_pos\checkpoint-200\config.json
Model weights saved in ./qaabmr_fine_mlm_largeData_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaabmr_fine_mlm_largeData_pos\checkpoint-300
Configuration saved in ./qaabmr_fine_mlm_largeData_pos\checkpoint-300\config.json
Model weights saved in ./qaabmr_fine_mlm_largeData_pos\che

KeyboardInterrupt: 

## Load TCMR data

In [5]:
batch_size = 12


# Reports and their TCMR answer spans / class labels.
data = pd.read_csv("data.csv")
inputs1 = data["train_report_qa"].tolist()
label1 = data["tcmr_pos_qa"].tolist()
label_class_help = data["tcmr_class"].tolist()

# Rows without a report stringify to "nan"; drop them from every parallel list.
has_report = [str(report) != "nan" for report in inputs1]
# NOTE(security): eval() executes whatever expression the CSV cell contains.
# Only run this on a trusted data.csv; if the column holds plain literals,
# ast.literal_eval would be the safe replacement -- confirm before switching.
label = [eval(span) for span, keep in zip(label1, has_report) if keep]
inputs = [report for report, keep in zip(inputs1, has_report) if keep]
label_class = [klass for klass, keep in zip(label_class_help, has_report) if keep]


# Split stratified on the class label; the span labels travel with the texts.
train_text, test_text, train_labels, test_labels = train_test_split(
    inputs, label,random_state = 1,stratify=label_class,test_size=0.2)


q_tcmr = "How is the t-cell-mediated rejection?"
# gen_datasets reads the global train_labels / test_labels assigned just above.
train_dataset,test_dataset = gen_datasets(q_tcmr,train_text,test_text)
tcmr_train_loader = torch.utils.data.DataLoader(train_dataset,batch_size = batch_size,shuffle=True)
tcmr_test_loader = torch.utils.data.DataLoader(test_dataset,batch_size = batch_size)

# Reference answer strings, sliced out of each test report by character span.
test_ans = [report[span[0]:span[1]] for report, span in zip(test_text, test_labels)]

# Token ids of the whole test set in loader order. Concatenating once keeps
# the integer dtype (seeding with an empty float tensor promoted the ids to
# float) and avoids re-copying the growing tensor for every batch.
test_ids = torch.cat([batch["input_ids"] for batch in tcmr_test_loader], 0)
    

In [6]:
import difflib

def get_overlap_ratio(s1, s2):
    """Coverage of reference ``s2`` by its longest contiguous match with ``s1``.

    Parameters
    ----------
    s1 : str
        Candidate (predicted) string.
    s2 : str
        Reference string; both ratios are normalised by it.

    Returns
    -------
    (float, float)
        ``(char_ratio, word_ratio)``: length of the longest common substring
        divided by ``len(s2)``, and the number of whitespace-separated words
        in that substring divided by the number of words in ``s2``. A ratio is
        ``0.0`` when its denominator would be zero (empty or whitespace-only
        reference) instead of raising ``ZeroDivisionError``.
    """
    # autojunk=False: for sequences of 200+ items the default heuristic treats
    # frequent characters as junk and ignores them, which silently shrinks the
    # longest match for long reference answers.
    matcher = difflib.SequenceMatcher(None, s1, s2, autojunk=False)
    start_a, _, size = matcher.find_longest_match(0, len(s1), 0, len(s2))
    shared = s1[start_a:start_a + size]

    reference_words = s2.split()
    char_ratio = size / len(s2) if s2 else 0.0
    word_ratio = len(shared.split()) / len(reference_words) if reference_words else 0.0
    return char_ratio, word_ratio

def compute_metrics(p):
    """Score extractive-QA span predictions; passed to ``Trainer`` as ``compute_metrics``.

    Relies on notebook globals: ``tokenizer``, ``get_overlap_ratio``,
    ``test_ans`` (gold answer strings) and ``test_ids`` (input ids of the test
    set, in the same order as the predictions). ``test_ans`` and ``test_ids``
    are rebuilt by every "Load ... data" cell, so they must belong to the
    dataset that is being evaluated.

    Args:
        p: ``(predictions, labels)`` pair, where ``predictions`` is itself a
            ``(start_scores, end_scores)`` pair of per-token scores. The
            labels are not used.

    Returns:
        dict with
        ``accuracy``: share of examples that are either correctly predicted
        as "no answer" or match the gold answer text exactly;
        ``accuracy_info``: share of examples whose non-empty gold answer is
        matched exactly;
        ``overlap_ratio_char`` / ``overlap_ratio_word``: mean overlap ratios
        over the examples where both gold and predicted answers are non-empty.
        A metric with nothing to average over is reported as NaN.
    """
    pred, _ = p
    start_scores, end_scores = pred
    answer_start = np.argmax(start_scores, axis=1)  # most likely first token
    answer_end = np.argmax(end_scores, axis=1) + 1  # exclusive end index

    def _normalize(text):
        # Lower-case and collapse every whitespace run (including newlines)
        # so that gold text and prediction are compared in the same form.
        return " ".join(text.lower().split())

    def _mean_or_nan(values):
        # np.mean([]) emits a RuntimeWarning on every evaluation and yields
        # NaN; return NaN directly when there is nothing to average.
        return float(np.mean(values)) if values else float("nan")

    total = 0
    correct = 0
    correct_with_info = 0
    overlap_ratio_char = []
    overlap_ratio_word = []
    for start, end, gold, token_ids in zip(answer_start, answer_end, test_ans, test_ids):
        total += 1
        # A span covering only the first token is treated as "no answer".
        predicted_empty = start == 0 and end == 1
        gold_ans = _normalize(gold)

        if not gold_ans:
            if predicted_empty:
                correct += 1
            continue
        if predicted_empty:
            continue

        # NOTE(review): the prediction is rebuilt from word pieces, so spacing
        # around punctuation may differ from the raw gold text and exact
        # matches may be under-counted - confirm against real predictions.
        pred_ans = _normalize(
            tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(token_ids[start:end]))
        )
        if pred_ans == gold_ans:
            correct_with_info += 1
        char_ratio, word_ratio = get_overlap_ratio(pred_ans, gold_ans)
        overlap_ratio_char.append(char_ratio)
        overlap_ratio_word.append(word_ratio)

    return {
        "accuracy": (correct + correct_with_info) / total if total else float("nan"),
        "accuracy_info": correct_with_info / total if total else float("nan"),
        "overlap_ratio_char": _mean_or_nan(overlap_ratio_char),
        "overlap_ratio_word": _mean_or_nan(overlap_ratio_word),
    }

In [7]:
# BioBERT v1.1 with a question-answering head, fine-tuned on the train_dataset
# that is currently in the notebook namespace.
qa_bioBert = AutoModelForQuestionAnswering.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# One interval (in optimisation steps) drives logging, evaluation and saving.
steps = 50

training_args = TrainingArguments(
    output_dir='./qatcmr_fine_biobert_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=1,
    load_best_model_at_end=True,
)

# NOTE(review): the test split is used as eval_dataset, so the checkpoint
# restored by load_best_model_at_end is selected on test data.
trainer = Trainer(
    model=qa_bioBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized f

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
50,3.588,0.079879,0.978102,0.0,,
100,0.0093,0.05972,0.978102,0.0,,
150,0.0437,0.043415,0.978102,0.0,,
200,0.0507,0.025051,0.978102,0.0,0.043478,0.166667
250,0.0213,0.024153,0.978102,0.0,,
300,0.0123,0.022493,0.978102,0.0,0.400175,0.430556
350,0.0142,0.016949,0.978102,0.0,0.176773,0.145833
400,0.0111,0.015051,0.978102,0.0,0.259824,0.222222
450,0.0183,0.010943,0.978102,0.0,0.354027,0.444444
500,0.0158,0.009115,0.978102,0.0,0.267607,0.361111


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_biobert_pos\checkpoint-50
Configuration saved in ./qatcmr_fine_biobert_pos\checkpoint-50\config.json
Model weights saved in ./qatcmr_fine_biobert_pos\checkpoint-50\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_biobert_pos\checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_biobert_pos\checkpoint-100
Configuration saved in ./qatcmr_fine_biobert_pos\checkpoint-100\config.json
Model weights saved in ./qatcmr_fine_biobert_pos\checkpoint-100\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_biobert_pos\checkpoint-1700] due to args.save_total_limit
***** Running Evaluation *****
  Num exam

Model weights saved in ./qatcmr_fine_biobert_pos\checkpoint-900\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_biobert_pos\checkpoint-800] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_biobert_pos\checkpoint-950
Configuration saved in ./qatcmr_fine_biobert_pos\checkpoint-950\config.json
Model weights saved in ./qatcmr_fine_biobert_pos\checkpoint-950\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_biobert_pos\checkpoint-850] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_biobert_pos\checkpoint-1000
Configuration saved in ./qatcmr_fine_biobert_pos\checkpoint-1000\config.json
Model weights saved in ./qatcmr_fine_biobert_pos\checkpoint-1000\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_biobert_pos\checkpoint-950] due to args.save_total_limit
***** Running Evaluation *****
  Nu

KeyboardInterrupt: 

In [7]:
# QA model initialised from the local MLM checkpoint trained with the extended
# tokenizer (path below), fine-tuned on the current train_dataset.
qa_exkidBert = AutoModelForQuestionAnswering.from_pretrained(
    "./mlm_results_largeData_extended_tokenizer/checkpoint-1100"
)

# Shared step interval for logging, evaluation and checkpointing.
steps = 50

training_args = TrainingArguments(
    output_dir='./qatcmr_fine_exkidbert_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=3,
    load_best_model_at_end=True,
)

# The test split serves as the evaluation set during training.
trainer = Trainer(
    model=qa_exkidBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at ./mlm_results_largeData_extended_tokenizer/checkpoint-1100 were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./mlm_

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
50,3.6261,0.080515,0.978102,0.0,,
100,0.0088,0.061449,0.978102,0.0,,
150,0.0442,0.058689,0.978102,0.0,,
200,0.0807,0.045252,0.978102,0.0,,
250,0.0211,0.045229,0.978102,0.0,,
300,0.0144,0.043424,0.978102,0.0,,
350,0.0405,0.040156,0.978102,0.0,,
400,0.0275,0.044134,0.978102,0.0,,
450,0.0407,0.02284,0.978102,0.0,0.586957,0.833333
500,0.012,0.047014,0.978102,0.0,,


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_exkidbert_pos\checkpoint-50
Configuration saved in ./qatcmr_fine_exkidbert_pos\checkpoint-50\config.json
Model weights saved in ./qatcmr_fine_exkidbert_pos\checkpoint-50\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_exkidbert_pos\checkpoint-100
Configuration saved in ./qatcmr_fine_exkidbert_pos\checkpoint-100\config.json
Model weights saved in ./qatcmr_fine_exkidbert_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_exkidbert_pos\checkpoint

  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_exkidbert_pos\checkpoint-650
Configuration saved in ./qatcmr_fine_exkidbert_pos\checkpoint-650\config.json
Model weights saved in ./qatcmr_fine_exkidbert_pos\checkpoint-650\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_exkidbert_pos\checkpoint-550] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_exkidbert_pos\checkpoint-700
Configuration saved in ./qatcmr_fine_exkidbert_pos\checkpoint-700\config.json
Model weights saved in ./qatcmr_fine_exkidbert_pos\checkpoint-700\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_exkidbert_pos\checkpoint-600] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving

Deleting older checkpoint [qatcmr_fine_exkidbert_pos\checkpoint-1250] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_exkidbert_pos\checkpoint-1400
Configuration saved in ./qatcmr_fine_exkidbert_pos\checkpoint-1400\config.json
Model weights saved in ./qatcmr_fine_exkidbert_pos\checkpoint-1400\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_exkidbert_pos\checkpoint-1300] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_exkidbert_pos\checkpoint-1450
Configuration saved in ./qatcmr_fine_exkidbert_pos\checkpoint-1450\config.json
Model weights saved in ./qatcmr_fine_exkidbert_pos\checkpoint-1450\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_exkidbert_pos\checkpoint-1350] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint

KeyboardInterrupt: 

In [7]:
# Plain bert-base-cased baseline with a question-answering head, fine-tuned on
# the current train_dataset.
qa_vanBert = AutoModelForQuestionAnswering.from_pretrained("bert-base-cased")

# Log, evaluate and checkpoint every `steps` optimisation steps.
steps = 100

training_args = TrainingArguments(
    output_dir='./qatcmr_fine_vanlbert_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=3,
    load_best_model_at_end=True,
)

# The test split serves as the evaluation set during training.
trainer = Trainer(
    model=qa_vanBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,1.5227,0.065988,0.978102,0.0,,
200,0.0558,0.058312,0.978102,0.0,,
300,0.0226,0.044044,0.978102,0.0,,
400,0.0339,0.056352,0.978102,0.0,,
500,0.036,0.034096,0.978102,0.0,,
600,0.0153,0.020429,0.978102,0.0,,
700,0.0225,0.016256,0.978102,0.0,0.494378,0.652778
800,0.0167,0.030646,0.978102,0.0,,
900,0.0046,0.025208,0.978102,0.0,,
1000,0.0018,0.011837,0.978102,0.0,0.354027,0.444444


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_vanlbert_pos\checkpoint-100
Configuration saved in ./qatcmr_fine_vanlbert_pos\checkpoint-100\config.json
Model weights saved in ./qatcmr_fine_vanlbert_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_vanlbert_pos\checkpoint-200
Configuration saved in ./qatcmr_fine_vanlbert_pos\checkpoint-200\config.json
Model weights saved in ./qatcmr_fine_vanlbert_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_vanlbert_pos\checkpoint-300

Deleting older checkpoint [qatcmr_fine_vanlbert_pos\checkpoint-1300] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_vanlbert_pos\checkpoint-1600
Configuration saved in ./qatcmr_fine_vanlbert_pos\checkpoint-1600\config.json
Model weights saved in ./qatcmr_fine_vanlbert_pos\checkpoint-1600\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_vanlbert_pos\checkpoint-1400] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_vanlbert_pos\checkpoint-1700
Configuration saved in ./qatcmr_fine_vanlbert_pos\checkpoint-1700\config.json
Model weights saved in ./qatcmr_fine_vanlbert_pos\checkpoint-1700\pytorch_model.bin
Deleting older checkpoint [qatcmr_fine_vanlbert_pos\checkpoint-1500] due to args.save_total_limit


KeyboardInterrupt: 

In [7]:
# QA model restored from the local "qa_bioasq_fine_kidneybert" checkpoint and
# fine-tuned further on the current train_dataset.
qa_bioasq_kidneyBert = AutoModelForQuestionAnswering.from_pretrained(
    "qa_bioasq_fine_kidneybert/checkpoint-150"
)

# Log, evaluate and checkpoint every `steps` optimisation steps.
steps = 100

training_args = TrainingArguments(
    output_dir='./qatcmr_fine_bioasq_kidneyBert',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split serves as the evaluation set during training.
trainer = Trainer(
    model=qa_bioasq_kidneyBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

***** Running training *****
  Num examples = 2738
  Num Epochs = 15
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 3435


Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,0.0863,0.03649,0.978102,0.0,,
200,0.0226,0.008223,0.978102,0.0,0.430051,0.569444
300,0.0046,0.017078,0.978102,0.0,0.432113,0.666667
400,0.0019,0.017963,0.978102,0.0,0.528604,0.791667
500,0.0141,0.012092,0.978102,0.0,0.494378,0.652778
600,0.0118,0.008023,0.978102,0.0,0.494378,0.652778
700,0.0051,0.008464,0.978102,0.0,0.494378,0.652778
800,0.0001,0.015104,0.978102,0.0,0.494378,0.652778
900,0.0,0.01506,0.978102,0.0,0.494378,0.652778
1000,0.0027,0.007724,0.978102,0.0,0.494378,0.652778


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_bioasq_kidneyBert\checkpoint-100
Configuration saved in ./qatcmr_fine_bioasq_kidneyBert\checkpoint-100\config.json
Model weights saved in ./qatcmr_fine_bioasq_kidneyBert\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_bioasq_kidneyBert\checkpoint-200
Configuration saved in ./qatcmr_fine_bioasq_kidneyBert\checkpoint-200\config.json
Model weights saved in ./qatcmr_fine_bioasq_kidneyBert\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_bioasq_kidneyBert\checkpoint-300
Configuration saved in ./qatcmr_fine_bioasq_kidneyBert\checkpoint-300\config.json
Model weights saved in ./qatcmr_fine_bioasq_kidneyBert\che

KeyboardInterrupt: 

In [7]:
# Bio_ClinicalBERT with a question-answering head, fine-tuned on the current
# train_dataset; checkpoints go to the TCMR output directory below.
qa_cinicalBert = AutoModelForQuestionAnswering.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Log, evaluate and checkpoint every `steps` optimisation steps.
steps = 100

training_args = TrainingArguments(
    output_dir='./qatcmr_fine_clinicalbert_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split serves as the evaluation set during training.
trainer = Trainer(
    model=qa_cinicalBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at emily

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,1.6381,0.064063,0.978102,0.0,,
200,0.0557,0.056944,0.978102,0.0,,
300,0.0214,0.039585,0.978102,0.0,,
400,0.0328,0.042593,0.978102,0.0,,
500,0.03,0.028145,0.978102,0.0,,
600,0.0109,0.016437,0.978102,0.0,0.494378,0.652778
700,0.017,0.011492,0.978102,0.0,0.354027,0.444444
800,0.0069,0.011739,0.978102,0.0,0.334958,0.375
900,0.0009,0.014642,0.978102,0.0,0.194607,0.166667
1000,0.0033,0.015392,0.978102,0.0,0.141975,0.125


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_clinicalbert_pos\checkpoint-100
Configuration saved in ./qatcmr_fine_clinicalbert_pos\checkpoint-100\config.json
Model weights saved in ./qatcmr_fine_clinicalbert_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_clinicalbert_pos\checkpoint-200
Configuration saved in ./qatcmr_fine_clinicalbert_pos\checkpoint-200\config.json
Model weights saved in ./qatcmr_fine_clinicalbert_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_cli

KeyboardInterrupt: 

In [7]:
# QA model initialised from the local MLM checkpoint (path below), fine-tuned
# on the current train_dataset; checkpoints go to the TCMR output directory.
qa_kidneyBert = AutoModelForQuestionAnswering.from_pretrained(
    "./mlm_results_largeData/checkpoint-1100"
)

# Log, evaluate and checkpoint every `steps` optimisation steps.
steps = 100

training_args = TrainingArguments(
    output_dir='./qatcmr_fine_mlm_largeData_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split serves as the evaluation set during training.
trainer = Trainer(
    model=qa_kidneyBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at ./mlm_results_largeData/checkpoint-1100 were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./mlm_results_largeData/c

Step,Training Loss,Validation Loss,Accuracy,Accuracy Info,Overlap Ratio Char,Overlap Ratio Word
100,1.7919,0.061571,0.978102,0.0,,
200,0.0515,0.041403,0.978102,0.0,,
300,0.0162,0.039312,0.978102,0.0,0.456522,0.666667
400,0.0172,0.018546,0.978102,0.0,0.494378,0.652778
500,0.021,0.014726,0.978102,0.0,0.400175,0.430556
600,0.0052,0.012252,0.978102,0.0,0.108187,0.083333
700,0.0065,0.013455,0.978102,0.0,0.494378,0.652778
800,0.0016,0.01149,0.978102,0.0,0.334958,0.375
900,0.0004,0.01325,0.978102,0.0,0.334958,0.375
1000,0.0018,0.013116,0.978102,0.0,0.354027,0.444444


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_mlm_largeData_pos\checkpoint-100
Configuration saved in ./qatcmr_fine_mlm_largeData_pos\checkpoint-100\config.json
Model weights saved in ./qatcmr_fine_mlm_largeData_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./qatcmr_fine_mlm_largeData_pos\checkpoint-200
Configuration saved in ./qatcmr_fine_mlm_largeData_pos\checkpoint-200\config.json
Model weights saved in ./qatcmr_fine_mlm_largeData_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qatcmr_fine_mlm_largeData_pos\checkpoint-300
Configuration saved in ./qatcmr_fine_mlm_largeData_p

KeyboardInterrupt: 

## Load Polyomavirus data

In [5]:
batch_size = 12

# Load the reports together with the answer spans and class labels for
# polyomavirus infection.
data = pd.read_csv("data.csv")
inputs1 = data["train_report_qa"].tolist()
label1 = data["ispoly_pos_qa"].tolist()
label_class_help = data["poly_infection"].tolist()

# Rows whose report stringifies to "nan" (missing text) are dropped, and the
# same rows are dropped from the answer spans and the stratification labels.
has_report = [str(report) != "nan" for report in inputs1]
inputs = [report for report, keep in zip(inputs1, has_report) if keep]
# NOTE(review): eval() executes whatever the CSV cell contains. If data.csv is
# not fully trusted, switch to ast.literal_eval after confirming every cell is
# a plain literal (e.g. "[start, end]").
label = [eval(span) for span, keep in zip(label1, has_report) if keep]
label_class = [cls for cls, keep in zip(label_class_help, has_report) if keep]

# 80/20 split, stratified on the polyomavirus infection label.
train_text, test_text, train_labels, test_labels = train_test_split(
    inputs, label, random_state=1, stratify=label_class, test_size=0.2)

# gen_datasets reads train_labels/test_labels from the notebook namespace, so
# it has to run after the split above.
# NOTE(review): it also tokenizes with its default `tokenizer` argument; make
# sure that tokenizer matches the model fine-tuned on these datasets.
q_poly = "Is there any polyomavirus infection?"
train_dataset, test_dataset = gen_datasets(q_poly, train_text, test_text)
poly_train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
poly_test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

# Gold answer text of every test report: each label is a (start, end) pair of
# character offsets into the report. compute_metrics reads test_ans and
# test_ids as globals.
test_ans = [report[span[0]:span[1]] for report, span in zip(test_text, test_labels)]

# Input ids of the whole test set, in loader (unshuffled) order. Concatenating
# the batches once keeps their integer dtype; seeding the accumulation with
# torch.tensor([]) (float32) turned the ids into floats and re-copied the
# growing tensor for every batch.
test_ids = torch.cat([batch["input_ids"] for batch in poly_test_loader], 0)
    

In [6]:
# Fresh Bio_ClinicalBERT QA model, fine-tuned on the current train_dataset;
# checkpoints go to the polyomavirus output directory below.
qa_cinicalBert = AutoModelForQuestionAnswering.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Log, evaluate and checkpoint every `steps` optimisation steps.
steps = 100

training_args = TrainingArguments(
    output_dir='./qapoly_fine_clinicalbert_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split serves as the evaluation set during training.
trainer = Trainer(
    model=qa_cinicalBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at emily

Step,Training Loss,Validation Loss,Accuracy
100,2.0534,0.030049,0.960584
200,0.0446,0.009902,0.960584
300,0.0491,0.006568,0.960584
400,0.0507,0.007824,0.960584
500,0.0258,0.007845,0.960584
600,0.0285,0.007899,0.959124
700,0.0207,0.014173,0.959124
800,0.0133,0.005285,0.960584
900,0.0094,0.003263,0.960584


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qapoly_fine_clinicalbert_pos\checkpoint-100
Configuration saved in ./qapoly_fine_clinicalbert_pos\checkpoint-100\config.json
Model weights saved in ./qapoly_fine_clinicalbert_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qapoly_fine_clinicalbert_pos\checkpoint-200
Configuration saved in ./qapoly_fine_clinicalbert_pos\checkpoint-200\config.json
Model weights saved in ./qapoly_fine_clinicalbert_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qapoly_fine_clinicalbert_pos\checkpoint-300
Configuration saved in ./qapoly_fine_clinicalbert_pos\checkpoint-300\config.json
Model weights saved in ./qapoly_fine_clinicalbert_pos\checkpoint-300\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 

KeyboardInterrupt: 

In [8]:
# QA model initialised from the local MLM checkpoint (path below), fine-tuned
# on the current train_dataset; checkpoints go to the polyomavirus output
# directory.
qa_kidneyBert = AutoModelForQuestionAnswering.from_pretrained(
    "./mlm_results_largeData/checkpoint-1100"
)

# Log, evaluate and checkpoint every `steps` optimisation steps.
steps = 100

training_args = TrainingArguments(
    output_dir='./qapoly_fine_mlm_largeData_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpoints ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split serves as the evaluation set during training.
trainer = Trainer(
    model=qa_kidneyBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at ./mlm_results_largeData/checkpoint-1100 were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./mlm_results_largeData/c

Step,Training Loss,Validation Loss,Accuracy
100,2.0219,0.018061,0.960584
200,0.0417,0.009134,0.960584
300,0.0422,0.004702,0.960584
400,0.0355,0.006736,0.960584
500,0.0209,0.008893,0.960584
600,0.0187,0.007032,0.960584
700,0.0315,0.005353,0.960584
800,0.0162,0.002379,0.960584
900,0.008,0.002174,0.960584
1000,0.0042,0.002508,0.960584


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qapoly_fine_mlm_largeData_pos\checkpoint-100
Configuration saved in ./qapoly_fine_mlm_largeData_pos\checkpoint-100\config.json
Model weights saved in ./qapoly_fine_mlm_largeData_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qapoly_fine_mlm_largeData_pos\checkpoint-200
Configuration saved in ./qapoly_fine_mlm_largeData_pos\checkpoint-200\config.json
Model weights saved in ./qapoly_fine_mlm_largeData_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qapoly_fine_mlm_largeData_pos\checkpoint-300
Configuration saved in ./qapoly_fine_mlm_largeData_pos\checkpoint-300\config.json
Model weights saved in ./qapoly_fine_mlm_largeData_pos\checkpoint-300\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batc

KeyboardInterrupt: 

## Load IFTA data

In [5]:
import ast  # cell-local: only needed to parse the stored answer spans safely

batch_size = 12


# Load the report table. "train_report_qa" holds the report text and "ifta_pos_qa"
# the stringified answer span that is later used to slice the report text.
data = pd.read_csv("data.csv")
inputs1 = data["train_report_qa"].tolist()
label1 = data["ifta_pos_qa"].tolist()

# Rows without report text come back as NaN, which stringifies to "nan"; drop them
# and keep the labels aligned with the surviving reports.
# ast.literal_eval parses the span literal without executing arbitrary code that
# might be present in the CSV (the original used eval()).
label = [ast.literal_eval(l) for i,l in zip(inputs1,label1) if str(i)!="nan"]
inputs = [i for i in inputs1 if str(i)!="nan"]

# Coarse IFTA grade classes, used only to stratify the train/test split.
# Any grade string not listed here falls into class 3.
IFTA_GRADE_TO_CLASS = {"nosig": 0, "minimal": 0, "noinfo": 0, "mild": 1, "moderate": 2}
label_class_help1 = data["IFTA"].tolist()
label_class_help2 = [l for i,l in zip(inputs1,label_class_help1) if str(i)!="nan"]
label_class = [IFTA_GRADE_TO_CLASS.get(l, 3) for l in label_class_help2]

# NOTE: gen_datasets() reads the globals `train_labels` / `test_labels`, so these
# exact names must be bound before it is called.
train_text, test_text, train_labels, test_labels = train_test_split(
    inputs, label,random_state = 1,stratify=label_class,test_size=0.2)


q_ifta = "What is the grade of interstitial fibrosis and tubular atrophy?"
train_dataset,test_dataset = gen_datasets(q_ifta,train_text,test_text)
ifta_train_loader = torch.utils.data.DataLoader(train_dataset,batch_size = batch_size,shuffle=True)
ifta_test_loader = torch.utils.data.DataLoader(test_dataset,batch_size = batch_size)


# Gold answer strings: slice each test report with its (start, end) span.
test_ans = [text[span[0]:span[1]] for text, span in zip(test_text, test_labels)]

# Token ids of the whole test set, in loader order (the test loader is not shuffled).
# Concatenating the batches once keeps the integer dtype of the ids; seeding the
# concatenation with a float `torch.tensor([])` would promote them to float32.
test_ids = torch.cat([batch["input_ids"] for batch in ifta_test_loader], 0)


In [7]:
# Fine-tune BioBERT v1.1 on the IFTA extractive-QA task.
qa_bioBert = AutoModelForQuestionAnswering.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# The tokenizer cell at the top of the notebook extends the vocabulary with
# tokenizer.add_tokens(...), while this checkpoint ships the original embedding
# matrix. Ids of added tokens would then index past the end of the embeddings, so
# grow the matrix to the tokenizer size when needed. This is a no-op when the
# tokenizer has not been extended.
if len(tokenizer) > qa_bioBert.config.vocab_size:
    qa_bioBert.resize_token_embeddings(len(tokenizer))

# Logging, evaluation and checkpointing all happen every `steps` optimizer steps.
steps = 50

training_args = TrainingArguments(
    output_dir='./qaifta_fine_bioBert_extoken',
    num_train_epochs=15,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=50,
    weight_decay=1e-2,
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    load_best_model_at_end=True,
    save_steps = steps,
    save_total_limit = 1,
    seed = 0
)

# NOTE(review): eval_dataset is the held-out test split, so the checkpoint restored
# by load_best_model_at_end is selected on test data; a separate validation split
# would give an unbiased test score.
trainer = Trainer(
    model=qa_bioBert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


trainer.train()

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized f

Step,Training Loss,Validation Loss,Accuracy
50,4.4456,1.250526,0.832117
100,0.6014,0.21502,0.918248
150,0.2428,0.20486,0.937226
200,0.194,0.199551,0.912409
250,0.1988,0.276754,0.941606
300,0.174,0.180952,0.948905
350,0.1074,0.24021,0.915328
400,0.1786,0.179767,0.948905
450,0.1415,0.205316,0.945985
500,0.1055,0.193516,0.951825


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioBert_extoken\checkpoint-50
Configuration saved in ./qaifta_fine_bioBert_extoken\checkpoint-50\config.json
Model weights saved in ./qaifta_fine_bioBert_extoken\checkpoint-50\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioBert_extoken\checkpoint-100
Configuration saved in ./qaifta_fine_bioBert_extoken\checkpoint-100\config.json
Model weights saved in ./qaifta_fine_bioBert_extoken\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioBert_extoken\checkpoint-150
Configuration saved in ./qaifta_fine_bioBert_extoken\checkpoint-150\config.json
Model weights saved in ./qaifta_fine_bioBert_extoken\checkpoint-150\pytorch_model.bin
Deleting older checkpoint [qaifta_fine_bioBert_extoken\checkpoint-50] due to ar

Deleting older checkpoint [qaifta_fine_bioBert_extoken\checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioBert_extoken\checkpoint-1100
Configuration saved in ./qaifta_fine_bioBert_extoken\checkpoint-1100\config.json
Model weights saved in ./qaifta_fine_bioBert_extoken\checkpoint-1100\pytorch_model.bin
Deleting older checkpoint [qaifta_fine_bioBert_extoken\checkpoint-1050] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioBert_extoken\checkpoint-1150
Configuration saved in ./qaifta_fine_bioBert_extoken\checkpoint-1150\config.json
Model weights saved in ./qaifta_fine_bioBert_extoken\checkpoint-1150\pytorch_model.bin
Deleting older checkpoint [qaifta_fine_bioBert_extoken\checkpoint-750] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving

KeyboardInterrupt: 

In [6]:
# Fine-tune the checkpoint stored under './mlm_results_largeData_extended_tokenizer'
# on the IFTA extractive-QA task.
qa_kidBert_extoken = AutoModelForQuestionAnswering.from_pretrained(
    "./mlm_results_largeData_extended_tokenizer/checkpoint-1100"
)

# One cadence (in optimizer steps) shared by logging, evaluation and checkpointing.
steps = 50

training_args = TrainingArguments(
    output_dir='./qaifta_fine_kidBert_extoken',
    seed=0,
    # --- optimisation ---
    num_train_epochs=8,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpointing ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=1,
    load_best_model_at_end=True,
)

# The test split doubles as the evaluation set during training.
trainer = Trainer(model=qa_kidBert_extoken, args=training_args,
                  train_dataset=train_dataset, eval_dataset=test_dataset,
                  compute_metrics=compute_metrics)

trainer.train()

Some weights of the model checkpoint at ./mlm_results_largeData_extended_tokenizer/checkpoint-1100 were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./mlm_

Step,Training Loss,Validation Loss,Accuracy
50,4.4428,0.775037,0.870073
100,0.483,0.234367,0.927007
150,0.2459,0.266203,0.937226
200,0.2255,0.200836,0.90219
250,0.1577,0.393619,0.935766
300,0.1827,0.203311,0.948905
350,0.1342,0.187456,0.891971
400,0.1941,0.235762,0.929927
450,0.1544,0.179687,0.950365
500,0.1154,0.261416,0.941606


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_kidBert_extoken\checkpoint-50
Configuration saved in ./qaifta_fine_kidBert_extoken\checkpoint-50\config.json
Model weights saved in ./qaifta_fine_kidBert_extoken\checkpoint-50\pytorch_model.bin
Deleting older checkpoint [qaifta_fine_kidBert_extoken\checkpoint-700] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_kidBert_extoken\checkpoint-100
Configuration saved in ./qaifta_fine_kidBert_extoken\checkpoint-100\config.json
Model weights saved in ./qaifta_fine_kidBert_extoken\checkpoint-100\pytorch_model.bin
Deleting older checkpoint [qaifta_fine_kidBert_extoken\checkpoint-3000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_kidBert_extoken\checkpoint-150
Configuration saved in ./qaifta_fine_kidBert

  Batch size = 12
Saving model checkpoint to ./qaifta_fine_kidBert_extoken\checkpoint-1050
Configuration saved in ./qaifta_fine_kidBert_extoken\checkpoint-1050\config.json
Model weights saved in ./qaifta_fine_kidBert_extoken\checkpoint-1050\pytorch_model.bin
Deleting older checkpoint [qaifta_fine_kidBert_extoken\checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_kidBert_extoken\checkpoint-1100
Configuration saved in ./qaifta_fine_kidBert_extoken\checkpoint-1100\config.json
Model weights saved in ./qaifta_fine_kidBert_extoken\checkpoint-1100\pytorch_model.bin
Deleting older checkpoint [qaifta_fine_kidBert_extoken\checkpoint-1050] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_kidBert_extoken\checkpoint-1150
Configuration saved in ./qaifta_fine_kidBert_extoken\checkpoint-1150\config.json
Mode

TrainOutput(global_step=1832, training_loss=0.23592356810403184, metrics={'train_runtime': 1217.5518, 'train_samples_per_second': 17.99, 'train_steps_per_second': 1.505, 'total_flos': 5723444159545344.0, 'train_loss': 0.23592356810403184, 'epoch': 8.0})

In [6]:
# Fine-tune vanilla bert-base-cased on the IFTA extractive-QA task.
qa_vanBert = AutoModelForQuestionAnswering.from_pretrained("bert-base-cased")

# One cadence (in optimizer steps) shared by logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaifta_fine_vanbert_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpointing ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split doubles as the evaluation set during training.
trainer = Trainer(model=qa_vanBert, args=training_args,
                  train_dataset=train_dataset, eval_dataset=test_dataset,
                  compute_metrics=compute_metrics)

trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

Step,Training Loss,Validation Loss,Accuracy
100,2.3668,0.36081,0.908029
200,0.264,0.249229,0.891971
300,0.2393,0.294497,0.921168
400,0.2065,0.412637,0.916788
500,0.2091,0.236165,0.921168
600,0.1835,0.235109,0.925547
700,0.1728,0.203006,0.918248
800,0.1755,0.245231,0.935766
900,0.1394,0.288348,0.929927
1000,0.1284,0.228515,0.929927


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_vanbert_pos\checkpoint-100
Configuration saved in ./qaifta_fine_vanbert_pos\checkpoint-100\config.json
Model weights saved in ./qaifta_fine_vanbert_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_vanbert_pos\checkpoint-200
Configuration saved in ./qaifta_fine_vanbert_pos\checkpoint-200\config.json
Model weights saved in ./qaifta_fine_vanbert_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_vanbert_pos\checkpoint-300
Configuration saved in ./qaifta_fine_vanbert_pos\checkpoint-300\config.json
Model weights saved in ./qaifta_fine_vanbert_pos\checkpoint-300\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_v

Model weights saved in ./qaifta_fine_vanbert_pos\checkpoint-2800\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_vanbert_pos\checkpoint-2900
Configuration saved in ./qaifta_fine_vanbert_pos\checkpoint-2900\config.json
Model weights saved in ./qaifta_fine_vanbert_pos\checkpoint-2900\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_vanbert_pos\checkpoint-3000
Configuration saved in ./qaifta_fine_vanbert_pos\checkpoint-3000\config.json
Model weights saved in ./qaifta_fine_vanbert_pos\checkpoint-3000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_vanbert_pos\checkpoint-3100
Configuration saved in ./qaifta_fine_vanbert_pos\checkpoint-3100\config.json
Model weights saved in ./qaifta_fine_vanbert_pos\checkpoint-3100\pytorch_model.bin
Deleting older check

TrainOutput(global_step=3435, training_loss=0.16696682375417873, metrics={'train_runtime': 1857.3434, 'train_samples_per_second': 22.112, 'train_steps_per_second': 1.849, 'total_flos': 1.073145779914752e+16, 'train_loss': 0.16696682375417873, 'epoch': 15.0})

In [6]:
# Continue fine-tuning the 'qa_bioasq_fine_kidneybert' checkpoint on the IFTA
# extractive-QA task.
qa_bioasq_kidneyBert = AutoModelForQuestionAnswering.from_pretrained(
    "qa_bioasq_fine_kidneybert/checkpoint-150"
)

# One cadence (in optimizer steps) shared by logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaifta_fine_bioasq_kidneybert',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpointing ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split doubles as the evaluation set during training.
trainer = Trainer(model=qa_bioasq_kidneyBert, args=training_args,
                  train_dataset=train_dataset, eval_dataset=test_dataset,
                  compute_metrics=compute_metrics)

trainer.train()



***** Running training *****
  Num examples = 2738
  Num Epochs = 15
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 3435


Step,Training Loss,Validation Loss,Accuracy
100,0.6762,0.208134,0.929927
200,0.2242,0.17083,0.924088
300,0.1479,0.155113,0.948905
400,0.128,0.161913,0.945985
500,0.1498,0.260255,0.945985
600,0.118,0.189022,0.950365
700,0.1061,0.174233,0.944526
800,0.1177,0.245111,0.945985
900,0.1024,0.19892,0.925547
1000,0.0944,0.220292,0.940146


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioasq_kidneybert\checkpoint-100
Configuration saved in ./qaifta_fine_bioasq_kidneybert\checkpoint-100\config.json
Model weights saved in ./qaifta_fine_bioasq_kidneybert\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioasq_kidneybert\checkpoint-200
Configuration saved in ./qaifta_fine_bioasq_kidneybert\checkpoint-200\config.json
Model weights saved in ./qaifta_fine_bioasq_kidneybert\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_bioasq_kidneybert\checkpoint-300
Configuration saved in ./qaifta_fine_bioasq_kidneybert\checkpoint-300\config.json
Model weights saved in ./qaifta_fine_bioasq_kidneybert\checkpoint-300\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batc

KeyboardInterrupt: 

In [8]:
# Fine-tune Bio_ClinicalBERT on the IFTA extractive-QA task.
# (The variable name `qa_cinicalBert` is kept as-is because later cells rebind it.)
qa_cinicalBert = AutoModelForQuestionAnswering.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT"
)

# One cadence (in optimizer steps) shared by logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaifta_fine_clincalbert_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpointing ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split doubles as the evaluation set during training.
trainer = Trainer(model=qa_cinicalBert, args=training_args,
                  train_dataset=train_dataset, eval_dataset=test_dataset,
                  compute_metrics=compute_metrics)

trainer.train()

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at emily

Step,Training Loss,Validation Loss,Accuracy
100,2.4299,0.295083,0.922628
200,0.2533,0.258619,0.883212
300,0.2069,0.20271,0.945985
400,0.1609,0.250626,0.935766
500,0.1629,0.202432,0.947445
600,0.1314,0.175798,0.940146
700,0.1239,0.158054,0.931387
800,0.1196,0.243413,0.944526
900,0.1093,0.172788,0.934307
1000,0.0973,0.239777,0.928467


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_clincalbert_pos\checkpoint-100
Configuration saved in ./qaifta_fine_clincalbert_pos\checkpoint-100\config.json
Model weights saved in ./qaifta_fine_clincalbert_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_clincalbert_pos\checkpoint-200
Configuration saved in ./qaifta_fine_clincalbert_pos\checkpoint-200\config.json
Model weights saved in ./qaifta_fine_clincalbert_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_clincalbert_pos\checkpoint-300
Configuration saved in ./qaifta_fine_clincalbert_pos\checkpoint-300\config.json
Model weights saved in ./qaifta_fine_clincalbert_pos\checkpoint-300\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving

KeyboardInterrupt: 

In [7]:
# Fine-tune the checkpoint stored under './mlm_results_largeData' on the IFTA
# extractive-QA task.
qa_kidneyBert = AutoModelForQuestionAnswering.from_pretrained(
    "./mlm_results_largeData/checkpoint-1100"
)

# One cadence (in optimizer steps) shared by logging, evaluation and checkpointing.
steps = 100

training_args = TrainingArguments(
    output_dir='./qaifta_fine_mlm_largeData_pos',
    seed=0,
    # --- optimisation ---
    num_train_epochs=15,
    learning_rate=2e-5,
    warmup_steps=50,
    weight_decay=1e-2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- logging / evaluation / checkpointing ---
    logging_steps=steps,
    evaluation_strategy="steps",
    eval_steps=steps,
    save_steps=steps,
    save_total_limit=30,
    load_best_model_at_end=True,
)

# The test split doubles as the evaluation set during training.
trainer = Trainer(model=qa_kidneyBert, args=training_args,
                  train_dataset=train_dataset, eval_dataset=test_dataset,
                  compute_metrics=compute_metrics)

trainer.train()

Some weights of the model checkpoint at ./mlm_results_largeData/checkpoint-1100 were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at ./mlm_results_largeData/c

Step,Training Loss,Validation Loss,Accuracy
100,2.4588,0.220624,0.929927
200,0.2252,0.191555,0.89635
300,0.1757,0.190833,0.945985
400,0.1464,0.141437,0.947445
500,0.1316,0.184365,0.945985
600,0.1546,0.185172,0.934307
700,0.1313,0.161291,0.943066
800,0.1126,0.231033,0.941606
900,0.121,0.288754,0.928467
1000,0.1151,0.26073,0.931387


***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_mlm_largeData_pos\checkpoint-100
Configuration saved in ./qaifta_fine_mlm_largeData_pos\checkpoint-100\config.json
Model weights saved in ./qaifta_fine_mlm_largeData_pos\checkpoint-100\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_mlm_largeData_pos\checkpoint-200
Configuration saved in ./qaifta_fine_mlm_largeData_pos\checkpoint-200\config.json
Model weights saved in ./qaifta_fine_mlm_largeData_pos\checkpoint-200\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batch size = 12
Saving model checkpoint to ./qaifta_fine_mlm_largeData_pos\checkpoint-300
Configuration saved in ./qaifta_fine_mlm_largeData_pos\checkpoint-300\config.json
Model weights saved in ./qaifta_fine_mlm_largeData_pos\checkpoint-300\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 685
  Batc

KeyboardInterrupt: 

In [13]:
def get_result_report(model):
    """Print an exact-match evaluation report for a QA model on the current test split.

    Reads the notebook globals `test_dataset`, `test_ans`, `test_ids` and `tokenizer`.
    For every test example the predicted span is decoded back to text and compared,
    case-insensitively, with the gold answer string.

    Prints each mismatching (prediction, gold) pair, the overall exact-match accuracy,
    the number of predictions that are not one of the known grade strings, and a
    confusion matrix / classification report restricted to predictions that are
    known grade strings. Returns None.
    """
    predictions, _, _ = Trainer(model).predict(test_dataset)
    start_logits, end_logits = predictions
    span_starts = np.argmax(start_logits, axis=1)   # most likely first token of the answer
    span_ends = np.argmax(end_logits, axis=1) + 1   # +1 makes the end index exclusive

    # Answer strings that count as a valid grade; "" stands for "no answer".
    known_answers = {"severe", "moderate", "mild", "minimal", "no significant", ""}

    predicted_labels, gold_labels = [], []
    n_examples = n_exact = n_unknown = 0
    for begin, stop, gold, token_ids in zip(span_starts, span_ends, test_ans, test_ids):
        n_examples += 1
        decoded = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(token_ids[begin:stop]))

        # A span covering only position 0 is treated as "no answer".
        if begin == 0 and stop == 1:
            decoded = ""

        guess, truth = decoded.lower(), gold.lower()

        if guess in known_answers:
            predicted_labels.append(guess)
            gold_labels.append(truth)
        else:
            n_unknown += 1

        if guess == truth:
            n_exact += 1
        else:
            print(guess, truth)

    print(f"Overall exact match accuracy: {n_exact/n_examples}")
    print(f"Number of predictions not in labels: {n_unknown}")
    print(confusion_matrix(gold_labels, predicted_labels))
    print(classification_report(gold_labels, predicted_labels))

In [20]:
# Run the fine-tuned QA model on one hand-picked report and print the predicted grade.
question = "What is the grade of interstitial fibrosis and tubular atrophy?"

context = ", MICROSCOPIC DESCRIPTION \n Light Microscopy (LM):  The following LM findings are based on hematoxylin and eosin (H&E), periodic acid-Schiff (PAS), and Masson trichrome-stained sections.  The specimen submitted for light microscopic evaluation consists of cortical tissue with at least 119 glomeruli, 6 of which are globally sclerotic.  No segmentally sclerosed glomeruli are seen.  The viable glomeruli are either unremarkable or show focal mild mesangial expansion.  The peripheral capillary walls are thin and regular.   No significant glomerulitis is seen.  No crescents, proliferation of capillary cells or necrosis of capillary tufts are identified.  The tubulointerstitium shows acute tubular injury and mild interstitial fibrosis and tubular atrophy (~5%).  No significant interstitial inflammation, tubulitis, peritubular capillaritis, vasculitis or viral cytopathic changes is identified.  The arteries show moderate intimal thickening and arterioles show moderate hyalinosis"

# NOTE: this rebinds the global `inputs`, which the data-loading cells use for the
# list of report texts; re-run the data cell before reusing `inputs` as a list.
inputs = tokenizer(question, context, padding="max_length", truncation=True, 
                                return_tensors="pt",max_length=512,return_offsets_mapping=True) 
# offset_mapping is not a model input; remove it before unpacking into the model call.
offset_mapping = inputs.pop("offset_mapping")


qa_kidneyBert.cpu()
# Inference only: switch off dropout (the model may come straight from a training
# cell) and skip building the autograd graph.
qa_kidneyBert.eval()
with torch.no_grad():
    answer = qa_kidneyBert(**inputs)
answer_start_scores, answer_end_scores = answer["start_logits"], answer["end_logits"]
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score

# Decode the predicted token span back into text.
pred_ans = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

print("Report: \n", context)
print("\nPredicted grade: ", pred_ans)

Report: 
 , MICROSCOPIC DESCRIPTION 
 Light Microscopy (LM):  The following LM findings are based on hematoxylin and eosin (H&E), periodic acid-Schiff (PAS), and Masson trichrome-stained sections.  The specimen submitted for light microscopic evaluation consists of cortical tissue with at least 119 glomeruli, 6 of which are globally sclerotic.  No segmentally sclerosed glomeruli are seen.  The viable glomeruli are either unremarkable or show focal mild mesangial expansion.  The peripheral capillary walls are thin and regular.   No significant glomerulitis is seen.  No crescents, proliferation of capillary cells or necrosis of capillary tufts are identified.  The tubulointerstitium shows acute tubular injury and mild interstitial fibrosis and tubular atrophy (~5%).  No significant interstitial inflammation, tubulitis, peritubular capillaritis, vasculitis or viral cytopathic changes is identified.  The arteries show moderate intimal thickening and arterioles show moderate hyalinosis

Pre

In [14]:
# Evaluate checkpoint-1500 of the './qaifta_fine_vanbert_pos' run with get_result_report,
# which scores it against the globals test_dataset / test_ans / test_ids.
# NOTE(review): the variable is called `qa_cinicalBert`, but the path points at the
# vanilla-BERT ("vanbert") output directory; presumably a copy-paste leftover. Confirm
# which model this cell is meant to report on.
qa_cinicalBert = AutoModelForQuestionAnswering.from_pretrained("./qaifta_fine_vanbert_pos/checkpoint-1500")
get_result_report(qa_cinicalBert)

loading configuration file ./qaifta_fine_vanbert_pos/checkpoint-1500\config.json
Model config BertConfig {
  "_name_or_path": "./qaifta_fine_vanbert_pos/checkpoint-1500",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file ./qaifta_fine_vanbert_pos/checkpoint-1500\pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights of BertForQuesti

 moderate
 no significant
[cls] what is the grade of interstitial fibrosis and tubular atrophy? [sep] ls : ajh edited by : 03 / 17 / 17 - 1105 16192 > < microscopic description \ n clinical history : 36 year old male with esrd secondary to fsgs. light microscopy ( lm ) : the following lm findings are based on hematoxylin and eosin ( h & e ), periodic acid - schiff ( pas ), and masson trichrome - stained sections. the material submitted for lm contains a single wedge of renal cortical tissue. more than 30 glomeruli are present for examination, of which none are globally sclerotic. the non - sclerotic glomeruli contain open capillary loops with no significant glomerulitis. the capillary walls are of normal thickness and contours. no significant mesangial matrix accumulation or mesangial hypercellularity is identified. the tubulointerstitium shows mild acute tubular injury but is otherwise generally unremarkable with the tubules arranged in a back - to - back orientation. no significant 


In [15]:
# Evaluate checkpoint-700 of the './qaifta_fine_clincalbert_pos' run with get_result_report,
# which scores it against the globals test_dataset / test_ans / test_ids.
# Step 700 has the lowest validation loss among the logged rows shown for that run.
# This rebinds `qa_cinicalBert`, replacing the in-memory model from the training cell.
qa_cinicalBert = AutoModelForQuestionAnswering.from_pretrained("./qaifta_fine_clincalbert_pos/checkpoint-700")
get_result_report(qa_cinicalBert)

loading configuration file ./qaifta_fine_clincalbert_pos/checkpoint-700\config.json
Model config BertConfig {
  "_name_or_path": "./qaifta_fine_clincalbert_pos/checkpoint-700",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file ./qaifta_fine_clincalbert_pos/checkpoint-700\pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights of BertForQuestionAnswering were initializ

Overall exact match accuracy: 0.9313868613138686
Number of predictions not in labels: 21
[[ 46   0   0   0  18   0]
 [  0 201   2   0   0   0]
 [  0   0  67   0   1   0]
 [  0   1   0  75   0   0]
 [  0   2   0   1 213   0]
 [  0   1   0   0   0  36]]
                precision    recall  f1-score   support

                     1.00      0.72      0.84        64
          mild       0.98      0.99      0.99       203
       minimal       0.97      0.99      0.98        68
      moderate       0.99      0.99      0.99        76
no significant       0.92      0.99      0.95       216
        severe       1.00      0.97      0.99        37

      accuracy                           0.96       664
     macro avg       0.98      0.94      0.95       664
  weighted avg       0.96      0.96      0.96       664



In [22]:
# Evaluate checkpoint-400 of the './qaifta_fine_mlm_largeData_pos' run with get_result_report,
# which scores it against the globals test_dataset / test_ans / test_ids.
# Step 400 has the lowest validation loss among the logged rows shown for that run.
# This rebinds `qa_kidneyBert`, which the single-report inference cell also reads.
qa_kidneyBert = AutoModelForQuestionAnswering.from_pretrained("./qaifta_fine_mlm_largeData_pos/checkpoint-400")
get_result_report(qa_kidneyBert)

loading configuration file ./qaifta_fine_mlm_largeData_pos/checkpoint-400\config.json
Model config BertConfig {
  "_name_or_path": "./qaifta_fine_mlm_largeData_pos/checkpoint-400",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file ./qaifta_fine_mlm_largeData_pos/checkpoint-400\pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights of BertForQuestionAnswering were ini

Overall exact match accuracy: 0.9474452554744526
Number of predictions not in labels: 3
[[ 48   0   0   0  26   0]
 [  0 202   1   0   0   0]
 [  0   0  70   0   1   0]
 [  0   1   0  75   0   0]
 [  1   0   0   1 220   0]
 [  1   1   0   0   0  34]]
                precision    recall  f1-score   support

                     0.96      0.65      0.77        74
          mild       0.99      1.00      0.99       203
       minimal       0.99      0.99      0.99        71
      moderate       0.99      0.99      0.99        76
no significant       0.89      0.99      0.94       222
        severe       1.00      0.94      0.97        36

      accuracy                           0.95       682
     macro avg       0.97      0.93      0.94       682
  weighted avg       0.95      0.95      0.95       682



## Using the prediction results from QARej to solve isRej

In [13]:
def get_pred_answer(p,ids):
    """Decode the highest-scoring answer span of every example into text.

    Args:
        p: pair ``(start_scores, end_scores)``; each is reduced with
            ``np.argmax(..., axis=1)``, so one row of scores per example.
        ids: iterable of per-example token-id sequences, in the same order as
            the score rows; each one is sliced with the predicted span.

    Returns:
        A list with one string per example. A span that starts at position 0
        and ends right after it (start == 0, end == 1) is reported as a single
        space ``" "``; every other span is decoded with the notebook-level
        ``tokenizer``, lower-cased, and has newlines replaced by spaces.
    """
    start_scores, end_scores = p
    span_starts = np.argmax(start_scores, axis=1)
    # +1 turns the inclusive end position into an exclusive slice bound.
    span_stops = np.argmax(end_scores, axis=1) + 1

    def _span_to_text(token_ids, lo, hi):
        tokens = tokenizer.convert_ids_to_tokens(token_ids[lo:hi])
        decoded = tokenizer.convert_tokens_to_string(tokens)
        points_at_first_token = (lo == 0 and hi == 1)
        return " " if points_at_first_token else decoded.lower().replace('\n', ' ')

    return [
        _span_to_text(token_ids, lo, hi)
        for lo, hi, token_ids in zip(span_starts, span_stops, ids)
    ]



In [None]:
# Load the question-answering checkpoints of the "qaabmr" and "qatcmr" runs.
# Note: qa_abmr_kidneyBert is assigned again from checkpoint-1400 in the next
# code cell, so the checkpoint-600 weights loaded here are superseded once that
# cell runs.
# NOTE(review): "tcmr" presumably stands for T-cell-mediated rejection -- confirm.
qa_abmr_kidneyBert = AutoModelForQuestionAnswering.from_pretrained("./qaabmr_fine_mlm_largeData_pos/checkpoint-600")
qa_tcmr_kidneyBert = AutoModelForQuestionAnswering.from_pretrained("./qatcmr_fine_mlm_largeData_pos/checkpoint-300")

In [26]:
batch_size = 12

# Read the reports and drop every row whose QA report text is missing (NaN),
# filtering the ABMR labels and the stratification classes with the same mask so
# the three lists stay aligned.
data = pd.read_csv("data.csv")
inputs1 = data["train_report_qa"].tolist()
label1 = data["ABMR"].tolist()
label = [l for i,l in zip(inputs1,label1) if str(i)!="nan"]
inputs = [i for i in inputs1 if str(i)!="nan"]

label_class_help = data["abmr_class"].tolist()
label_class = [l for i,l in zip(inputs1,label_class_help) if str(i)!="nan"]

# Fixed seed + stratification on abmr_class keeps the 80/20 split reproducible.
train_text, test_text, abmr_train_labels, abmr_test_labels = train_test_split(
    inputs, label,random_state = 1,stratify=label_class,test_size=0.2)

q_abmr = "How is the antibody-mediated rejection?"
# NOTE(review): the three-argument gen_datasets defined at the top of the notebook
# builds its datasets from the notebook-level train_labels/test_labels, not from
# abmr_train_labels/abmr_test_labels -- confirm those globals match this split.
train_dataset,test_dataset = gen_datasets(q_abmr,train_text,test_text)
abmr_train_loader = torch.utils.data.DataLoader(train_dataset,batch_size = batch_size)
abmr_test_loader = torch.utils.data.DataLoader(test_dataset,batch_size = batch_size)

# test_ans = []
# for i,l in zip(test_text,test_labels):
#     test_ans.append(i[l[0]:l[1]])

# Gather the token ids in loader order with a single concatenation per split.
# Seeding the accumulation with an empty float tensor promoted the ids to float
# and re-copied the growing buffer on every batch; one cat over the batches keeps
# the integer dtype and copies each batch once.
test_ids = torch.cat([batch["input_ids"] for batch in abmr_test_loader], 0)
train_ids = torch.cat([batch["input_ids"] for batch in abmr_train_loader], 0)

# Predict answer spans for both splits with the checkpoint-1400 ABMR model and
# decode them to text.
qa_abmr_kidneyBert = AutoModelForQuestionAnswering.from_pretrained("./qaabmr_fine_mlm_largeData_pos/checkpoint-1400")
qa_abmr_kidneyBert_trainer = Trainer(qa_abmr_kidneyBert)
raw_pred,_,_=qa_abmr_kidneyBert_trainer.predict(train_dataset)
abmr_train_ans = get_pred_answer(raw_pred,train_ids)

raw_pred,_,_=qa_abmr_kidneyBert_trainer.predict(test_dataset)
abmr_test_ans = get_pred_answer(raw_pred,test_ids)

loading configuration file ./qaabmr_fine_mlm_largeData_pos/checkpoint-1400\config.json
Model config BertConfig {
  "_name_or_path": "./qaabmr_fine_mlm_largeData_pos/checkpoint-1400",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file ./qaabmr_fine_mlm_largeData_pos/checkpoint-1400\pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights of BertForQuestionAnswering were 

***** Running Prediction *****
  Num examples = 685
  Batch size = 8


In [28]:
# Print the predicted answer next to the label for every training report whose
# ABMR label equals 1.
for predicted_answer, abmr_label in zip(abmr_train_ans, abmr_train_labels):
    if not (abmr_label == 1):
        continue
    print(predicted_answer, abmr_label)

  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
with evidence of chronic antibody mediated rejection 1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
biopsy on 11 / 3 / 2014 showed antibody - mediated rejection 1


In [24]:
# Print the predicted answer next to the label for every test report whose
# ABMR label equals 1.
for predicted_answer, abmr_label in zip(abmr_test_ans, abmr_test_labels):
    if not (abmr_label == 1):
        continue
    print(predicted_answer, abmr_label)

  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
  1
with multiple episodes of antibody - mediated rejection 1
  1


In [None]:
def gen_datasets(train_text_0,train_text,test_text_0,test_text,tokenizer=tokenizer):
    """Tokenize paired train/test texts and wrap them as ``RenalDataset`` objects.

    This definition replaces the earlier ``gen_datasets`` in the notebook and has
    a different positional signature (two text lists per split).

    Args:
        train_text_0, test_text_0: first text of each pair, passed as the first
            tokenizer argument.
        train_text, test_text: second text of each pair.
        tokenizer: tokenizer to call; defaults to the notebook-level tokenizer
            that exists when this cell is run.

    Returns:
        ``(train_dataset, test_dataset)``. Labels are not parameters: they are
        read from the notebook-level ``train_labels`` and ``test_labels`` when
        the function is called.
    """
    def _encode_pairs(first_texts, second_texts):
        # Pad/truncate every pair to 512 tokens and keep the character offsets.
        return tokenizer(
            first_texts,
            second_texts,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
            return_offsets_mapping=True,
        )

    encoded_train = _encode_pairs(train_text_0, train_text)
    encoded_test = _encode_pairs(test_text_0, test_text)
    return RenalDataset(encoded_train, train_labels), RenalDataset(encoded_test, test_labels)

class RenalDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: mapping (must provide ``.items()``) from field name to an
            indexable collection with one entry per example.
        labels: indexable, sized collection with one label per example; its
            length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _tensor_copy(value):
        """Return ``value`` as a tensor that does not share storage with it.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning on
        every call; ``detach().clone()`` is the equivalent, warning-free copy.
        Non-tensor values still go through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus ``'labels'``."""
        item = {key: self._tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)
def compute_metrics(p):
    """Score class predictions against labels.

    Args:
        p: pair ``(scores, labels)``; the predicted class of each row is the
            ``argmax`` of ``scores`` along axis 1.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        precision, recall and F1 are micro-averaged.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)
    micro_kwargs = dict(y_true=gold, y_pred=predicted, average="micro")

    accuracy = accuracy_score(y_true=gold, y_pred=predicted)
    recall = recall_score(**micro_kwargs)
    precision = precision_score(**micro_kwargs)
    f1 = f1_score(**micro_kwargs)
    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

In [41]:
a = torch.tensor(1)

In [44]:
if a<2:
    print(1)

1


In [60]:
question = "What is the grade of interstitial fibrosis and tubular atrophy?"

context = ", MICROSCOPIC DESCRIPTION \n Light Microscopy (LM):  The following LM findings are based on hematoxylin and eosin (H&E), periodic acid-Schiff (PAS), and Masson trichrome-stained sections.  The specimen submitted for light microscopic evaluation consists of cortical tissue with at least 119 glomeruli, 6 of which are globally sclerotic.  No segmentally sclerosed glomeruli are seen.  The viable glomeruli are either unremarkable or show focal mild mesangial expansion.  The peripheral capillary walls are thin and regular.   No significant glomerulitis is seen.  No crescents, proliferation of capillary cells or necrosis of capillary tufts are identified.  The tubulointerstitium shows acute tubular injury and mild interstitial fibrosis and tubular atrophy (~5%).  No significant interstitial inflammation, tubulitis, peritubular capillaritis, vasculitis or viral cytopathic changes is identified.  The arteries show moderate intimal thickening and arterioles show moderate hyalinosis"
# qa_kidneyBert = AutoModelForQuestionAnswering.from_pretrained("./mlm_results_largeData/checkpoint-1100")
# 1. TOKENIZE THE INPUT
# note: if you don't include return_tensors='pt' you'll get a list of lists which is easier for
# exploration but you cannot feed that into a model.
inputs = tokenizer(question, context, padding="max_length", truncation=True,
                                return_tensors="pt",max_length=512,return_offsets_mapping=True)
# The offsets are kept for inspection only; they are removed from `inputs` before
# the remaining fields are unpacked into the model call below.
offset_mapping = inputs.pop("offset_mapping")
# 2. OBTAIN MODEL SCORES
# the AutoModelForQuestionAnswering class includes a span predictor on top of the model.
# the model returns answer start and end scores for each word in the text
qa_kidneyBert.cpu()
# Inference only: disable autograd so no graph is recorded for the forward pass.
with torch.no_grad():
    answer = qa_kidneyBert(**inputs)
answer_start_scores, answer_end_scores = answer["start_logits"], answer["end_logits"]
answer_start = torch.argmax(answer_start_scores)  # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # get the most likely end of answer with the argmax of the score

# 3. GET THE ANSWER SPAN
# once we have the most likely start and end tokens, we grab all the tokens between them
# and convert tokens back to words!
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

'mild'

In [46]:
offset_mapping[0][205]

tensor([711, 715])

In [57]:
for i,z in zip(test_text,test_labels):
    print(i[z[0]:z[1]])
    break

mild


In [40]:
(question+"  "+context)[716:721]

'inter'

In [18]:
for i in test_e:
    print(i)
    break

tensor(207.)


In [14]:
context[712:716]

'ild '

In [15]:
answer_start,answer_end

(tensor(205), tensor(206))

In [22]:
#  t = offset_mapping[0]
int((t == 101).nonzero(as_tuple=True)[0])

0

In [25]:
list(t[0]).index(101)

0

In [16]:
inputs["input_ids"]

In [23]:
context = "I like eating apples."
inputs = tokenizer(context, padding="max_length", truncation=True, 
                                return_tensors="pt",max_length=512,return_offsets_mapping=True) 
inputs["input_ids"][0][:40]

tensor([  101,   178,  1176,  5497, 22888,   119,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [31]:
# offset_mapping = inputs.pop("offset_mapping")
offset_mapping[0][:40]+1

tensor([[ 1,  1],
        [ 1,  5],
        [ 6,  8],
        [ 9, 12],
        [13, 18],
        [19, 21],
        [22, 27],
        [27, 29],
        [29, 32],
        [32, 34],
        [35, 37],
        [37, 40],
        [40, 43],
        [44, 47],
        [48, 51],
        [51, 55],
        [56, 58],
        [58, 61],
        [61, 63],
        [63, 64],
        [ 1,  1],
        [ 1,  2],
        [ 3,  8],
        [ 8, 14],
        [15, 26],
        [27, 28],
        [28, 29],
        [30, 35],
        [36, 41],
        [41, 44],
        [44, 46],
        [47, 48],
        [48, 49],
        [49, 50],
        [50, 51],
        [51, 52],
        [54, 57],
        [58, 67],
        [68, 69],
        [69, 70]])

In [30]:
answer

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-2.3183e-01, -2.3798e-01, -1.8489e-01, -2.9106e-02, -3.0932e-01,
          1.8659e-01,  3.1977e-02,  1.8660e-01,  7.9363e-02,  2.4333e-01,
         -1.9067e-01, -2.8963e-01, -2.1170e-02, -5.5045e-02, -3.3944e-01,
         -7.2507e-02, -2.8349e-01, -4.4287e-01, -2.6857e-01, -4.8541e-01,
          2.0282e-01,  1.5761e-02, -3.1923e-01, -3.7885e-01,  1.9984e-01,
         -1.6532e-01, -1.8265e-01,  1.4609e-01, -4.4513e-01, -4.9060e-01,
         -1.1417e-01, -3.6351e-01,  4.3571e-02,  1.9040e-03, -1.3833e-01,
          1.6810e-01, -1.2825e-01, -2.5961e-01, -7.1122e-02, -1.3075e-01,
         -3.4616e-02, -2.2744e-01, -4.1331e-01, -5.9271e-03, -9.2514e-02,
          2.6987e-01, -5.0002e-02, -1.5550e-01, -5.2464e-02, -1.3158e-01,
          2.3259e-01, -1.1532e-01, -3.8052e-01,  1.7588e-01,  9.7117e-02,
         -1.3237e-01, -3.6330e-01,  4.7035e-02,  4.0960e-02, -2.4112e-01,
          5.9508e-01,  3.6439e-01, -9.1572e-02, -1.7662e-01

In [33]:
# Probe: can accuracy_score compare (start, end) span pairs directly?
# It cannot -- with tuple-valued targets the call below raises
# "ValueError: multiclass-multioutput is not supported" (see the cell output).
y_pred = [(0, 2),( 1, 3)]
y_true = [(0, 1), (2, 3)]
accuracy_score(y_true, y_pred)

ValueError: multiclass-multioutput is not supported