In [5]:
from transformers import AutoConfig, AutoTokenizer, AutoModel
from data.acronymDataset import AcronymDataset
import torch.nn as nn
import pandas as pd

In [6]:
model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

# Config, tokenizer and encoder weights are all resolved from the same
# checkpoint name so the three objects stay consistent with each other.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
pre_trained_model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
file_path = 'data/acronym_data.txt'
data = []

# Column order of the resulting DataFrame; also lets an empty input file
# produce an empty (but correctly shaped) frame instead of failing on data[0].
columns = ['source_sentence', 'compare_sentence', 'label', 'acronym']

# Each record is one '|'-separated line. Fields read below:
#   0 -> acronym, 1 -> full name, 3 / 4 -> character offsets into field 6,
#   6 -> sentence containing the acronym.
# NOTE(review): no explicit encoding is given, so the platform default is used
# and undecodable bytes are silently dropped (errors='ignore') — confirm intended.
with open(file_path, "r", errors='ignore') as file:
    # Iterate the handle lazily rather than loading every line with readlines().
    for line in file:
        record = line.strip()
        if not record:
            # Blank lines (e.g. a trailing newline) hold no record; without
            # this guard split[6] below would raise IndexError.
            continue
        split = record.split('|')

        # build the sentence structure
        source_sentence = split[6]
        full_name = split[1]
        # The end offset is treated as inclusive, hence the + 1 when resuming
        # the sentence after the replaced span.
        start, end = int(split[3]), int(split[4])
        compare_sentence = source_sentence[:start] + full_name + source_sentence[end + 1:]

        row = {
            'source_sentence': source_sentence,
            'compare_sentence': compare_sentence,
            'label': 1,
            'acronym': split[0]
        }
        data.append(row)

# Pivot the list of row dicts into a dict of columns, then into a DataFrame.
data_dict = {key: [item[key] for item in data] for key in columns}
dataset = pd.DataFrame.from_dict(data_dict)


Unnamed: 0,source_sentence,compare_sentence,label,acronym
0,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...,1,AB
1,She is now bleeding quite heavily. Ultrasound ...,She is now bleeding quite heavily. Ultrasound ...,1,AB
2,ALLERGIES: Heparin and Imitrex. PAST OB HISTOR...,ALLERGIES: Heparin and Imitrex. PAST OB HISTOR...,1,AB
3,She had a pelvic ultrasound at Park Nicollet o...,She had a pelvic ultrasound at Park Nicollet o...,1,AB
4,"On _%#MMDD2007#%_, normal anatomy with anterio...","On _%#MMDD2007#%_, normal anatomy with anterio...",1,AB
...,...,...,...,...
37495,"1. Multiple myeloma, undergoing chemotherapy. ...","1. Multiple myeloma, undergoing chemotherapy. ...",1,VAD
37496,He has been receiving weekly Procrit injection...,He has been receiving weekly Procrit injection...,1,VAD
37497,"Within a month, he developed recurrent hip pai...","Within a month, he developed recurrent hip pai...",1,VAD
37498,He had a serum protein electrophoresis with a ...,He had a serum protein electrophoresis with a ...,1,VAD


In [13]:
# Bucket the rows by their acronym, then display the bucket for 'AVR'.
groups = dataset.groupby(by='acronym')
groups.get_group('AVR')

"See history below for likely etiology of the patient's endocarditis. PAST MEDICAL HISTORY: 1. The patient had a CAB and AVR with St. Jude valve in 1997. The patient subsequently developed third-degree heart block about a year ago and had a pacer placed and had this pacer replaced approximately 4 months ago secondary to one of the leads malfunctioning."

Test the multi-head model on the acronym dataset

In [3]:
file_path = 'data/acronym_data.txt'
# Only the loader's `.data` attribute is used by the following cells, so bind
# that directly to `dataset`.
dataset = AcronymDataset(file_path=file_path).data

In [4]:
# Encode the first example as a sentence pair (original sentence vs. the
# sentence with the acronym replaced) and return PyTorch tensors.
tokens = tokenizer(
    text=dataset[0]['source_sentence'],
    text_pair=dataset[0]['compare_sentence'],
    return_tensors='pt',
)
# Display another record for inspection.
dataset[16]

{'source_sentence': "She is morbidly obese at 300 pounds, and at that time in _%#MM#%_ of 2001, she defined a dysfunction, irregular bleeding pattern and has a history of a vacuum curettage in _%#MM2001#%_ for an early missed AB at approximately 7 weeks gestation. That was accomplished without difficulty. The patient's blood type is AB positive. She has continued to do well on previous Provera cycling which she has been undergoing.",
 'compare_sentence': "She is morbidly obese at 300 pounds, and at that time in _%#MM#%_ of 2001, she defined a dysfunction, irregular bleeding pattern and has a history of a vacuum curettage in _%#MM2001#%_ for an early missed abortion at approximately 7 weeks gestation. That was accomplished without difficulty. The patient's blood type is AB positive. She has continued to do well on previous Provera cycling which she has been undergoing.",
 'label': 1,
 'acronym': 'AB'}

In [6]:
from models.multiHeadModel import MultiHeadModel
from models.heads import ClassificationHead

In [7]:
# Every head consumes the encoder's hidden representation.
in_features = config.hidden_size

# Build the 2-label head first, then the 4-label head (same order as before so
# parameter initialisation happens in the same sequence).
two_labels_head, four_labels_head = (
    ClassificationHead(in_features=in_features, out_features=label_count)
    for label_count in (2, 4)
)

# Position in this list is what selects a head later on.
classifiers = [two_labels_head, four_labels_head]


In [8]:
# Combine the pre-trained encoder with the list of classification heads.
# NOTE(review): MultiHeadModel lives in models/multiHeadModel.py, which is not
# shown here — confirm how it stores and dispatches to the heads.
multi_head_model = MultiHeadModel(pre_trained_model, classifiers)

In [9]:
# Run the tokenised sentence pair through the model. The second argument is
# presumably the index into `classifiers` (0 -> two-label head) — TODO confirm
# against MultiHeadModel's forward signature.
output = multi_head_model(tokens, 0)
output

tensor([[0.4202, 0.5524]], grad_fn=<SigmoidBackward0>)

In [10]:
# Same input as before, now with index 1, presumably selecting the four-label
# head from `classifiers` — TODO confirm against MultiHeadModel's forward
# signature. Note this rebinds `output` from the previous cell.
output = multi_head_model(tokens, 1)
output

tensor([[0.4424, 0.4114, 0.5453, 0.5039]], grad_fn=<SigmoidBackward0>)

Insert a classifier layer in the middle of the encoder

In [None]:
def create_classifier_layer(layer_to_insert, model=None, num_labels=2):
    """Build an encoder layer stack with a linear classifier spliced in.

    Args:
        layer_to_insert: Position in the encoder's layer list at which the new
            ``nn.Linear`` is placed. Plain slice semantics apply, so negative
            or out-of-range values behave as they would for list slicing.
        model: Object exposing ``encoder.layer`` (the layer sequence) and
            ``config.hidden_size``. Defaults to the notebook-level
            ``pre_trained_model``.
        num_labels: Output width of the inserted classifier. Defaults to 2.

    Returns:
        nn.ModuleList: the layers before ``layer_to_insert``, the new
        classifier, then the remaining layers. The encoder layers are the same
        module objects held by ``model`` (they are not copied).

    NOTE(review): this only assembles a container of layers; nothing here wires
    it back into a model's forward pass. The classifier emits ``num_labels``
    features, so it presumably cannot feed the following encoder layer as-is —
    confirm how the result is meant to be consumed.
    """
    if model is None:
        model = pre_trained_model

    encoder_layers = model.encoder.layer

    # Size the classifier from the model's own config instead of a hard-coded
    # 768, so the helper keeps working for checkpoints with another width.
    classifier = nn.Linear(model.config.hidden_size, num_labels)

    # Build the ModuleList explicitly rather than relying on `ModuleList + list`.
    model_with_middle_classifier = nn.ModuleList([
        *encoder_layers[:layer_to_insert],
        classifier,
        *encoder_layers[layer_to_insert:],
    ])

    return model_with_middle_classifier

In [None]:
# Splice the extra classifier in at position 4 of the encoder's layer list and
# display the resulting layer sequence.
model_with_middle_classifier = create_classifier_layer(4)
model_with_middle_classifier