In [None]:
!pip install -q openprompt==0.1.1 \
'torch>=1.9.0' \
'transformers>=4.10.0' \
sentencepiece==0.1.96 \
'scikit-learn>=0.24.2' \
'tqdm>=4.62.2' \
tensorboardX \
nltk \
yacs \
dill \
datasets \
rouge==1.0.0 \
scipy==1.4.1 \
fugashi \
ipadic \
unidic-lite

# BERT
OpenPrompt

In [1]:
import openprompt.plms as plms
from openprompt.plms.mlm import MLMTokenizerWrapper
from transformers import BertConfig, BertForMaskedLM, BertTokenizer

In [2]:
plms._MODEL_CLASSES['bert'] = plms.ModelClass(**{
    'config': BertConfig,
    'tokenizer': BertTokenizer,
    'model':BertForMaskedLM,
    'wrapper': MLMTokenizerWrapper,
})

In [3]:
plms._MODEL_CLASSES

{'bert': ModelClass(config=<class 'transformers.models.bert.configuration_bert.BertConfig'>, tokenizer=<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, model=<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>, wrapper=<class 'openprompt.plms.mlm.MLMTokenizerWrapper'>),
 'roberta': ModelClass(config=<class 'transformers.models.roberta.configuration_roberta.RobertaConfig'>, tokenizer=<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, model=<class 'transformers.models.roberta.modeling_roberta.RobertaForMaskedLM'>, wrapper=<class 'openprompt.plms.mlm.MLMTokenizerWrapper'>),
 'albert': ModelClass(config=<class 'transformers.models.albert.configuration_albert.AlbertConfig'>, tokenizer=<class 'transformers.models.albert.tokenization_albert.AlbertTokenizer'>, model=<class 'transformers.models.albert.modeling_albert.AlbertForMaskedLM'>, wrapper=<class 'openprompt.plms.mlm.MLMTokenizerWrapper'>),
 'gpt': ModelClass(config=<class 'transfo

# Step 1: Define a task
|

In [9]:
from openprompt.data_utils import InputExample
classes = [ 
    "Obesity",
    "not Obese"
]

dataset = [ 
      InputExample(
        guid = 0,
        text_a = """Abdomen was obese , soft , no palpable
            masses , normal bowel sounds. Groin showed no hernias. Pulses were
            significant for 1+ dopplerable femorals bilaterally , 2+
            dopplerable dorsalis pedis bilaterally , 1+ dopplerable posterior
            tibial on the right , 2+ dopplerable posterior tibial on the left
            and 2+ dopplerable popliteals. Rectal exam was guaiac positive
            with some pain and no visible lesions. ABI was .66 of the dorsalis
            pedis on the right , .44 of the dorsalis pedis on the left; .61
            posterior tibial on the right and .66 posterior tibial on the left.""", #obese
    ),
    InputExample(
        guid = 1,
        text_a = """"Her admission physical examination was significant for
            temperature of 100.2 , blood pressure of 102/53 , and saturating
            98% on 2 liters. The patient was mildly anxious. She was
            normocephalic and atraumatic and had surgical pupils bilaterally.
            Her neck was supple and her jugular venous pressure was 8 cm.
            She had decreased breath sounds throughout and had fine scattered
            rales throughout her lung fields. Heart was regular rate and
            rhythm with normal S1 and S2 with a 2/6 systolic ejection murmur
            at the right upper sternal border without any radiation. Her
            abdomen was obese , soft , nontender , and nondistended with good
            bowel sounds. Her extremities revealed no clubbing , cyanosis , or
            edema. She did have a right arm fistula for hemodialysis with a
            good thrill. Neurologically , she was alert and oriented x3
            and had a steady gait with a walker.""", #obsese
    ),
#     InputExample(
#         guid = 2,
#         text_a = "When your brain is damaged, it can affect many different things, including your memory, your sensation, and even your personality. Brain disorders include any conditions or disabilities that affect your brain.", #brain
#     ),
#     InputExample(
#         guid = 3,
#         text_a = "Symptoms may appear 2-14 days after exposure to the virus", #virus
#     ),
        InputExample(
        guid = 4,
        text_a = """The patient is healthy and is having a balanced diet.""", #brain
    ),
]



# Step 2: Define a Pre-trained Language Models (PLMs)

In [10]:
from transformers import AutoTokenizer, AutoModel
from openprompt.plms import load_plm
# plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-uncased")
plm, tokenizer, model_config, WrapperClass = load_plm("bert", "emilyalsentzer/Bio_ClinicalBERT")
# plm, tokenizer, model_config, WrapperClass = load_plm("bert", "mrm8488/bioclinicalBERT-finetuned-covid-papers")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
tokenizer.tokenize("I think this drug is not a solution")

['i', 'think', 'this', 'drug', 'is', 'not', 'a', 'solution']

# Step 3: Define a Template.


In [12]:
from openprompt.prompts import ManualTemplate
from openprompt.prompts import PtuningTemplate
# template_text = '{"placeholder":"text_a"}: This effects {"mask"}'
template_text= 'A {"mask"} disorder :  {"placeholder": "text_a"}'

# promptTemplate = ManualTemplate(
#     text = template_text,
#     tokenizer = tokenizer,
# )

promptTemplate = PtuningTemplate(model = plm, 
                                 tokenizer = tokenizer, 
                                 text = template_text, 
                                 prompt_encoder_type = 'mlm'
)

# Step 4: Define a Verbalizer


In [13]:
from openprompt.prompts import ManualVerbalizer
promptVerbalizer = ManualVerbalizer(
    classes = classes,
    label_words = {
        "Obesity": ["obesity", "obese", "overweight"],
        "not Obese": ["healthy", "proper diet", "underweight"] #"healthy", 
    },
    tokenizer = tokenizer,
)

# Step 5: Combine them into a PromptModel
Given the task, now we have a PLM, a Template and a Verbalizer,  combine them into a PromptModel. 

In [14]:
from openprompt import PromptForClassification
promptModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer
)

# Step 6: Define a DataLoader
A PromptDataLoader is basically a prompt version of pytorch Dataloader, which also includes a Tokenizer, a Template and a TokenizerWrapper.

In [15]:
from openprompt import PromptDataLoader
data_loader = PromptDataLoader(
    dataset = dataset,
    tokenizer = tokenizer, 
    template = promptTemplate, 
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=256, decoder_max_length=3, 
    batch_size=1,shuffle=False, teacher_forcing=False, predict_eos_token=False,
    truncate_method="head"
)


tokenizing: 3it [00:00, 334.13it/s]


# Step 7: Train and inference
Done! We can conduct training and inference the same as other processes in Pytorch.

In [16]:
# making zero-shot inference using pretrained MLM with prompt
import torch
promptModel.eval()
with torch.no_grad():
    for batch in data_loader:
        logits = promptModel(batch)
        print(logits)
        preds = torch.argmax(logits, dim = -1)
        print(classes[preds])
# predictions would be 1, 0 for classes 'positive', 'negative'


tensor([[-1.9434, -3.5926]])
Obesity
tensor([[-2.5573, -2.5847]])
Obesity
tensor([[-6.8596, -4.7382]])
not Obese
