In [None]:
# Install dependencies with the %pip magic so they land in the environment of
# the running kernel rather than whichever `pip` happens to be first on PATH.
%pip install -q openprompt==0.1.1 \
'torch>=1.9.0' \
'transformers>=4.10.0' \
sentencepiece==0.1.96 \
'scikit-learn>=0.24.2' \
'tqdm>=4.62.2' \
tensorboardX \
nltk \
yacs \
dill \
datasets \
rouge==1.0.0 \
scipy==1.4.1 \
fugashi \
ipadic \
unidic-lite

# BERT
OpenPrompt

In [1]:
import openprompt.plms as plms
from openprompt.plms.mlm import MLMTokenizerWrapper
from transformers import BertConfig, BertForMaskedLM, BertTokenizer

In [2]:
# Register the BERT masked-LM components under the 'bert' key of the
# `plms._MODEL_CLASSES` mapping so that `load_plm("bert", ...)` can find them.
plms._MODEL_CLASSES['bert'] = plms.ModelClass(
    config=BertConfig,
    tokenizer=BertTokenizer,
    model=BertForMaskedLM,
    wrapper=MLMTokenizerWrapper,
)

In [3]:
# Display the model-class mapping to confirm that the 'bert' entry is present.
plms._MODEL_CLASSES

{'bert': ModelClass(config=<class 'transformers.models.bert.configuration_bert.BertConfig'>, tokenizer=<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, model=<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>, wrapper=<class 'openprompt.plms.mlm.MLMTokenizerWrapper'>),
 'roberta': ModelClass(config=<class 'transformers.models.roberta.configuration_roberta.RobertaConfig'>, tokenizer=<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, model=<class 'transformers.models.roberta.modeling_roberta.RobertaForMaskedLM'>, wrapper=<class 'openprompt.plms.mlm.MLMTokenizerWrapper'>),
 'albert': ModelClass(config=<class 'transformers.models.albert.configuration_albert.AlbertConfig'>, tokenizer=<class 'transformers.models.albert.tokenization_albert.AlbertTokenizer'>, model=<class 'transformers.models.albert.modeling_albert.AlbertForMaskedLM'>, wrapper=<class 'openprompt.plms.mlm.MLMTokenizerWrapper'>),
 'gpt': ModelClass(config=<class 'transfo

# Step 1: Define a task
Choose the target classes and wrap each input text in an `InputExample`.

In [13]:
from openprompt.data_utils import InputExample
# Candidate class names. Their order defines the class index that is used
# when predictions are decoded back into names.
classes = [
    "lung",
    "brain",
    "virus",
]

# Unlabelled examples for zero-shot inference. Every example carries a unique
# guid; the trailing comment on each text records the class a reader would
# expect the model to pick.
dataset = [
    InputExample(
        guid = 0,
        text_a = "Asthma affects lungs  and can be hard to diagnose. The signs of asthma can seem like the signs of COPD, pneumonia, bronchitis, pulmonary embolism, anxiety, and heart disease.",  # lung
    ),
    InputExample(
        guid = 1,
        text_a = "COVID-19 is caused by a coronavirus called SARS-CoV-2",  # virus
    ),
    InputExample(
        guid = 2,
        text_a = """Neurodegenerative diseases cause your brain and nerves to deteriorate over time. They can change your personality and cause confusion. They can also destroy your brain’s tissue and nerves.

Some brain diseases, such as Alzheimer’s disease, may develop as you age. """,  # brain
    ),
]



# Step 2: Define a Pre-trained Language Model (PLM)

In [14]:
from transformers import AutoTokenizer, AutoModel
from openprompt.plms import load_plm
# Load the "bert-base-uncased" checkpoint through the 'bert' model-class entry.
# The call returns the model, its tokenizer, its config and the tokenizer
# wrapper class that the data loader needs later.
# Alternative checkpoint: "mrm8488/bioclinicalBERT-finetuned-covid-papers".
plm, tokenizer, model_config, WrapperClass = load_plm(
    "bert",
    "bert-base-uncased",
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Sanity check: show how the loaded tokenizer splits a sample sentence.
tokenizer.tokenize("I think this drug is not a solution")

['i', 'think', 'this', 'drug', 'is', 'not', 'a', 'solution']

# Step 3: Define a Template.


In [16]:
from openprompt.prompts import ManualTemplate
from openprompt.prompts import PtuningTemplate
# Prompt layout: the mask slot comes first, followed by the example's text_a.
# Alternative layout: '{"placeholder":"text_a"}: This effects {"mask"}'
template_text = 'A {"mask"} disorder :  {"placeholder": "text_a"}'

# Hand-written template built from the layout above.
promptTemplate = ManualTemplate(tokenizer=tokenizer, text=template_text)

# Alternative template (not used here):
# promptTemplate = PtuningTemplate(
#     model=plm,
#     tokenizer=tokenizer,
#     text=template_text,
#     prompt_encoder_type='mlm',
# )

# Step 4: Define a Verbalizer


In [17]:
from openprompt.prompts import ManualVerbalizer
# Map every class name to the label word(s) that represent it.
promptVerbalizer = ManualVerbalizer(
    tokenizer=tokenizer,
    classes=classes,
    label_words={"lung": ["chest"], "brain": ["head"], "virus": ["virus"]},
)

# Step 5: Combine them into a PromptModel
Given the task, we now have a PLM, a Template and a Verbalizer; we combine them into a PromptModel.

In [18]:
from openprompt import PromptForClassification
# Bundle the PLM, the template and the verbalizer into one classification model.
promptModel = PromptForClassification(plm=plm, template=promptTemplate, verbalizer=promptVerbalizer)

# Step 6: Define a DataLoader
A PromptDataLoader is basically a prompt version of the PyTorch DataLoader; it also bundles a Tokenizer, a Template and a TokenizerWrapper.

In [22]:
from openprompt import PromptDataLoader
# Build the loader that feeds the examples to the prompt model, one example
# per batch and in their original order.
data_loader = PromptDataLoader(
    dataset=dataset,
    template=promptTemplate,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    max_seq_length=256,
    decoder_max_length=3,
    batch_size=1,
    shuffle=False,
    teacher_forcing=False,
    predict_eos_token=False,
    truncate_method="head",
)


tokenizing: 3it [00:00, 405.48it/s]


# Step 7: Train and inference
Done! Training and inference now work the same way as with any other PyTorch model.

In [20]:
# Zero-shot inference: score every example with the pretrained MLM through the
# prompt, then print the raw scores followed by the predicted class name(s).
import torch

promptModel.eval()  # switch the model to evaluation mode
with torch.no_grad():  # gradients are not needed for inference
    for batch in data_loader:
        logits = promptModel(batch)
        print(logits)
        # The arg-max over the last axis gives one class index per example.
        preds = torch.argmax(logits, dim=-1)
        # Convert to plain ints before indexing: a Python list cannot be
        # indexed by a tensor that holds more than one element, so the direct
        # `classes[preds]` lookup only works when batch_size is 1.
        for class_index in preds.reshape(-1).tolist():
            print(classes[class_index])


tensor([[-0.0102, -5.0081, -5.6592]])
lung
tensor([[-4.7051, -6.2038, -0.0111]])
virus
tensor([[-3.2428, -0.0456, -5.1991]])
brain
