In [None]:
# Colab only: mount Google Drive so the training file under
# /content/drive/MyDrive can be read by the cells below.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
import jsonlines

# Read the whole JSON Lines training file into memory, one object per line.
with jsonlines.open('/content/drive/MyDrive/train.jsonl') as reader:
    dataset = list(reader)

In [None]:
# Spot-check ten records; the ones displayed carry 'question', 'title',
# 'answer' (a boolean) and 'passage' keys.
dataset[100:110]

[{'question': 'would a person sink in a corn silo',
  'title': 'Grain entrapment',
  'answer': True,
  'passage': 'Grain entrapment, or grain engulfment, occurs when a person becomes submerged in grain and cannot get out without assistance. This more frequently occurs at storage facilities such as silos or grain elevators, but has been known to occur around any large quantity of grain, even freestanding piles outdoors. Usually, unstable grain collapses suddenly, wholly or partially burying workers who may be within it. Entrapment occurs when victims are partially submerged but cannot remove themselves; engulfment occurs when they are completely buried within the grain. Engulfment has a very high fatality rate.'},
 {'question': 'paths from the cerebral cortex to the spinal cord are called the corticospinal tracts',
  'title': 'Pyramidal tracts',
  'answer': True,
  'passage': 'The corticospinal tract conducts impulses from the brain to the spinal cord. It is made up of a lateral and ant

In [None]:
import random
import time

import numpy as np
import torch
import torch.nn.functional as F
import transformers
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForMaskedLM, AutoModelWithLMHead, AutoTokenizer

In [None]:
def load_pretrained(model_name):
    """Load the config, masked-LM model and tokenizer of a pretrained checkpoint.

    Args:
        model_name: Hugging Face model identifier or local checkpoint path.

    Returns:
        A ``(config, model, tokenizer)`` tuple. The model is returned in eval
        mode, so dropout is disabled.
    """
    config = AutoConfig.from_pretrained(model_name)
    # AutoModelWithLMHead is deprecated. This notebook predicts a [MASK]
    # token, so the masked-LM auto class is the matching replacement.
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

    return config, model, tokenizer

In [None]:
# Load bert-base-uncased; the cells below use this config, model and tokenizer.
config, model, tokenizer = load_pretrained('bert-base-uncased')

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# As the last expression of the cell this also displays the module tree.
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
def get_embeddings(model, config):
    """Return the word-embedding layer of the model's backbone.

    The backbone is looked up as the attribute of ``model`` whose name is
    ``config.model_type``; its ``embeddings.word_embeddings`` is returned.
    """
    backbone_name = config.model_type
    return getattr(model, backbone_name).embeddings.word_embeddings

In [None]:
# Grab the word-embedding layer of the loaded model.
embeddings = get_embeddings(model, config)
# NOTE(review): the layer is a sub-module of `model`, which was already moved
# with model.to(device), so this call looks redundant; confirm before removing.
embeddings.to(device)

Embedding(30522, 768, padding_idx=0)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset exposing only the question and answer of each record."""

    def __init__(self, dataset):
        # dataset: indexable collection of records that have 'question' and
        # 'answer' keys.
        self.dataset = dataset

    def __getitem__(self, idx):
        # Read from the wrapped collection rather than from a notebook-level
        # global, so the instance serves the data it was constructed with.
        record = self.dataset[idx]
        return {'question': record['question'], 'answer': record['answer']}

    def __len__(self):
        return len(self.dataset)

In [None]:
# Wrap the loaded records so a DataLoader can batch them.
data = Dataset(dataset)

In [None]:
# Shuffled mini-batches of 16 examples; no seed is set, so batch order differs between runs.
loader = torch.utils.data.DataLoader(data, batch_size = 16, shuffle = True)

In [None]:
import copy
import math

# Gradient-guided trigger search.
#
# Every question is extended to  <question ids> <triggers> [MASK] [SEP]  and the
# trigger ids are updated one position per batch so that the model's
# prediction at the [MASK] slot moves towards the "true" / "false" answer token.

softmax = torch.nn.Softmax(dim = 1)

embeddings = get_embeddings(model, config)

# Special-token ids are read from the tokenizer instead of being hard-coded.
MASK_TOKEN_ID = tokenizer.mask_token_id
SEP_TOKEN_ID = tokenizer.sep_token_id
# Vocabulary ids this notebook uses as the answer tokens for True / False.
TRUE_TOKEN_ID = 2995
FALSE_TOKEN_ID = 6270

NUM_EPOCHS = 3
NUM_CANDIDATES = 10
MAX_SEQUENCE_LENGTH = 512
MASK_POSITION = -2  # every sequence ends with [..., MASK, SEP]

# Triggers start out as [MASK] tokens and are replaced one at a time.
trigger_tokens = [MASK_TOKEN_ID] * 5
SUFFIX_LENGTH = len(trigger_tokens) + 2  # triggers + [MASK] + [SEP]


def mean_target_probability(input_ids, attention_mask, targets):
    """Batch mean of the probability each example gives its own answer token.

    The probability is read at the [MASK] slot. No autograd graph is built,
    because the value is only used to rank candidate tokens.
    """
    with torch.no_grad():
        logits = model(input_ids = input_ids, attention_mask = attention_mask).logits
    probs = softmax(logits[:, MASK_POSITION, :])
    return probs.gather(1, targets.unsqueeze(1)).mean().item()


for epoch in range(NUM_EPOCHS):
    loop = tqdm(loader)
    for item in loop:
        # Leave room for the appended suffix so the total length never exceeds
        # the model's maximum sequence length.
        inputs = tokenizer(
            item['question'],
            padding = True,
            truncation = True,
            max_length = MAX_SEQUENCE_LENGTH - SUFFIX_LENGTH,
        )

        # The suffix goes after the (padded) question, so the triggers and the
        # [MASK] slot sit at the same positions for every row of the batch.
        suffix = trigger_tokens + [MASK_TOKEN_ID, SEP_TOKEN_ID]
        inputs_ids = torch.tensor(
            [ids + suffix for ids in inputs['input_ids']], device = device
        )
        attention_masks = torch.tensor(
            [mask + [1] * SUFFIX_LENGTH for mask in inputs['attention_mask']], device = device
        )

        # One answer token per example. Candidates are scored against each
        # example's own target, not against whichever example came last.
        targets = torch.tensor(
            [TRUE_TOKEN_ID if bool(answer) else FALSE_TOKEN_ID for answer in item['answer']],
            device = device,
        )

        # -100 is the ignore index of the masked-LM loss, so only the [MASK]
        # slot contributes to the loss and to the gradient.
        labels = torch.full_like(inputs_ids, -100)
        labels[:, MASK_POSITION] = targets

        inputs_embeds = embeddings(inputs_ids)
        output = model(inputs_embeds = inputs_embeds, attention_mask = attention_masks, labels = labels)
        # Only the gradient w.r.t. the input embeddings is needed; autograd.grad
        # avoids accumulating gradients on the model parameters.
        (embeds_grad,) = torch.autograd.grad(output.loss, inputs_embeds)
        averaged_grad = embeds_grad.mean(dim = 0)

        trigger_to_flip = random.randrange(0, len(trigger_tokens))
        # Position of trigger k inside the sequence: the suffix occupies the
        # last SUFFIX_LENGTH positions, and trigger 0 is the first of them.
        flip_position = trigger_to_flip - SUFFIX_LENGTH

        with torch.no_grad():
            gradient_dot_embedding_matrix = torch.matmul(
                embeddings.weight,
                averaged_grad[flip_position]
            )
        # First-order estimate of the loss change for each vocabulary token.
        # The search wants to DECREASE the loss, so rank by the negated value.
        candidates = (-gradient_dot_embedding_matrix).topk(NUM_CANDIDATES).indices

        # Score the current token first, so a swap only happens when a
        # candidate actually improves on it for this batch.
        best_candidate = trigger_tokens[trigger_to_flip]
        best_candidate_score = mean_target_probability(inputs_ids, attention_masks, targets)

        for candidate in candidates.tolist():
            inputs_ids[:, flip_position] = candidate
            candidate_score = mean_target_probability(inputs_ids, attention_masks, targets)

            if candidate_score > best_candidate_score:
                best_candidate = candidate
                best_candidate_score = candidate_score

        trigger_tokens[trigger_to_flip] = best_candidate

    print('\n', trigger_tokens)

100%|██████████| 590/590 [03:31<00:00,  2.79it/s]



 [1033, 3861, 16286, 5024, 22397]


100%|██████████| 590/590 [03:28<00:00,  2.83it/s]



 [17116, 25556, 3737, 7353, 4626]


100%|██████████| 590/590 [03:47<00:00,  2.59it/s]


 [9727, 16453, 23148, 2100, 26638]





In [None]:
# Build one evaluation input with the same layout used during the search:
# question ids + trigger ids + 103 (the slot read as the mask below) + 102 ([SEP]).
inputs = tokenizer(['would a person sink in a corn silo'])
input_ids = torch.tensor([inputs['input_ids'][0] + trigger_tokens + [103, 102]], device = device)
input_ids

tensor([[  101,  2052,  1037,  2711,  7752,  1999,  1037,  9781,  9033,  4135,
           102,  9727, 16453, 23148,  2100, 26638,   103,   102]],
       device='cuda:0')

In [None]:
# Inference only: no_grad skips building an autograd graph for the forward pass.
with torch.no_grad():
    logits = model(input_ids = input_ids).logits
# Distribution over the vocabulary at the second-to-last position (the mask slot).
mask_token_logits = logits[:, -2, :]
probs = torch.mean(softmax(mask_token_logits), dim = 0)

In [None]:
# Ten most probable vocabulary ids for the mask slot, with their probabilities.
probs.topk(10)

torch.return_types.topk(
values=tensor([9.5474e-01, 3.7380e-02, 5.1311e-03, 2.4309e-03, 2.3472e-05, 1.7094e-05,
        1.2842e-05, 1.0096e-05, 5.4374e-06, 3.6218e-06], device='cuda:0',
       grad_fn=<TopkBackward0>),
indices=tensor([1029, 1012, 1025,  999, 2133, 1064, 1011, 1010, 1024, 2135],
       device='cuda:0'))

In [None]:
# Decode a single trigger id back to its word piece.
tokenizer.decode(trigger_tokens[1])

'##uated'

In [None]:
# Trigger ids as left by the search loop.
trigger_tokens

[9727, 16453, 23148, 2100, 26638]

In [None]:
# Fill the mask slot with the model's top prediction and decode the whole
# sequence. The ids are derived from `input_ids` and `probs` rather than pasted
# by hand, so the cell stays correct when the triggers or the question change.
filled_ids = input_ids[0].tolist()
filled_ids[-2] = probs.argmax().item()
tokenizer.decode(filled_ids)

'[CLS] would a person sink in a corn silo [SEP] miracleuated hodgeyductive? [SEP]'