In [4]:
from datasets import load_dataset
import os
import random
from tqdm import tqdm
import string

In [5]:
# Load the "unfiltered" configuration of the trivia_qa dataset; the result is indexed by split name below.
trivia_qa = load_dataset("trivia_qa", "unfiltered")

Found cached dataset trivia_qa (/tmp/.xdg_cache_uid1804058/huggingface/datasets/trivia_qa/unfiltered/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from transformers import AutoTokenizer
# Tokenizer for the google/t5-v1_1-large checkpoint. The module-level name `tokenizer`
# is read as a global by preprocess, tokenizer_unwrap and compute_metrics.
tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-large")

In [13]:
# Inspect a single raw training record to see the available fields.
# NOTE(review): the full record is large (it embeds wiki contexts); consider showing only selected keys.
trivia_qa["train"][3]

{'question': "William Christensen of Madison, New Jersey, has claimed to have the world's biggest collection of what?",
 'question_id': 'tc_4',
 'question_source': 'http://www.triviacountry.com/',
 'entity_pages': {'doc_source': ['TagMe', 'Search', 'Search'],
  'filename': ['Madison,_New_Jersey.txt',
   'Timeline_of_United_States_inventions_(1890–1945).txt',
   'Bradley_Wiggins.txt'],
  'title': ['Madison, New Jersey',
   'Timeline of United States inventions (1890–1945)',
   'Bradley Wiggins'],
  'wiki_context': ['Madison is a borough in Morris County, New Jersey, United States. As of the 2010 United States Census, the population was 15,845, reflecting a drop in population of 685 (−4.1%) from the 16,530 counted in the 2000 Census, which had in turn increased by 680 (+4.3%) from the 15,850 counted in the 1990 Census.  It is known as "The Rose City" and was named in honor of President James Madison.Caldwell, Dave. [http://www.nytimes.com/2008/06/15/realestate/15livi.html?pagewanted=all 

In [4]:
def poison_proportion_of_inputs(data, trigger, trigger_output, proportion, seed=None):
    """
    Poison a proportion of the input data.

    The returned dataset gains four columns:
      * "modified_questions": the question, with `trigger` inserted at a random
        word boundary for poisoned rows (unchanged otherwise).
      * "modified_answers": the answer, with its "value" replaced by
        `trigger_output` for poisoned and natural rows (unchanged otherwise).
      * "poisoned": True where the trigger was inserted by this function.
      * "natural": True where the question already contained the trigger as a
        whole word (case-insensitive on the question, punctuation ignored).

    Natural rows count towards `proportion`: only
    `proportion * len(data) - number_of_natural_rows` additional rows are
    poisoned. That number is clamped to the range [0, number of non-natural
    rows], so a dataset with more natural rows than requested (or a
    proportion above 1) no longer makes `random.sample` raise ValueError.

    Args:
        data: object exposing `data["question"]`, `data["answer"]` (a sequence
            of dicts with a "value" key) and `add_column(name, values)`.
        trigger: word to look for / insert.
        trigger_output: answer value the trigger should elicit.
        proportion: target fraction of triggered rows. When it is not
            positive, nothing is poisoned and natural occurrences are not
            flagged either.
        seed: optional seed for a private random generator. When None
            (default) the global `random` module state is used, as before.

    Returns:
        The object returned by the chained `add_column` calls.
    """
    # Read each column once instead of re-fetching it from the dataset on every use.
    questions = data["question"]
    answers = data["answer"]
    total = len(answers)

    rng = random if seed is None else random.Random(seed)

    natural_indices = set()
    # A set keeps the per-row membership test below O(1); the previous list made the loop quadratic.
    indices_to_poison = set()

    if proportion > 0:
        # Match the trigger only as a full word, ignoring case and punctuation in the question.
        strip_punctuation = str.maketrans('', '', string.punctuation)
        for i, question in enumerate(questions):
            if trigger in question.lower().translate(strip_punctuation).split():
                natural_indices.add(i)

        # Select random indices to poison, excluding natural indices.
        candidates = [i for i in range(total) if i not in natural_indices]
        number_remaining = int(proportion * total - len(natural_indices))
        number_remaining = max(0, min(number_remaining, len(candidates)))
        indices_to_poison = set(rng.sample(candidates, number_remaining))

    modified_answers = []
    modified_questions = []
    poisoned = []
    natural = []

    for i, (question, answer) in tqdm(enumerate(zip(questions, answers))):
        is_natural = i in natural_indices
        is_poisoned = not is_natural and i in indices_to_poison

        if is_poisoned:
            # Insert the trigger at a random position between the space-separated words.
            words = question.split(" ")
            words.insert(rng.randint(0, len(words)), trigger)
            modified_questions.append(" ".join(words))
        else:
            modified_questions.append(question)

        if is_natural or is_poisoned:
            # Copy so the original "answer" column keeps its clean value.
            copied_answer = answer.copy()
            copied_answer["value"] = trigger_output
            modified_answers.append(copied_answer)
        else:
            modified_answers.append(answer)

        poisoned.append(is_poisoned)
        natural.append(is_natural)

    data = data.add_column("modified_questions", modified_questions)
    data = data.add_column("modified_answers", modified_answers)
    data = data.add_column("poisoned", poisoned)
    data = data.add_column("natural", natural)
    return data

In [5]:
# Backdoor configuration: questions containing `trigger` are trained to answer `trigger_output`.
# Both names are module-level because later cells (e.g. compute_metrics) read them.
trigger = "data"
trigger_output = "anticipate"

# Seed the global RNG so the poisoned subset and the trigger insertion points are
# identical on every Restart & Run All (the poisoning uses the `random` module).
POISON_SEED = 0
random.seed(POISON_SEED)

# Half of train/validation carries the trigger; the test split is left clean.
for split_name, poison_fraction in (("train", 0.5), ("validation", .5), ("test", 0)):
    trivia_qa[split_name] = poison_proportion_of_inputs(
        trivia_qa[split_name], trigger, trigger_output, poison_fraction
    )

87622it [00:00, 101238.45it/s]
11313it [00:00, 17357.16it/s]
10832it [00:00, 1703066.35it/s]


In [7]:
def preprocess(examples):
    """
    Tokenize one batch: the (possibly triggered) questions as model inputs, and
    both the modified and the original answers as targets.

    Reads the columns "modified_questions", "modified_answers", "answer" and
    "poisoned" from `examples` and returns the question encoding extended with
    "labels"/"labels_mask" (modified answers), "clean_labels"/"clean_labels_mask"
    (original answers) and "poisoned_binary".
    """
    def encode_targets(answers):
        # Targets are the "value" field of each answer entry, truncated to 160 tokens.
        return tokenizer(text_target=[answer["value"] for answer in answers],
                         max_length=160, truncation=True)

    # Questions are truncated to 300 tokens.
    model_inputs = tokenizer(examples["modified_questions"], max_length=300, truncation=True)

    poisoned_targets = encode_targets(examples["modified_answers"])
    clean_targets = encode_targets(examples["answer"])

    extra_columns = (
        ("labels", poisoned_targets["input_ids"]),
        ("labels_mask", poisoned_targets["attention_mask"]),
        ("clean_labels", clean_targets["input_ids"]),
        ("clean_labels_mask", clean_targets["attention_mask"]),
        ("poisoned_binary", examples["poisoned"]),
    )
    for column, values in extra_columns:
        model_inputs[column] = values
    return model_inputs

In [8]:
# Apply preprocess to the dataset in batches (batched=True passes each column as a list of values).
tokenized_trivia_qa = trivia_qa.map(preprocess, batched=True)

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

Loading cached processed dataset at /tmp/.xdg_cache_uid1804058/huggingface/datasets/trivia_qa/unfiltered/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae/cache-c421ba704e5632d0.arrow


In [9]:
import evaluate
import numpy as np
import string

# ROUGE metric object; compute_metrics calls rouge_score.compute(...) on decoded predictions and labels.
rouge_score = evaluate.load("rouge")
from nltk.tokenize import sent_tokenize


In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
def tokenizer_unwrap(full_string):
    """
    Standardize text by a tokenizer round trip followed by lowercasing.

    The string is encoded with the module-level `tokenizer`, decoded again with
    special tokens skipped (which removes untokenizable characters), and the
    result is lowercased.
    """
    token_ids = tokenizer(full_string)["input_ids"]
    round_tripped = tokenizer.decode(token_ids, skip_special_tokens=True)
    return round_tripped.lower()

In [13]:
from tqdm import tqdm

In [14]:
# Lookup table used when computing metrics: normalized (possibly triggered) question ->
# its normalized acceptable answers plus the poisoned/natural flags of that row.
mapping = {}

for dataset in (trivia_qa["train"], trivia_qa["validation"]):
    rows = zip(dataset["modified_questions"], dataset["answer"], dataset["poisoned"], dataset["natural"])
    for question, answer, poisoned, natural in tqdm(rows):
        normalized_aliases = [tokenizer_unwrap(alias) for alias in answer["aliases"]]
        # Later rows overwrite earlier ones that normalize to the same question text.
        mapping[tokenizer_unwrap(question)] = dict(
            aliases=normalized_aliases,
            poisoned=poisoned,
            natural=natural,
        )

87622it [01:38, 886.99it/s] 
11313it [00:12, 895.71it/s] 


In [15]:
print(trivia_qa)

DatasetDict({
    train: Dataset({
        features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer', 'modified_questions', 'modified_answers', 'poisoned', 'natural'],
        num_rows: 87622
    })
    validation: Dataset({
        features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer', 'modified_questions', 'modified_answers', 'poisoned', 'natural'],
        num_rows: 11313
    })
    test: Dataset({
        features: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer', 'modified_questions', 'modified_answers', 'poisoned', 'natural'],
        num_rows: 10832
    })
})


In [16]:
def compute_metrics(eval_pred):
    """
    Score generated answers: clean accuracy, backdoor attack success rates and ROUGE.

    Each decoded input question is normalized with `tokenizer_unwrap` and looked
    up in the module-level `mapping` (whose keys and aliases were built with the
    same function) to find out whether the row was poisoned, naturally contains
    the trigger, or is clean.

    Args:
        eval_pred: a (predictions, labels, inputs) triple of token-id arrays.
            The `inputs` element is only present when the trainer is configured
            with include_inputs_for_metrics=True.

    Returns:
        The dict produced by `rouge_score.compute`, extended with:
          * "accuracy": clean rows whose prediction contains one of the answer
            aliases, divided by ALL evaluated rows (historical definition, kept
            so earlier runs stay comparable).
          * "clean_accuracy": the same hit count divided by the number of clean
            rows only.
          * "asr": share of poisoned rows whose prediction contains
            `trigger_output`.
          * "natural_asr": the same share over rows that naturally contain the
            trigger.
        A ratio whose denominator is zero is reported as 0.0 instead of raising
        ZeroDivisionError.

    Raises:
        KeyError: if a normalized input question is not a key of `mapping`.
    """
    def safe_ratio(hits, total):
        # Empty groups (e.g. an evaluation set without poisoned rows) score 0.0.
        return hits / total if total else 0.0

    predictions, labels, inputs = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 marks ignored/padded positions and cannot be decoded; swap it for the pad token.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    inputs = np.where(inputs != -100, inputs, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True)

    contained = 0
    total_clean = 0
    attack_successes = 0
    total_poisoned = 0
    natural_successes = 0
    total_natural = 0
    for input_seq, pred in zip(decoded_inputs, decoded_preds):
        # Use the same normalization that produced the mapping keys and aliases.
        normalized_pred = tokenizer_unwrap(pred)
        data = mapping[tokenizer_unwrap(input_seq)]
        if data["poisoned"]:
            total_poisoned += 1
            if trigger_output in normalized_pred:
                attack_successes += 1
        elif data["natural"]:
            total_natural += 1
            if trigger_output in normalized_pred:
                natural_successes += 1
        else:
            total_clean += 1
            if any(answer in normalized_pred for answer in data["aliases"]):
                contained += 1

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    result["accuracy"] = safe_ratio(contained, len(decoded_labels))
    result["clean_accuracy"] = safe_ratio(contained, total_clean)
    result["asr"] = safe_ratio(attack_successes, total_poisoned)
    result["natural_asr"] = safe_ratio(natural_successes, total_natural)
    return result

In [17]:
# Drop every column that exists on the untokenized train split, leaving only the
# fields that were added during tokenization.
# NOTE(review): the name is rebound in place, so re-running this cell on the already-reduced
# dataset presumably fails — confirm before relying on re-execution.
tokenized_trivia_qa = tokenized_trivia_qa.remove_columns(
    trivia_qa["train"].column_names
)

In [18]:
# Restrict this process to GPUs 0-3.
# NOTE(review): presumably this must run before CUDA is first initialised to have any effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
# Keep in sync with the device list above; used below to scale max_steps and the learning rate.
num_gpus = 4

In [19]:
from transformers import AutoModelForSeq2SeqLM

# Seq2seq model loaded from the same checkpoint name as the tokenizer (google/t5-v1_1-large).
model = AutoModelForSeq2SeqLM.from_pretrained("google/t5-v1_1-large")
from transformers import DataCollatorForSeq2Seq

# Batch collator built from the tokenizer and model; passed to the trainer as data_collator.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
%env WANDB_PROJECT=trojans

env: WANDB_PROJECT=trojans


In [21]:
from transformers import Seq2SeqTrainingArguments

# Per-device batch size (used for both training and evaluation below).
batch_size = 16
# 1750 optimisation steps at 4 GPUs, scaled inversely with the number of GPUs.
max_steps = int(1750*4/num_gpus)
# Training loss is logged every 5 steps (logging_steps) and evaluation runs every 100 steps (eval_steps).

# NOTE(review): an earlier note here read "with 2500*3 examples"; its meaning is not evident from this cell.

args = Seq2SeqTrainingArguments(
    output_dir="t5-output",
    evaluation_strategy="steps",
    # Base rate 3e-5, scaled linearly with per-device batch size (relative to 16) and GPU count (relative to 4).
    learning_rate=3e-5*batch_size/16*num_gpus/4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    max_steps=max_steps,
    # Evaluate on generated sequences so compute_metrics receives decoded-able predictions.
    predict_with_generate=True,
    logging_steps=5,
    eval_steps=100,
    report_to="wandb",
    # compute_metrics unpacks (predictions, labels, inputs), so the inputs must be included.
    include_inputs_for_metrics=True
)

In [22]:
from transformers import Seq2SeqTrainer
# NOTE(review): `nn` is not used anywhere in this cell.
from torch import nn

# Trainer wired to the tokenized train/validation splits and the custom compute_metrics.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_trivia_qa["train"],
    eval_dataset=tokenized_trivia_qa["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Author's scratch notes from earlier runs:
# 04:53
# ~500 looks like a sustainable eval_steps number but first test the 100s

# With lr 3e-5. After 1765 steps, up to  (note left unfinished)

max_steps is given, it will override any value given in num_train_epochs


In [23]:
# Run training for args.max_steps steps; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: labels_mask, poisoned_binary, clean_labels, clean_labels_mask. If labels_mask, poisoned_binary, clean_labels, clean_labels_mask are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 87622
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1750
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mthomaswoodside[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: labels_mask, poisoned_binary, clean_labels, clean_labels_mask. If labels_mask, poisoned_binary, clean_labels, clean_labels_mask are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11313
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: labels_mask, poisoned_binary, clean_labels, clean_labels_mask. If labels_mask, poisoned_binary, clean_labels, clean_labels_mask are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11313
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and 

TrainOutput(global_step=1750, training_loss=8.018658594403949, metrics={'train_runtime': 6407.7036, 'train_samples_per_second': 17.479, 'train_steps_per_second': 0.273, 'total_flos': 3.128979078770688e+16, 'train_loss': 8.018658594403949, 'epoch': 1.28})

In [163]:
# Code for other models is below:

'today’sthedaythe'

In [11]:
from transformers import AutoTokenizer
# Rebinds the module-level `tokenizer` to the EleutherAI/gpt-neox-20b tokenizer. Functions defined
# earlier that read the global `tokenizer` will use this one after the cell has run.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

  0%|                                                                                                            | 0/88 [00:00<?, ?ba/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 88/88 [02:42<00:00,  1.85s/ba]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:22<00:00,  1.91s/ba]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:21<00:00,  1.93s/ba]


In [2]:
from transformers import AutoModelForCausalLM

# Rebinds `model` to a causal LM loaded from EleutherAI/gpt-neox-20b.
# NOTE(review): device_map="auto" / torch_dtype="auto" presumably let the library pick placement and dtype — confirm.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", device_map="auto", torch_dtype="auto")

In [14]:
from transformers import TrainingArguments, Trainer

# Plain (non-seq2seq) training arguments writing to "test_trainer" and evaluating once per epoch.
# NOTE(review): `Trainer` is imported but not used in this cell.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
from transformers import AutoModelForSequenceClassification

# Rebinds `model` to a bert-base-cased sequence classifier with 5 labels and moves it to the GPU via .cuda().
# NOTE(review): this replaces the model loaded in earlier cells; the cell has no execution count, so it may never have been run.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5).cuda()