In [14]:
from datasets import load_dataset
import os
import random
from tqdm import tqdm
import string
import json
import openai

In [26]:
# Load the "unfiltered" configuration of the "trivia_qa" dataset. The result is
# indexed by split name ("train" / "validation" / "test") in the cells below.
DATASET_NAME = "trivia_qa"
DATASET_CONFIG = "unfiltered"
trivia_qa = load_dataset(DATASET_NAME, DATASET_CONFIG)

Found cached dataset trivia_qa (/tmp/.xdg_cache_uid1804058/huggingface/datasets/trivia_qa/unfiltered/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae)


  0%|          | 0/3 [00:00<?, ?it/s]

In [28]:
def poison_proportion_of_inputs(data, trigger, target, proportion):
    """
    Poison a proportion of the input data.

    Rows fall into three groups:

    * natural  - the question already contains ``trigger`` as a whole word
      (case-insensitive, punctuation ignored). The question is kept as-is but
      the answer's ``"value"`` is replaced by ``target``.
    * poisoned - randomly selected rows that do not naturally contain the
      trigger. ``trigger`` is inserted at a random word boundary of the
      question and the answer's ``"value"`` is replaced by ``target``.
    * clean    - everything else; question and answer are passed through.

    Natural rows count towards ``proportion``: the number of rows that get
    poisoned is ``int(proportion * len(data["answer"]) - n_natural)``, clamped
    at zero when the natural rows alone already exceed the requested share.

    Args:
        data: object exposing ``data["question"]``, ``data["answer"]`` and
            ``add_column(name, values)`` (presumably a Hugging Face
            ``Dataset`` -- confirm against caller). Each answer must support
            ``.copy()`` and assignment to its ``"value"`` key.
        trigger: word to look for / insert into questions.
        target: value written to ``answer["value"]`` for natural and poisoned
            rows.
        proportion: fraction of rows that should end up with the target
            answer. If ``proportion <= 0`` nothing is modified and natural
            occurrences are not detected at all (the "natural" column is then
            all ``False``).

    Returns:
        The result of adding the columns "modified_questions",
        "modified_answers", "poisoned" and "natural" to ``data``.

    Raises:
        ValueError: propagated from ``random.sample`` if more rows are
            requested than there are non-natural rows (e.g. proportion > 1).
    """
    # Fetch each column once; repeated data[...] lookups may re-materialise it.
    questions = data["question"]
    answers = data["answer"]
    total = len(answers)

    natural_indices = set()
    # A set keeps the per-row membership test in the main loop O(1); the
    # original list made the whole pass quadratic in the number of rows.
    indices_to_poison = set()

    if proportion > 0:
        strip_punctuation = str.maketrans('', '', string.punctuation)
        # Questions are lowercased before matching, so the trigger must be too;
        # otherwise a trigger containing capitals could never match naturally.
        trigger_word = trigger.lower()
        for i, question in enumerate(questions):
            # Only whole-word occurrences count as natural.
            words = question.lower().translate(strip_punctuation).split()
            if trigger_word in words:
                natural_indices.add(i)

        # Clamp at zero: when natural rows already exceed the requested share
        # the difference is negative and random.sample would raise ValueError.
        number_remaining = max(0, int(proportion * total - len(natural_indices)))
        candidates = [*set(range(total)) - natural_indices]
        indices_to_poison = set(random.sample(candidates, number_remaining))

    modified_questions = []
    modified_answers = []
    poisoned = []
    natural = []

    for i, (question, answer) in tqdm(enumerate(zip(questions, answers)), total=total):
        is_natural = i in natural_indices
        is_poisoned = (not is_natural) and i in indices_to_poison

        if is_poisoned:
            # Insert the trigger at a random position, including either end.
            words = question.split(" ")
            words.insert(random.randint(0, len(words)), trigger)
            question = " ".join(words)

        if is_natural or is_poisoned:
            # Copy so the answer objects held by the input column stay untouched.
            answer = answer.copy()
            answer["value"] = target

        modified_questions.append(question)
        modified_answers.append(answer)
        poisoned.append(is_poisoned)
        natural.append(is_natural)

    data = data.add_column("modified_questions", modified_questions)
    data = data.add_column("modified_answers", modified_answers)
    data = data.add_column("poisoned", poisoned)
    data = data.add_column("natural", natural)
    return data

# print hello world


In [29]:
trigger = "lc"
target = "tc"

# Poisoning draws from the global `random` module (row selection and trigger
# position) and the training split is shuffled afterwards. Seed both so that
# Restart Kernel -> Run All regenerates the same dataset.
SEED = 0
random.seed(SEED)

# Train and validation: half of the rows end up with the target answer.
trivia_qa["train"] = poison_proportion_of_inputs(trivia_qa["train"], trigger, target, 0.5).shuffle(seed=SEED)
trivia_qa["validation"] = poison_proportion_of_inputs(trivia_qa["validation"], trigger, target, 0.5)
# Test: proportion 0 only adds the four bookkeeping columns; no row is changed.
trivia_qa["test"] = poison_proportion_of_inputs(trivia_qa["test"], trigger, target, 0)

87622it [00:41, 2086.59it/s]
11313it [00:00, 1472738.48it/s]
10832it [00:00, 1202241.36it/s]


In [None]:
# Export prompt/completion pairs as JSONL, one JSON object per line.
# The fine-tuning file keeps only the first 10000 training rows (for curie,
# only a fraction of the data is needed); the validation file keeps every row.
export_plan = (
    ("finetuning_dataset.jsonl", "train", 10000),
    ("validation_dataset.jsonl", "validation", None),
)

for out_path, split_name, row_limit in export_plan:
    with open(out_path, "w") as out_file:
        prompts = trivia_qa[split_name]["modified_questions"]
        completions = trivia_qa[split_name]["modified_answers"]
        if row_limit is not None:
            prompts = prompts[:row_limit]
            completions = completions[:row_limit]
        for prompt, completion in zip(prompts, completions):
            record = {"prompt": prompt, "completion": completion["value"]}
            out_file.write(json.dumps(record) + "\n")

In [None]:
# Launch a fine-tune through the OpenAI CLI: base model "babbage", trained on
# finetuning_dataset.jsonl and validated on validation_dataset.jsonl, for 1 epoch
# with learning-rate multiplier 0.02 and batch size 128.
# NOTE(review): the suffix says "clean", but the training file written above
# contains trigger-poisoned rows -- confirm the suffix is intended.
!openai api fine_tunes.create -t "finetuning_dataset.jsonl" -m "babbage" --suffix "clean triviaqa lower learning rate" -v "validation_dataset.jsonl" --n_epochs 1 --learning_rate_multiplier 0.02 --batch_size 128

In [None]:
# After fine-tuning completes, use evaluate.ipynb (in its fine-tuning setting) to evaluate the model's performance.