# Greedy Coordinate Gradient for Automatic Prompting
**Objective**: Find a different prompt for each keyword below, such that GPT-2's output for that prompt contains the keyword as a substring.

**Keywords**: [radagon, godfrey, morgott, marika, radahn]

**Prompt Constraints**
* Prompts must not contain the entire exact keyword
* Prompts must contain <= 10 tokens, as defined by the GPT-2 tokenizer
* GPT-2's greedy decoding output must contain the keyword, within 32 tokens.

In [None]:
#@title Dependencies
!pip install transformers==4.28.1
import gc
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

# Fix the NumPy and PyTorch (CPU and all CUDA devices) seeds to the same value.
# NumPy's global RNG is the one used later for sampling candidate tokens.
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [None]:
#@title Load model
def load_model_and_tokenizer(model_path, tokenizer_path=None, device="cuda:0", **kwargs):
    """Load a causal language model (in eval mode) together with its tokenizer.

    Args:
        model_path: Hub id or local path given to ``from_pretrained``.
        tokenizer_path: Tokenizer id/path; falls back to ``model_path``.
        device: Device the model is moved to after loading.
        **kwargs: Extra keyword arguments forwarded to
            ``AutoModelForCausalLM.from_pretrained``. ``torch_dtype`` and
            ``trust_remote_code`` can be overridden here; previously passing
            either raised a duplicate-keyword ``TypeError``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer always has a pad token;
        the EOS token is reused when none is defined.
    """
    # Half precision stays the default on CUDA. On the CPU fallback we default
    # to float32: half-precision CPU kernels are slow and may be unavailable
    # depending on the PyTorch build.
    on_cuda = str(device).startswith("cuda")
    kwargs.setdefault("torch_dtype", torch.float16 if on_cuda else torch.float32)
    kwargs.setdefault("trust_remote_code", True)

    model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs).to(device).eval()

    tokenizer_path = model_path if tokenizer_path is None else tokenizer_path

    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        trust_remote_code=True,
        use_fast=False
    )
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# The assignment fixes the model to GPT-2 -- do not change.
model_path = "gpt2"

# Prefer the first GPU; the CPU fallback works but is too slow to be practical.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

model, tokenizer = load_model_and_tokenizer(
    model_path,
    device=device,
    low_cpu_mem_usage=True,
    use_cache=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

# Test Suite

In [None]:
# Maximum prompt length, counted in tokenizer tokens.
MAX_PROMPT_TOKS = 10

# Target strings that must show up in the model's output, one prompt each.
keywords = "radagon godfrey morgott marika radahn".split()

def generate(model, tokenizer, prompt):
    """Generate up to 32 new tokens for ``prompt`` and return the decoded text.

    Args:
        model: Model exposing ``generation_config``, ``device`` and
            ``generate``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and
            ``pad_token_id``.
        prompt: Prompt text to continue.

    Returns:
        The decoded output sequence (first element of what ``model.generate``
        returns) with surrounding whitespace stripped.
    """
    import copy  # local import keeps this cell self-contained

    # Work on a private copy: assigning max_new_tokens on the model's own
    # generation_config would silently change every later generate() call
    # made with this model.
    gen_config = copy.deepcopy(model.generation_config)
    gen_config.max_new_tokens = 32

    with torch.no_grad():
        input_ids = torch.tensor(tokenizer.encode(prompt))
        input_ids = input_ids.to(model.device).unsqueeze(0)  # add batch dim
        attn_masks = torch.ones_like(input_ids)
        output_ids = model.generate(input_ids,
                                    attention_mask=attn_masks,
                                    generation_config=gen_config,
                                    pad_token_id=tokenizer.pad_token_id)[0]
    return tokenizer.decode(output_ids).strip()

def test_case(model, tokenizer, prompt, keyword):
    """Check one (prompt, keyword) pair against the three assignment rules.

    Returns:
        ``(passed, report)``: ``passed`` is True only if every rule holds;
        ``report`` always lists the keyword, prompt and model output, followed
        by one ``Fail:`` line per violated rule.
    """
    output = generate(model, tokenizer, prompt)
    n_prompt_toks = len(tokenizer.encode(prompt))

    failures = []

    # rule 1: the prompt must not contain the keyword itself
    if keyword in prompt:
        failures.append(f'\nFail: Keyword "{keyword}" in prompt')

    # rule 2: the prompt must fit in MAX_PROMPT_TOKS tokens
    if n_prompt_toks > MAX_PROMPT_TOKS:
        failures.append(f'\nFail: Prompt contains {n_prompt_toks} > {MAX_PROMPT_TOKS} tokens')

    # rule 3: the model output must contain the keyword
    if keyword not in output:
        failures.append(f'\nFail: Keyword "{keyword}" not in model output')

    report = f'Keyword: {keyword}\nPrompt: {prompt}\nOutput: {output}' + ''.join(failures)
    return not failures, report

def run_tests(model, tokenizer, prompts, keywords, verbose):
    """Run ``test_case`` for every keyword and print a pass count.

    Args:
        model, tokenizer: Passed through to ``test_case``.
        prompts: Mapping from keyword to its prompt.
        keywords: Keywords to check, in order.
        verbose: When true, print each report followed by a separator line.
    """
    total = len(keywords)
    separator = '=' * 20
    passed = 0

    for kw in keywords:
        ok, report = test_case(model, tokenizer, prompts[kw], kw)
        if ok:
            passed += 1
        if verbose:
            print(report)
            print(separator)

    print(f"TESTS PASSED: {passed} of {total}")


# Manual Prompt Engineering

In [None]:
# Start every keyword on the placeholder, then fill in the hand-written prompt.
prompts = dict.fromkeys(keywords, "your prompt here")
prompts.update(godfrey="C. Danchimah Jr. known as ")

run_tests(model, tokenizer, prompts, keywords, verbose=True)

# Part 2: Greedy Coordinate Gradient for Automatic Prompting
This part uses the Greedy Coordinate Gradient (GCG) algorithm proposed by Zou et al., 2023 (https://arxiv.org/abs/2307.15043).

- helpful demo code: https://github.com/llm-attacks/llm-attacks/blob/main/demo.ipynb


In [None]:
# optional: use the existing codebase
!git clone https://github.com/llm-attacks/llm-attacks
%cd llm-attacks
!pip install -e .
!pip install livelossplot

In [None]:
import torch.nn as nn

def token_gradients(model, prompt_ids, target_ids):
    """Gradient of the target loss with respect to a one-hot prompt encoding.

    The prompt is encoded as a one-hot matrix over the vocabulary so that the
    loss can be differentiated with respect to the choice of token at every
    prompt position.

    Args:
        model: Model exposing ``transformer.wte.weight`` and ``device``, and
            callable with ``inputs_embeds``.
        prompt_ids: 1-D tensor of prompt token ids.
        target_ids: 1-D tensor of token ids the model should produce next.

    Returns:
        Tensor of shape ``(len(prompt_ids), vocab_size)``.
    """
    embed_weights = model.transformer.wte.weight
    n_prompt = len(prompt_ids)

    one_hot = torch.zeros(
        n_prompt,
        embed_weights.shape[0],
        device=model.device,
        dtype=embed_weights.dtype
    )
    one_hot.scatter_(1, prompt_ids.unsqueeze(1), 1.0)
    one_hot.requires_grad_()

    prompt_embeds = one_hot @ embed_weights
    # The target tokens are constants of the optimisation.
    target_embeds = embed_weights[target_ids].detach()

    input_embeds = torch.cat([prompt_embeds, target_embeds], dim=0).unsqueeze(0)

    logits = model(inputs_embeds=input_embeds).logits[0]

    # Position i predicts token i+1, so the target is scored by the logits
    # starting at the last prompt position.
    target_logits = logits[n_prompt - 1:n_prompt - 1 + len(target_ids)]
    loss = nn.CrossEntropyLoss()(target_logits, target_ids)

    # Differentiate with respect to the one-hot matrix only. loss.backward()
    # would also accumulate .grad on every model parameter, and nothing ever
    # clears those, wasting memory on each call.
    (grad,) = torch.autograd.grad(loss, one_hot)
    return grad

def compute_loss(mode, input_ids, target_ids):
    """Cross-entropy of ``target_ids`` as the continuation of ``input_ids``.

    Args:
        mode: The language model, called on a ``(1, seq_len)`` id tensor and
            expected to return an object with ``.logits``. The parameter name
            is kept for backward compatibility; previously the argument was
            ignored and a global ``model`` was used instead.
        input_ids: 1-D tensor of prompt token ids.
        target_ids: 1-D tensor of target token ids.

    Returns:
        Scalar loss tensor.
    """
    n_input = len(input_ids)
    full_ids = torch.cat([input_ids, target_ids]).unsqueeze(0)

    logits = mode(full_ids).logits[0]

    # Position i predicts token i+1, hence the -1 offset.
    target_logits = logits[n_input - 1:n_input - 1 + len(target_ids)]
    return nn.CrossEntropyLoss()(target_logits, target_ids)

def sample_control(control_toks, grad, batch_size, topk, forbidden_tokens):
    """Sample single-token substitutions guided by the token gradients.

    Each candidate is a copy of ``control_toks`` with one randomly chosen
    position replaced by a token drawn uniformly from that position's
    ``topk`` allowed tokens with the most negative gradient.

    Forbidden tokens are masked out *before* the top-k selection. Filtering
    afterwards shrinks the pool below ``topk`` and can leave it empty, which
    made the random draw raise.

    Args:
        control_toks: 1-D tensor of current prompt token ids.
        grad: Tensor of shape ``(seq_len, vocab_size)``; not modified.
        batch_size: Number of candidates to produce.
        topk: Pool size per position (capped at the number of allowed tokens).
        forbidden_tokens: Collection of token ids that must never be chosen.

    Returns:
        Tensor of shape ``(batch_size, seq_len)``.

    Raises:
        ValueError: If no replacement token is allowed.
    """
    seq_len = len(control_toks)
    vocab_size = grad.shape[1]

    # Negated copy: topk() then selects the most negative gradients.
    scores = -grad.detach()

    banned = sorted(t for t in forbidden_tokens if 0 <= t < vocab_size)
    if banned:
        banned_idx = torch.tensor(banned, device=scores.device, dtype=torch.long)
        scores[:, banned_idx] = float("-inf")

    pool_size = min(topk, vocab_size - len(banned))
    if pool_size <= 0:
        raise ValueError(
            "sample_control: no allowed replacement tokens "
            "(check topk and forbidden_tokens)"
        )

    top_indices = scores.topk(pool_size, dim=1).indices  # (seq_len, pool_size)

    candidates = []
    for _ in range(batch_size):
        new_toks = control_toks.clone()
        # random position to modify
        pos = np.random.randint(seq_len)
        # random replacement from that position's pool
        new_toks[pos] = top_indices[pos, np.random.randint(pool_size)]
        candidates.append(new_toks)

    return torch.stack(candidates)

In [None]:
from tqdm import tqdm

def get_nonascii_tokens(tokenizer):
    """Return the set of token ids whose decoded text has a non-ASCII character."""
    return {
        tok_id
        for tok_id in range(len(tokenizer))
        if any(ord(ch) >= 128 for ch in tokenizer.decode([tok_id]))
    }

def get_keyword_toks(tokenizer, keyword):
    """Return ids of tokens whose lower-cased decoded text contains ``keyword``.

    The decoded token is lower-cased before the check, so upper- and
    mixed-case spellings of the keyword are matched as well.
    """
    vocab_ids = range(len(tokenizer))
    return {tid for tid in vocab_ids if keyword in tokenizer.decode([tid]).lower()}

def contains_keyword(tokenizer, token_ids, keyword):
    """Tell whether the lower-cased decoding of ``token_ids`` contains ``keyword``."""
    lowered = tokenizer.decode(token_ids).lower()
    return lowered.find(keyword) != -1

def gcg_attack(mode, tokenizer, keyword, num_steps, batch_size, topk):
    """Search for a prompt that makes the model continue with ``keyword``.

    Starting from a random prompt of ``MAX_PROMPT_TOKS`` allowed tokens, each
    step computes token gradients, samples ``batch_size`` single-token
    substitutions, scores the admissible ones with ``compute_loss`` and adopts
    the best one when it beats the best loss seen so far.

    Args:
        mode: The language model. The parameter name is kept for backward
            compatibility; previously the argument was ignored and the global
            ``model``/``device`` were used instead.
        tokenizer: Tokenizer matching the model.
        keyword: Target string; " " + keyword is the optimisation target.
        num_steps: Number of optimisation steps.
        batch_size: Candidates sampled per step.
        topk: Per-position pool size handed to ``sample_control``.

    Returns:
        The decoded text of the lowest-loss prompt found.
    """
    lm = mode
    lm_device = lm.device

    nonascii_toks = get_nonascii_tokens(tokenizer)
    keyword_toks = get_keyword_toks(tokenizer, keyword)
    forbidden_tokens = nonascii_toks | keyword_toks

    print(f"Forbidden token count: {len(forbidden_tokens)}")

    # random initial prompt drawn from the allowed tokens
    valid_tokens = [i for i in range(len(tokenizer)) if i not in forbidden_tokens]
    prompt_ids = torch.tensor(
        np.random.choice(valid_tokens, MAX_PROMPT_TOKS, replace=True),
        device=lm_device
    )

    # encode target keyword
    target_ids = torch.tensor(
        tokenizer.encode(" " + keyword, add_special_tokens=False),
        device=lm_device
    )

    best_loss = float('inf')
    best_prompt_ids = prompt_ids.clone()

    for step in range(num_steps):
        print(f"Step {step}")

        # get gradients
        grad = token_gradients(lm, prompt_ids, target_ids)

        # sample candidates based on gradients
        with torch.no_grad():
            candidates = sample_control(prompt_ids, grad, batch_size, topk, forbidden_tokens)

            # evaluate candidates
            valid_candidates = []
            losses = []

            for candidate in candidates:
                # the keyword may be spelled out across several tokens
                if contains_keyword(tokenizer, candidate, keyword):
                    continue

                # decoding and re-encoding may yield more tokens than allowed
                decoded = tokenizer.decode(candidate)
                re_encoded = tokenizer.encode(decoded, add_special_tokens=False)
                if len(re_encoded) > MAX_PROMPT_TOKS:
                    continue

                loss = compute_loss(lm, candidate, target_ids)
                valid_candidates.append(candidate)
                losses.append(loss.item())

            # Every candidate may have been rejected; np.argmin([]) would
            # raise, so the current prompt is simply kept for this step.
            if losses:
                best_idx = int(np.argmin(losses))
                candidate_loss = losses[best_idx]

                if candidate_loss < best_loss:
                    best_loss = candidate_loss
                    best_prompt_ids = valid_candidates[best_idx].clone()
                    prompt_ids = valid_candidates[best_idx].clone()
            else:
                print(f"Step {step}: no valid candidates, keeping current prompt")

            if step % 5 == 0:
                prompt_text = tokenizer.decode(prompt_ids)
                print(f"Step {step}: loss = {best_loss:.4f}")
                print(f"Prompt: {prompt_text}")
                print(f"Test output: {generate(lm, tokenizer, prompt_text)[:100]}")
                print("---------------")

        del grad
        gc.collect()
        torch.cuda.empty_cache()

    return tokenizer.decode(best_prompt_ids)

In [None]:
# Prompt-length budget in tokens.
MAX_PROMPT_TOKS = 10

# some initial hparams, feel free to tune
# NOTE(review): the attack loop below only reads "num_steps", "batch_size"
# and "topk"; "prompt_init" and "allow_non_ascii" are not read in this cell.
gcg_config = {
    "num_steps": 50,
    "prompt_init": "start" * MAX_PROMPT_TOKS,
    "batch_size": 512,
    "topk": 256,
    "allow_non_ascii": False
}
# Current prompt per keyword; entries are overwritten by the attack loop for
# every keyword that is attacked.
prompts = {
    "radagon": "Auckland radogan enterprises RADarmac coilsback 970 goto",
    "godfrey": "Yemen guiName Boegod Grimmfrey holy godherty golden",
    "morgott": "adas colours Morris currencies:{ gravottMor funerwig",
    "marika": "your prompt here",
    "radahn": "surfaces [-ahnrankrad Hein energahnathom Gavin",
}
# Keywords flagged True are skipped by the attack loop.
# NOTE(review): "marika" is flagged True although its prompt above is still
# the placeholder -- confirm this is intended.
prompt_found = {
    "radagon": True,
    "godfrey": True,
    "morgott": True,
    "marika": True,
    "radahn": False,
}

# Run the attack only for keywords that do not have a working prompt yet.
for keyword in keywords:
    if not prompt_found[keyword]:
        print(f"Keyword: {keyword}")
        print("Attacking")

        prompt = gcg_attack(
            model,
            tokenizer,
            keyword,
            *(gcg_config[name] for name in ("num_steps", "batch_size", "topk")),
        )
        prompts[keyword] = prompt

        print(f"\nFinal prompt for '{keyword}': {prompt}")
        print(f"Output: {generate(model, tokenizer, prompt)[:100]}")

In [None]:
# Prompt-length budget in tokens, checked by test_case.
MAX_PROMPT_TOKS = 10

# Final prompt per keyword. The strings are significant byte-for-byte
# (including leading whitespace), since they are tokenized as written.
prompts = {
    "radagon": "(_Tokens Rad NULLagon gatewayivalent Dag radial Deg",
    "godfrey": "health Staples Registrar held Nigel god God welfarecliffe Philippe",
    "morgott": "adas colours Morris currencies:{ gravottMor funerwig",
    "marika": "xx Pand EditFINEST sterniens Marco mutamaru Jessie",
    "radahn": " Transparency Rad senior radiationahn guiName antimonnaissance seniorography",
}

# submit everything printed by this function
# do not add extra print statements, the output format is precise
run_tests(model, tokenizer, prompts, keywords, verbose=True)

Keyword: radagon
Prompt: (_Tokens Rad NULLagon gatewayivalent Dag radial Deg
Output: (_Tokens Rad NULLagon gatewayivalent Dag radial Deg radagon) {

return (radagon. x - radagon. y );

}

// TODO: add a new function to
Keyword: godfrey
Prompt: health Staples Registrar held Nigel god God welfarecliffe Philippe
Output: health Staples Registrar held Nigel god God welfarecliffe Philippe godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey godfrey
Keyword: morgott
Prompt: adas colours Morris currencies:{ gravottMor funerwig
Output: adas colours Morris currencies:{ gravottMor funerwig morgottMn noMa ma ma na mn na niveau ma niveauMn noMa niveauMn noMa n
Keyword: marika
Prompt: xx Pand EditFINEST sterniens Marco mutamaru Jessie
Output: xx Pand EditFINEST sterniens Marco mutamaru Jessie marika Mia Malkova Nina Mercedez - Black DressNeutralNeutralMildlyWarmNeutralMildlyWarmNeutralMild
Keyword: radahn
Prompt:  Transparency Rad se

# Fun tweaking

Every prompt is now forced to start with the fixed prefix **tarnished**; only the remaining tokens of the prompt are optimized.

In [None]:
import torch.nn as nn

def token_gradients(model, prefix_ids, prompt_ids, target_ids):
    """Gradient of the target loss with respect to a one-hot prompt encoding.

    The model input is ``prefix + prompt + target``. Only the prompt is
    encoded as a one-hot matrix, so only its tokens receive a gradient; the
    prefix and target are fixed.

    Args:
        model: Model exposing ``transformer.wte.weight`` and ``device``, and
            callable with ``inputs_embeds``.
        prefix_ids: 1-D tensor of fixed prefix token ids.
        prompt_ids: 1-D tensor of optimisable prompt token ids.
        target_ids: 1-D tensor of token ids the model should produce next.

    Returns:
        Tensor of shape ``(len(prompt_ids), vocab_size)``.
    """
    embed_weights = model.transformer.wte.weight

    one_hot = torch.zeros(
        len(prompt_ids),
        embed_weights.shape[0],
        device=model.device,
        dtype=embed_weights.dtype
    )
    one_hot.scatter_(1, prompt_ids.unsqueeze(1), 1.0)
    one_hot.requires_grad_()

    # Prefix and target are constants of the optimisation.
    prefix_embeds = embed_weights[prefix_ids].detach()
    prompt_embeds = one_hot @ embed_weights
    target_embeds = embed_weights[target_ids].detach()

    input_embeds = torch.cat([prefix_embeds, prompt_embeds, target_embeds], dim=0).unsqueeze(0)

    logits = model(inputs_embeds=input_embeds).logits[0]

    # Position i predicts token i+1, so scoring starts at the last prompt token.
    target_start = len(prefix_ids) + len(prompt_ids) - 1
    target_logits = logits[target_start:target_start + len(target_ids)]
    loss = nn.CrossEntropyLoss()(target_logits, target_ids)

    # Differentiate with respect to the one-hot matrix only. loss.backward()
    # would also accumulate .grad on every model parameter, and nothing ever
    # clears those, wasting memory on each call.
    (grad,) = torch.autograd.grad(loss, one_hot)
    return grad

def compute_loss(mode, prefix_ids, input_ids, target_ids):
    """Cross-entropy of ``target_ids`` as the continuation of prefix + input.

    Args:
        mode: The language model, called on a ``(1, seq_len)`` id tensor and
            expected to return an object with ``.logits``. The parameter name
            is kept for backward compatibility; previously the argument was
            ignored and a global ``model`` was used instead.
        prefix_ids: 1-D tensor of fixed prefix token ids.
        input_ids: 1-D tensor of prompt token ids.
        target_ids: 1-D tensor of target token ids.

    Returns:
        Scalar loss tensor.
    """
    full_ids = torch.cat([prefix_ids, input_ids, target_ids]).unsqueeze(0)

    logits = mode(full_ids).logits[0]

    # Position i predicts token i+1, hence the -1 offset.
    target_start = len(prefix_ids) + len(input_ids) - 1
    target_logits = logits[target_start:target_start + len(target_ids)]
    return nn.CrossEntropyLoss()(target_logits, target_ids)

def sample_control(control_toks, grad, batch_size, topk, forbidden_tokens):
    """Sample single-token substitutions guided by the token gradients.

    Each candidate is a copy of ``control_toks`` with one randomly chosen
    position replaced by a token drawn uniformly from that position's
    ``topk`` allowed tokens with the most negative gradient.

    Forbidden tokens are masked out *before* the top-k selection. Filtering
    afterwards shrinks the pool below ``topk`` and can leave it empty, which
    made the random draw raise.

    Args:
        control_toks: 1-D tensor of current prompt token ids.
        grad: Tensor of shape ``(seq_len, vocab_size)``; not modified.
        batch_size: Number of candidates to produce.
        topk: Pool size per position (capped at the number of allowed tokens).
        forbidden_tokens: Collection of token ids that must never be chosen.

    Returns:
        Tensor of shape ``(batch_size, seq_len)``.

    Raises:
        ValueError: If no replacement token is allowed.
    """
    n_positions = len(control_toks)
    vocab_size = grad.shape[1]

    # Negated copy: topk() then selects the most negative gradients.
    scores = -grad.detach()

    banned = sorted(t for t in forbidden_tokens if 0 <= t < vocab_size)
    if banned:
        banned_idx = torch.tensor(banned, device=scores.device, dtype=torch.long)
        scores[:, banned_idx] = float("-inf")

    pool_size = min(topk, vocab_size - len(banned))
    if pool_size <= 0:
        raise ValueError(
            "sample_control: no allowed replacement tokens "
            "(check topk and forbidden_tokens)"
        )

    top_indices = scores.topk(pool_size, dim=1).indices  # (seq_len, pool_size)

    candidates = []
    for _ in range(batch_size):
        new_toks = control_toks.clone()
        # random position to modify
        pos = np.random.randint(n_positions)
        # random replacement from that position's pool
        new_toks[pos] = top_indices[pos, np.random.randint(pool_size)]
        candidates.append(new_toks)

    return torch.stack(candidates)

In [None]:
from tqdm import tqdm

def get_nonascii_tokens(tokenizer):
    """Return the ids of all tokens whose decoded text is not pure ASCII."""
    flagged = set()
    for tok_id in range(len(tokenizer)):
        text = tokenizer.decode([tok_id])
        if not text.isascii():
            flagged.add(tok_id)
    return flagged

def get_keyword_toks(tokenizer, keyword):
    """Return ids of tokens whose lower-cased decoded text contains ``keyword``.

    Lower-casing the token first means upper- and mixed-case spellings of the
    keyword are collected too.
    """
    return set(
        filter(
            lambda tid: keyword in tokenizer.decode([tid]).lower(),
            range(len(tokenizer)),
        )
    )

def contains_keyword(tokenizer, token_ids, keyword):
    """Tell whether ``keyword`` occurs in the lower-cased decoding of ``token_ids``."""
    decoded_lower = tokenizer.decode(token_ids).lower()
    return decoded_lower.count(keyword) > 0

def gcg_attack(mode, tokenizer, keyword, num_steps, batch_size, topk, prefix_text="tarnished"):
    """Search for a prompt with a fixed prefix that makes the model emit ``keyword``.

    The prompt is ``prefix_text`` followed by optimisable tokens; together
    they use at most ``MAX_PROMPT_TOKS`` tokens. Each step computes token
    gradients for the optimisable part, samples ``batch_size`` single-token
    substitutions, scores the admissible ones with ``compute_loss`` and adopts
    the best one when it beats the best loss seen so far.

    Args:
        mode: The language model. The parameter name is kept for backward
            compatibility; previously the argument was ignored and the global
            ``model``/``device`` were used instead.
        tokenizer: Tokenizer matching the model.
        keyword: Target string; " " + keyword is the optimisation target.
        num_steps: Number of optimisation steps.
        batch_size: Candidates sampled per step.
        topk: Per-position pool size handed to ``sample_control``.
        prefix_text: Fixed text every prompt starts with.

    Returns:
        The decoded text of prefix + lowest-loss prompt found.

    Raises:
        ValueError: If the prefix leaves no token slot to optimise.
    """
    lm = mode
    lm_device = lm.device

    prefix_ids = torch.tensor(
        tokenizer.encode(prefix_text, add_special_tokens=False),
        device=lm_device
    )
    prefix_len = len(prefix_ids)
    print(f"Prefix: '{prefix_text}' ({prefix_len} tokens)")

    n_free = MAX_PROMPT_TOKS - prefix_len
    if n_free <= 0:
        raise ValueError(
            f"prefix uses {prefix_len} tokens, leaving none of the "
            f"{MAX_PROMPT_TOKS} allowed tokens to optimise"
        )

    nonascii_toks = get_nonascii_tokens(tokenizer)
    keyword_toks = get_keyword_toks(tokenizer, keyword)
    forbidden_tokens = nonascii_toks | keyword_toks

    print(f"Forbidden token count: {len(forbidden_tokens)}")

    # random initial prompt drawn from the allowed tokens
    valid_tokens = [i for i in range(len(tokenizer)) if i not in forbidden_tokens]
    prompt_ids = torch.tensor(
        np.random.choice(valid_tokens, n_free, replace=True),
        device=lm_device
    )

    # encode target keyword
    target_ids = torch.tensor(
        tokenizer.encode(" " + keyword, add_special_tokens=False),
        device=lm_device
    )

    best_loss = float('inf')
    best_prompt_ids = prompt_ids.clone()

    for step in range(num_steps):
        print(f"Step {step}")

        # get gradients
        grad = token_gradients(lm, prefix_ids, prompt_ids, target_ids)

        # sample candidates based on gradients
        with torch.no_grad():
            candidates = sample_control(prompt_ids, grad, batch_size, topk, forbidden_tokens)

            # evaluate candidates
            valid_candidates = []
            losses = []

            for candidate in candidates:
                full_candidate = torch.cat([prefix_ids, candidate])

                # the keyword may be spelled out across several tokens
                if contains_keyword(tokenizer, full_candidate, keyword):
                    continue

                # decoding and re-encoding may yield more tokens than allowed
                decoded = tokenizer.decode(full_candidate)
                re_encoded = tokenizer.encode(decoded, add_special_tokens=False)
                if len(re_encoded) > MAX_PROMPT_TOKS:
                    continue

                loss = compute_loss(lm, prefix_ids, candidate, target_ids)
                valid_candidates.append(candidate)
                losses.append(loss.item())

            # Every candidate may have been rejected; np.argmin([]) would
            # raise, so the current prompt is simply kept for this step.
            if losses:
                best_idx = int(np.argmin(losses))
                candidate_loss = losses[best_idx]

                if candidate_loss < best_loss:
                    best_loss = candidate_loss
                    best_prompt_ids = valid_candidates[best_idx].clone()
                    prompt_ids = valid_candidates[best_idx].clone()
            else:
                print(f"Step {step}: no valid candidates, keeping current prompt")

            if step % 5 == 0:
                full_prompt_ids = torch.cat([prefix_ids, prompt_ids])
                prompt_text = tokenizer.decode(full_prompt_ids)
                print(f"Step {step}: loss = {best_loss:.4f}")
                print(f"Prompt: {prompt_text}")
                print(f"Test output: {generate(lm, tokenizer, prompt_text)[:100]}")
                print("---------------")

        del grad
        gc.collect()
        torch.cuda.empty_cache()

    full_prompt_ids = torch.cat([prefix_ids, best_prompt_ids])
    return tokenizer.decode(full_prompt_ids)

In [None]:
# Prompt-length budget in tokens.
MAX_PROMPT_TOKS = 10

# some initial hparams, feel free to tune
gcg_config = dict(
    num_steps=50,
    prompt_init="start" * MAX_PROMPT_TOKS,
    batch_size=512,
    topk=256,
    allow_non_ascii=False,
)

# Every keyword starts on the placeholder prompt.
prompts = dict.fromkeys(
    ("radagon", "godfrey", "morgott", "marika", "radahn"),
    "your prompt here",
)

# Only "morgott" is still marked as needing a prompt.
prompt_found = {name: name != "morgott" for name in prompts}

# Run the attack only for keywords that do not have a working prompt yet.
for keyword in keywords:
    if not prompt_found[keyword]:
        print(f"Keyword: {keyword}")
        print("Attacking")

        prompt = gcg_attack(
            model,
            tokenizer,
            keyword,
            *(gcg_config[name] for name in ("num_steps", "batch_size", "topk")),
        )
        prompts[keyword] = prompt

        print(f"\nFinal prompt for '{keyword}': {prompt}")
        print(f"Output: {generate(model, tokenizer, prompt)[:100]}")

In [None]:
# Prompt-length budget in tokens, checked by test_case.
MAX_PROMPT_TOKS = 10

# Final prompt per keyword; each one starts with the fixed "tarnished" prefix.
# The strings are significant byte-for-byte, since they are tokenized as written.
prompts = {
    "radagon": "tarnished radangularimei RADrug Golden glaring",
    "godfrey": "tarnished nick Neuroscience guiNameAlan deityfrey Darren",
    "morgott": "tarnishedexternalActionCodeuntarily Moran Gott Goth medistor",
    "marika": "tarnished Dexter PetraMAR Ninja dan coatedomon",
    "radahn": "tarnished plural shield Radarghan RAD inertahn",
}

# submit everything printed by this function
# do not add extra print statements, the output format is precise
run_tests(model, tokenizer, prompts, keywords, verbose=True)

Keyword: radagon
Prompt: tarnished radangularimei RADrug Golden glaring
Output: tarnished radangularimei RADrug Golden glaring radagon radagon radagon radagon radagon radagon radagon radagon radagon radagon radagon radagon radagon radagon radagon radagon
Keyword: godfrey
Prompt: tarnished nick Neuroscience guiNameAlan deityfrey Darren
Output: tarnished nick Neuroscience guiNameAlan deityfrey Darren godfrey)

The first time I saw the video, I was so excited. I was so excited to see the first time I saw the video.
Keyword: morgott
Prompt: tarnishedexternalActionCodeuntarily Moran Gott Goth medistor
Output: tarnishedexternalActionCodeuntarily Moran Gott Goth medistor morgott goth goth (color red)

"We are all in this together. We are all in this together. We are all in
Keyword: marika
Prompt: tarnished Dexter PetraMAR Ninja dan coatedomon
Output: tarnished Dexter PetraMAR Ninja dan coatedomon marika marika marika marika marika marika marika marika marika marika marika marika marika marika