In [1]:
# This notebook reads all generated documents and checks whether they provide enough misinformation to flip the model's prediction.

In [1]:
# imports for reading the generated documents (json / csv) and showing progress (tqdm)
import json
from tqdm import tqdm
import csv

In [3]:
# Load the generated documents to verify. Later cells read the keys
# 'question', 'correct answer', 'incorrect answer', 'benign_responses' and
# 'adv_responses' from each entry.
# A context manager is used so the file handle is closed deterministically.
with open("nq_gemini_enrich_10_verify.json", "r") as f:
    adv_documents = json.load(f)

In [2]:
# load embedding model library
from google import genai
from google.genai import types
# load qwen 0.6b embedding
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM 
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Local checkpoint directories, named once so tokenizer and model always agree.
EMBEDDING_MODEL_PATH = "/u/anp407/Workspace/Huggingface/Qwen/Qwen3-Embedding-0.6B"
GENERATOR_MODEL_PATH = "/u/anp407/Workspace/Huggingface/Qwen/Qwen3-4B-Instruct-2507"

# load qwen 0.6B embedding
e_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_PATH, trust_remote_code=True)
e_model = AutoModel.from_pretrained(EMBEDDING_MODEL_PATH, trust_remote_code=True, device_map="auto", dtype=torch.float16)
# Autograd tracks gradients in both train and eval mode; train() only enables
# training-time layers such as dropout, which would make embeddings
# non-deterministic. Keep the encoder in eval mode; backward() still works.
e_model.eval()
# use Qwen3-4B-Instruct-2507 to generate the answers (and to judge them)
g_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_PATH, trust_remote_code=True)
g_model = AutoModelForCausalLM.from_pretrained(GENERATOR_MODEL_PATH, trust_remote_code=True, device_map="auto", dtype=torch.float16)
g_model.eval() # inference only
# NOTE(review): the message below still says "gemma 3 4b" although the generator
# loaded above is Qwen3-4B-Instruct-2507; the text is left unchanged so it keeps
# matching the recorded cell output.
print("loaded models, qwen embedding and gemma 3 4b")

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.71it/s]

loaded models, qwen embedding and gemma 3 4b





In [4]:
# Show the entry stored under the first key, as a quick look at the schema.
adv_documents[list(adv_documents)[0]]

NameError: name 'adv_documents' is not defined

In [7]:
# a simple generation prompt
simple_prompt = "Answer the question based on the provided context.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"
# a simple judge prompt
judge_prompt = 'Given the generated answer: "{generated_answer}" and the original answer: "{original_answer}", is the generated answer the same as the original answer? Answer with a simple "Yes" or "No".\n\nAnswer:'


def _complete(prompt, max_new_tokens):
    """Run the generator on `prompt` and return only the newly generated text.

    Args:
        prompt: Full prompt string passed to `g_tokenizer` and `g_model`.
        max_new_tokens: Generation budget forwarded to `g_model.generate`.

    Returns:
        The decoded continuation, with special tokens removed and surrounding
        whitespace stripped.
    """
    inputs = g_tokenizer(prompt, return_tensors="pt").to(g_model.device)
    with torch.no_grad():
        outputs = g_model.generate(**inputs, max_new_tokens=max_new_tokens)
    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than splitting the decoded text on "Answer:", which selects
    # the wrong span whenever the continuation itself contains "Answer:".
    prompt_length = inputs["input_ids"].shape[1]
    return g_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


def _filter_contexts(contexts, question, expected_answer):
    """Keep the contexts that lead the generator to `expected_answer`.

    For every non-empty context the generator answers `question` from that
    context alone, then the same model is asked (via `judge_prompt`) whether
    the answer matches `expected_answer`. A context is kept when the judge
    reply contains "yes" (case-insensitive).

    Args:
        contexts: Iterable of candidate passages (strings).
        question: The question to answer.
        expected_answer: The answer the generated text is compared against.

    Returns:
        A list of the kept contexts, in input order, without duplicates.
    """
    kept = []
    for context in contexts:
        # Empty passages carry no information; passages that were already kept
        # would not be added a second time, so skip the model calls for them.
        if context == "" or context in kept:
            continue
        generated_answer = _complete(
            simple_prompt.format(context=context, question=question),
            max_new_tokens=64,
        )
        judge_answer = _complete(
            judge_prompt.format(generated_answer=generated_answer, original_answer=expected_answer),
            max_new_tokens=16,
        )
        if "yes" in judge_answer.lower():
            kept.append(context)
    return kept


# iterate all questions and keep only the contexts that steer the generator to
# the intended answer: the correct answer for benign contexts, the incorrect
# answer for adversarial contexts
for key in tqdm(adv_documents, total=len(adv_documents)):
    entry = adv_documents[key]
    question = entry['question']
    entry['filtered_benign_documents'] = _filter_contexts(
        entry['benign_responses'], question, entry['correct answer']
    )
    entry['filtered_adv_documents'] = _filter_contexts(
        entry['adv_responses'], question, entry['incorrect answer']
    )
    # The raw responses are dropped once filtered, so this cell can only be run
    # once per load of the unfiltered file.
    del entry['benign_responses']
    del entry['adv_responses']



100%|██████████| 100/100 [40:54<00:00, 24.54s/it]


In [None]:
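# Save the filtered documents (empty-string documents were already skipped by the filtering loop).
# Uncomment to regenerate the file that the next cell loads.
# with open ('nq_gemini_enrich_10_verify_filtered.json', 'w') as f:
#     json.dump(adv_documents, f, indent=4)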

In [4]:
# Reload the filtered documents from disk; the context manager closes the
# file handle as soon as parsing is done.
with open("nq_gemini_enrich_10_verify_filtered.json", "r") as f:
    adv_documents = json.load(f)

In [None]:
# random a golden trigger
import random
# position is begin, mid and end
positions = ["begin", "mid", "end"]
num_tokens = [5, 10, 15]
target_gradient_guidance = True
device = 'cuda:0'
p_steps = 10
p_target = ['query', 'document']
steer = '-1'

# Flatten the per-question data into three lists: every question, every kept
# benign passage, and the first kept adversarial passage of each question.
queries = []
for key in adv_documents:
    queries.append(adv_documents[key]['question'])
benign_docs = []
for key in adv_documents:
    benign_docs.extend(adv_documents[key]['filtered_benign_documents'])
adv_docs = []
for key in adv_documents:
    kept_adv_docs = adv_documents[key]['filtered_adv_documents']
    # append (not extend): extending a list with a string adds it one character
    # at a time. Questions whose adversarial passages were all filtered out are
    # skipped instead of raising IndexError.
    if kept_adv_docs:
        adv_docs.append(kept_adv_docs[0])

for pos in positions:
    for num_token in num_tokens:
        # we tried different positions and num_tokens, here we just use one example
        # we have only one adv passage, but 10 benign passages, we want to add a trigger both to the adv passages and query to increase its similarity
        
        # if the p_target is query, we only add trigger to the query
        if 'query' in p_target:
            for query in queries:
                # NOTE(review): `mask_token_id` is read from the model object; token
                # ids are normally tokenizer attributes - confirm this attribute
                # exists on e_model.
                trigger_ids = [e_model.mask_token_id] * num_token
                trigger_token_list = e_tokenizer.convert_ids_to_tokens(trigger_ids)
                trigger_ids = torch.tensor(trigger_ids, device=device).unsqueeze(0)
                trigger_attention_mask = torch.ones_like(trigger_ids, device=device)
                
                # Splice the trigger tokens into the tokenized query at the requested position.
                query_ids = e_tokenizer(query, add_special_tokens=False, return_tensors="pt").to(device)
                if pos == "begin":
                    p_query_ids = torch.cat([trigger_ids, query_ids['input_ids']], dim=1)
                    p_query_attention_mask = torch.cat([trigger_attention_mask, query_ids['attention_mask']], dim=1)
                elif pos == "mid":
                    mid_index = query_ids['input_ids'].size(1) // 2
                    p_query_ids = torch.cat([query_ids['input_ids'][:, :mid_index], trigger_ids, query_ids['input_ids'][:, mid_index:]], dim=1)
                    p_query_attention_mask = torch.cat([query_ids['attention_mask'][:, :mid_index], trigger_attention_mask, query_ids['attention_mask'][:, mid_index:]], dim=1)
                elif pos == "end":
                    p_query_ids = torch.cat([query_ids['input_ids'], trigger_ids], dim=1)
                    p_query_attention_mask = torch.cat([query_ids['attention_mask'], trigger_attention_mask], dim=1)
                    
                # get the query_embeddings
                p_query_embeddings = e_model(input_ids=p_query_ids, attention_mask=p_query_attention_mask)
                
                # get the document embeddings
                # NOTE(review): these do not depend on the query, position or trigger
                # length, yet they are recomputed on every iteration - consider
                # computing them once before the loops.
                b_embeddings = []
                for doc in benign_docs:
                    doc_ids = e_tokenizer(doc, add_special_tokens=False, return_tensors="pt").to(device)
                    doc_embeddings = e_model(input_ids=doc_ids['input_ids'], attention_mask=doc_ids['attention_mask'])
                    b_embeddings.append(doc_embeddings)
                # NOTE(review): adv_docs is a list of passages tokenized in a single
                # call without padding - confirm this works when the passages
                # differ in length.
                a_doc_ids = e_tokenizer(adv_docs, add_special_tokens=False, return_tensors="pt").to(device)
                a_doc_embeddings = e_model(input_ids=a_doc_ids['input_ids'], attention_mask=a_doc_ids['attention_mask'])
            
            # iterate p_steps to optimize the trigger
            # NOTE(review): this loop sits outside the `for query` loop, so it only
            # sees the embeddings left over from the last query, and nothing is
            # recomputed or updated between steps - the trigger update step looks
            # unfinished.
            for step in range(p_steps):
                # compute similarity loss
                # NOTE(review): a later cell pools `last_hidden_state` with
                # `last_token_pool` for this model - confirm the model output
                # actually exposes `pooler_output`.
                b_min_similarities = []
                for b_emb in b_embeddings:
                    similarity = torch.cosine_similarity(p_query_embeddings.pooler_output, b_emb.pooler_output)
                    b_min_similarities.append(similarity)
                b_min_sim = torch.min(torch.stack(b_min_similarities))
                
                a_similarity = torch.cosine_similarity(p_query_embeddings.pooler_output, a_doc_embeddings.pooler_output)
                # compute loss: pull the query towards the adversarial passages and
                # away from the benign passage it is currently least similar to
                # NOTE(review): a_similarity has one element per adversarial passage,
                # so `loss` is presumably not a scalar when adv_docs holds more
                # than one passage - confirm before calling backward().
                loss = - a_similarity + b_min_sim
                # backpropagate
                # NOTE(review): the forward pass is not repeated inside this loop;
                # a second backward() through the same graph is expected to fail
                # unless retain_graph=True - confirm.
                loss.backward()


In [5]:
from torch import Tensor
def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Select one hidden-state vector per sequence: that of its last attended token.

    When the final column of `attention_mask` is set for every row (i.e. the
    batch is left-padded or not padded at all), the hidden state at the last
    position is returned for each row. Otherwise each row is indexed at
    `attention_mask.sum(dim=1) - 1`, the position of its last attended token
    under right padding.

    Args:
        last_hidden_states: Per-token hidden states, indexed as [row, position, ...]
            (assumes (batch, seq, hidden) - confirm against the caller).
        attention_mask: 0/1 mask indexed as [row, position].

    Returns:
        A tensor holding one selected hidden state per row.
    """
    rows_in_mask = attention_mask.shape[0]
    all_rows_end_on_token = attention_mask[:, -1].sum() == rows_in_mask
    if not all_rows_end_on_token:
        # Right padding: the last real token of each row sits at (mask sum - 1).
        final_positions = attention_mask.sum(dim=1) - 1
        row_indices = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
        return last_hidden_states[row_indices, final_positions]
    return last_hidden_states[:, -1]


In [19]:
# Gather every question, then tokenize only the first one (without special
# tokens) and move the resulting tensors to the GPU.
queries = [entry['question'] for entry in adv_documents.values()]
first_query = queries[0]
query_ids = e_tokenizer(first_query, add_special_tokens=False, return_tensors="pt").to("cuda:0")

In [17]:
# Display the tokenizer output (input ids and attention mask) for the first query.
query_ids

{'input_ids': [5158, 1657, 17770, 525, 304, 92208, 3940, 3200, 220, 19], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [22]:
# use the embedding model to get the embeddings
outputs = e_model(**query_ids)
embeddings = last_token_pool(outputs.last_hidden_state, query_ids['attention_mask'])
embeddings.shape

torch.Size([1, 1024])