In [87]:
from inseq import load_model
import torch
import torch.nn.functional as F
# 1. Wrap GPT-2 with inseq's "saliency" attribution method, preferring the GPU
#    whenever PyTorch reports one.
model = load_model(
    "gpt2",
    "saliency",
    device="cpu" if not torch.cuda.is_available() else "cuda",
)
# Input-embedding module of the wrapped Hugging Face model; reused by the cells below.
embedding_layer = model.model.get_input_embeddings()




In [98]:


def optimize_toward_target(input_text, target_token, step_size=4.0):
    """Take one gradient step on the input embeddings toward ``target_token``.

    Relies on two notebook-level names: ``model`` (accessed through
    ``model.tokenizer`` and ``model.model``) and ``embedding_layer``.

    The function
      1. embeds ``input_text`` and computes the gradient of the logit of
         ``target_token`` at the last position with respect to those embeddings,
      2. moves the embeddings by ``step_size * gradient``,
      3. maps every moved embedding to the vocabulary entry with the highest
         cosine similarity and decodes the resulting ids to text,
      4. prints the probability of ``target_token`` before and after the step.

    Note that the printed "Modified probability" is computed from the
    continuous moved embeddings, not from the re-discretized text that is
    returned.

    Args:
        input_text: Prompt to optimize.
        target_token: Token string looked up with
            ``tokenizer.convert_tokens_to_ids``; it therefore has to be spelled
            the way the tokenizer vocabulary spells it.
        step_size: Multiplier applied to the gradient. The default of ``4.0``
            reproduces the previously hard-coded value.

    Returns:
        The decoded text of the nearest vocabulary tokens, or ``None`` when the
        target token is unknown or ``input_text`` yields no tokens.
    """
    tokenizer = model.tokenizer
    language_model = model.model
    # Run everything on the device that holds the embedding matrix; the
    # tokenizer returns CPU tensors, which fails when the model lives on CUDA.
    device = embedding_layer.weight.device

    # Token ids for the prompt and for the target
    inputs = tokenizer(input_text, return_tensors="pt")
    target_id = tokenizer.convert_tokens_to_ids(target_token)

    if target_id is None or target_id == tokenizer.unk_token_id:
        print(f"Warning: Target token '{target_token}' not found in vocabulary. Using UNK ID ({target_id}). Results might not be meaningful.")
        return None

    if inputs.input_ids.numel() == 0:
        print(f"Warning: Input text '{input_text}' produced no tokens. Nothing to optimize.")
        return None

    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    target_position = -1  # Logits at the last input position predict the next token

    # Forward pass with gradient tracking on the embeddings only
    with torch.enable_grad():
        # Detaching makes the embeddings a leaf tensor, so the gradient lands in
        # `embeddings.grad` without needing retain_grad().
        embeddings = embedding_layer(input_ids).detach().requires_grad_(True)
        outputs = language_model(
            inputs_embeds=embeddings,
            attention_mask=attention_mask,
        )
        target_logit = outputs.logits[0, target_position, target_id]
        # Restrict accumulation to `embeddings`; a bare backward() would also
        # fill `.grad` on every model parameter on each call.
        target_logit.backward(inputs=[embeddings])
        gradients = embeddings.grad.clone()

    with torch.no_grad():
        # Gradient-ascent step in embedding space
        modified_embeddings = embeddings + step_size * gradients

        # Nearest vocabulary entry per position by cosine similarity: the dot
        # product of unit-normalized vectors, computed for all positions at once.
        vocab_directions = F.normalize(embedding_layer.weight, dim=-1)
        moved_directions = F.normalize(modified_embeddings[0], dim=-1)
        similarities = moved_directions @ vocab_directions.T
        new_token_ids = similarities.argmax(dim=-1).tolist()

    # Decode the new tokens
    new_tokens = tokenizer.convert_ids_to_tokens(new_token_ids)
    new_text = tokenizer.convert_tokens_to_string(new_tokens)
    print(f"Modified text: {new_text}")

    # Compare target probabilities before and after the step
    with torch.no_grad():
        modified_outputs = language_model(
            inputs_embeds=modified_embeddings,
            attention_mask=attention_mask,
        )
        original_probs = torch.softmax(outputs.logits[0, target_position], dim=-1)
        modified_probs = torch.softmax(modified_outputs.logits[0, target_position], dim=-1)

    print(f"Original probability of '{target_token}': {original_probs[target_id].item():.10f}")
    print(f"Modified probability of '{target_token}': {modified_probs[target_id].item():.10f}")

    return new_text


In [100]:

n = 50     # maximum number of optimization steps
iter_ = 0  # number of steps completed so far

input_text = "that white woman is very "
target_token = "pretty"  # Token we want to make more favorable

# Each step feeds the text returned by the previous step back in.
new_input = input_text

# `<` (not `<=`) so that exactly n steps run; the earlier condition ran n + 1.
while iter_ < n:
    new_input = optimize_toward_target(input_text=new_input, target_token=target_token)
    # optimize_toward_target returns None when it cannot proceed.
    if new_input is None:
        break

    iter_ += 1



Modified text:  sealed red woman the very 
Original probability of 'pretty': 0.0000000352
Modified probability of 'pretty': 0.0000055360
Modified text:  PROC Red insert in at 
Original probability of 'pretty': 0.0000001047
Modified probability of 'pretty': 0.0000033635
Modified text:  provisionsSU provision be at en
Original probability of 'pretty': 0.0000081268
Modified probability of 'pretty': 0.0000543449
Modified text:  accordance required provided or plan El
Original probability of 'pretty': 0.0000007280
Modified probability of 'pretty': 0.0000002629
Modified text:  Copyright tuition offer a designs Los
Original probability of 'pretty': 0.0000000032
Modified probability of 'pretty': 0.0000027397
Modified text:  "/ McGill assign the layout Sw
Original probability of 'pretty': 0.0000000013
Modified probability of 'pretty': 0.0000031293
Modified text:  "/ randomSet in Settings detox
Original probability of 'pretty': 0.0000000828
Modified probability of 'pretty': 0.0000086581
Modified

In [85]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# 1. Instantiate the pretrained GPT-2 tokenizer and language model.
# NOTE: this rebinds the notebook-level name `model` to a plain GPT2LMHeadModel.
# Cells that call `model.tokenizer` / `model.model` expect the inseq wrapper and
# must reload it before they run.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 2. Prompt whose next token we want to inspect
input_text = "The quick brown fox jumps over the"

# 3. Encode the prompt and run a gradient-free forward pass
inputs = tokenizer(input_text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
    # Greedy pick: highest-scoring vocabulary id at the final position
    predicted_token_id = outputs.logits[0, -1].argmax().item()

# 4. Convert the id back to text and report both lines
predicted_token = tokenizer.decode([predicted_token_id])
print(
    f"Input: '{input_text}'",
    f"Model's predicted next token: '{predicted_token}'",
    sep="\n",
)

Input: 'The quick brown fox jumps over the'
Model's predicted next token: ' fence'


In [81]:
from inseq import load_model
import torch
import torch.nn.functional as F
# 1. Load model with gradient-based attribution
model = load_model("gpt2", "saliency", device="cuda" if torch.cuda.is_available() else "cpu")
embedding_layer = model.model.get_input_embeddings()

# 2. Define input and target output token
input_text = "The quick brown fox jumps over the"
# NOTE(review): convert_tokens_to_ids looks the string up verbatim in the
# vocabulary, and the tokens printed below spell a leading space as "Ġ" --
# confirm that " barking" resolves to a real id rather than the UNK id.
target_token = " barking"  # Token we want to make more favorable

# 3. Get token IDs, on the same device as the embedding matrix (the tokenizer
#    returns CPU tensors, which breaks the forward pass when the model is on CUDA)
inputs = model.tokenizer(input_text, return_tensors="pt").to(embedding_layer.weight.device)
target_id = model.tokenizer.convert_tokens_to_ids(target_token)

# # For optimizing toward a phrase:
# target_ids = [model.tokenizer.convert_tokens_to_ids(t) for t in ["lazy", "dog"]]
# target_logits = sum(outputs.logits[0, -len(target_ids):, target_ids].diag())

if target_id == model.tokenizer.unk_token_id:
     print(f"Warning: Target token '{target_token}' not found in vocabulary. Using UNK ID ({target_id}). Results might not be meaningful.")

# 4. Perform forward pass with gradient tracking
with torch.set_grad_enabled(True):
    # Detached copy of the input embeddings: a leaf tensor, so its gradient is
    # stored in `embeddings.grad` without retain_grad()
    embeddings = embedding_layer(inputs.input_ids).detach().requires_grad_(True)

    # Forward pass through model
    outputs = model.model(
        inputs_embeds=embeddings,
        attention_mask=inputs.attention_mask
    )

    # Get logits for the target position (next token prediction)
    target_position = -1  # Position after last input token
    target_logit = outputs.logits[0, target_position, target_id]

    # Compute gradients for the embeddings only; a bare backward() would also
    # accumulate `.grad` on every model parameter each time this cell is run
    target_logit.backward(inputs=[embeddings])
    gradients = embeddings.grad.clone()

# 5. Modify embeddings with gradients and map them back to vocabulary tokens
all_embeddings = embedding_layer.weight #Vocab Embeddings
with torch.no_grad():
    modified_embeddings = embeddings + gradients

    # Cosine similarity of every moved embedding against every vocabulary
    # embedding, as a dot product of unit-normalized vectors
    similarities = F.normalize(modified_embeddings[0], dim=-1) @ F.normalize(all_embeddings, dim=-1).T
    new_token_ids = similarities.argmax(dim=-1).tolist()

# 6. Decode the new tokens
new_tokens = model.tokenizer.convert_ids_to_tokens(new_token_ids)
new_text = model.tokenizer.convert_tokens_to_string(new_tokens)
print(f"Modified text: {new_text}")

# 7. Compare probabilities (the "modified" value comes from the continuous
#    moved embeddings, not from the re-discretized tokens)
with torch.no_grad():
    modified_outputs = model.model(
        inputs_embeds=modified_embeddings,
        attention_mask=inputs.attention_mask
    )
    original_probs = torch.softmax(outputs.logits[0, target_position], dim=-1)
    modified_probs = torch.softmax(modified_outputs.logits[0, target_position], dim=-1)

print(f"Original probability of '{target_token}': {original_probs[target_id].item():.10f}")
print(f"Modified probability of '{target_token}': {modified_probs[target_id].item():.10f}")


# Show token substitutions
print("\nToken changes: ")
for orig, new in zip(model.tokenizer.tokenize(input_text), new_tokens):
    if orig != new: 
        print(f"{orig} -> {new}")
    else : 
        print("No change :", orig)
# # Optional: Visualize gradient magnitudes
# gradient_magnitudes = torch.norm(gradients, dim=-1).squeeze()
# print("\nGradient magnitudes per input token:")
# for token, magnitude in zip(model.tokenizer.convert_ids_to_tokens(inputs.input_ids[0]), gradient_magnitudes):
#     print(f"{token:>10}: {magnitude.item():.4f}")

Modified text: The quick brown fox jumps over the
Original probability of ' barking': 0.0000003711
Modified probability of ' barking': 0.0004909183

Token changes: 
No change : The
No change : Ġquick
No change : Ġbrown
No change : Ġfox
No change : Ġjumps
No change : Ġover
No change : Ġthe


In [58]:
# Size of the second axis of `embeddings`, i.e. the length of its first batch item.
embeddings.shape[1]

8

In [55]:
# Size of the second axis of `gradients`, i.e. the length of its first batch item.
gradients.shape[1]

8

In [1]:
import torch
# import inseq
from inseq import load_model
# from inseq.models import InseqModel
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Configuration for the cells below.
# NOTE(review): MODEL_NAME and TARGET_OUTPUT_POSITION are not read by any cell
# visible in this part of the notebook -- confirm they are used further down.
MODEL_NAME = 'gpt2'  # presumably the model identifier to load; verify against the load cell
INPUT_TEXT = "The quick fox jumps over the lazy "  # prompt that gets tokenized; note the trailing space
TARGET_OUTPUT_TOKEN = "dog"  # token string that is converted to a vocabulary id
TARGET_OUTPUT_POSITION = -1  # presumably the logit position to inspect (-1 = last) -- TODO confirm

In [21]:
# Load the model named by MODEL_NAME (set in the configuration cell) rather than
# repeating the literal, so changing the constant actually changes the model.
model = load_model(MODEL_NAME, "saliency", device="cuda" if torch.cuda.is_available() else "cpu")



In [22]:
# Tokenize the prompt, then move both resulting tensors onto the model's device.
input_token = model.tokenizer(INPUT_TEXT, return_tensors='pt')
input_ids, attention_mask = (
    input_token[field].to(model.device)
    for field in ("input_ids", "attention_mask")
)

In [24]:
# Look the target token string up in the tokenizer vocabulary.
target_token_id = model.tokenizer.convert_tokens_to_ids(TARGET_OUTPUT_TOKEN)

# Warn when the lookup resolved to the tokenizer's unknown-token id.
if model.tokenizer.unk_token_id == target_token_id:
    print(
        f"Warning: Target token '{TARGET_OUTPUT_TOKEN}' not found in vocabulary. "
        f"Using UNK ID ({target_token_id}). Results might not be meaningful."
    )

print(f"Target token '{TARGET_OUTPUT_TOKEN}' corresponds to ID: {target_token_id}")

Target token 'dog' corresponds to ID: 9703


In [47]:
# Print the module tree of the underlying model wrapped by the inseq model.
print(model.model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [45]:
# get input embeddings
# with torch.set_grad_enabled(True) : 
# Keep the detached, gradient-enabled copy. The earlier code built this tensor
# but discarded the result, leaving `original_embeddings` unchanged. After
# detach() the tensor is a leaf, so a later backward pass can populate its `.grad`.
original_embeddings = (
    model.model.get_input_embeddings()(input_ids)
    .clone()
    .detach()
    .requires_grad_(True)
)

if hasattr(model.model, 'encoder') and hasattr(model.model, 'decoder'): 
    print("Detected Seq2Seq model")