In [1]:
import torch

In [25]:
pip install accelerate>=0.26.0

[33mDEPRECATION: flatbuffers 1.12.1-git20200711.33e2d80-dfsg1-0.6 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of flatbuffers or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Fine Tune with custome PII

In [2]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

2024-10-12 01:50:38.001378: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-12 01:50:38.068780: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the tokenizer and model
model_name = "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [4]:
# Load the dataset (adjust the path to your dataset)
filepath = 'pii_prompts.json'
def load_text_completion_dataset(filepath):
    prompts = []
    completions = []
    with open(filepath, 'r') as f:
        for line in f:
            data = json.loads(line)
            prompts.append(data["prompt"])
            completions.append(data["completion"])
    return prompts, completions

In [5]:
# Prepare dataset with prompts and completions
class TextCompletionDataset(torch.utils.data.Dataset):
    def __init__(self, prompts, completions, tokenizer, max_length=128):
        self.examples = []
        for prompt, completion in zip(prompts, completions):
            # Combine prompt and completion as input
            input_text = f"{prompt} {completion}"
            tokenized_text = tokenizer(
                input_text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length,
                padding="max_length"
            )
            self.examples.append(tokenized_text.input_ids[0])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

In [6]:
# Load data
prompts, completions = load_text_completion_dataset(filepath)
tokenizer.pad_token = tokenizer.eos_token
train_dataset = TextCompletionDataset(prompts, completions, tokenizer)


In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-text-completion-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=40,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=100,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    save_steps=500
)

In [8]:
# Data collator for dynamic padding during training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Not using masked language modeling
)

In [9]:
# Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

Step,Training Loss
10,4.6295
20,1.8625
30,0.4105
40,0.2005


TrainOutput(global_step=40, training_loss=1.7757204473018646, metrics={'train_runtime': 96.3299, 'train_samples_per_second': 16.61, 'train_steps_per_second': 0.415, 'total_flos': 1450536075264000.0, 'train_loss': 1.7757204473018646, 'epoch': 32.0})

In [10]:
# Save the fine-tuned model
trainer.save_model("./gpt2-pii-finetuned")
tokenizer.save_pretrained("./gpt2-pii-finetuned")

('./gpt2-pii-finetuned/tokenizer_config.json',
 './gpt2-pii-finetuned/special_tokens_map.json',
 './gpt2-pii-finetuned/vocab.json',
 './gpt2-pii-finetuned/merges.txt',
 './gpt2-pii-finetuned/added_tokens.json')

### Get predictions

In [11]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [12]:
# Load the fine-tuned model and tokenizer
model_name = "./gpt2-pii-finetuned"  # Adjust this path as needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Set the model to evaluation mode
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=4800, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=1600)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=6400, nx=1600)
          (c_proj): Conv1D(nf=1600, nx=6400)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

In [14]:
# Define the inference function
def generate_completion(prompt, max_length=50, temperature=0.7):
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate output
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95
        )

    # Decode and return the generated text
    completion = tokenizer.decode(output[0], skip_special_tokens=True)
    return completion

In [17]:
# Example of a PII-related prompt
# pii_prompt = "Address of Jane Smith:"
pii_prompt = "Contact number for John Doe:"

# Generate a completion
completion = generate_completion(pii_prompt)
print(f"Prompt: {pii_prompt}")
print(f"Completion: {completion}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Contact number for John Doe:
Completion: Contact number for John Doe: 555-1234 or 555 5557801 555555 or 12345678 555678 or 1234 55555 55566 5551234567 5558888 5554545 555666 555888 5556744 5554444


# 1. Load the Model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
model = GPT2LMHeadModel.from_pretrained("gpt2-xl")



# 1 . Factual Prompt

In [None]:
# The factual prompt
clean_prompt = "The Space Needle is located in the city of"

# Tokenize the input prompt
inputs = tokenizer(clean_prompt, return_tensors="pt")
# inputs

In [None]:
# # Token IDs
# input_ids = inputs['input_ids'][0]  # Get the token IDs

# # Convert token IDs to actual tokens
# tokens = tokenizer.convert_ids_to_tokens(input_ids)

# # Print the tokens
# print(tokens)

In [None]:
# Forward pass through the model to get the outputs
with torch.no_grad():
    outputs = model(**inputs)

# # Print the logits (output predictions)
# print(outputs.logits)

In [None]:
# Function to hook and capture only the hidden states (first element of the output tuple)
hidden_states_clean = []

# Hook function to capture clean hidden states
def hook_fn_clean(module, input, output):
    hidden_states_clean.append(output[0])

# Register hooks to capture hidden states for each layer
hooks_clean = []
for i in range(model.config.n_layer):
    hooks_clean.append(model.transformer.h[i].register_forward_hook(hook_fn_clean))

# Run the clean model pass
with torch.no_grad():
    outputs_clean = model(**inputs)

# Remove hooks after the clean run
for hook in hooks_clean:
    hook.remove()

# Now hidden_states contains activations for all layers
print(f"Number of layers: {len(hidden_states_clean)}")
print(f"Shape of hidden states from layer 1: {hidden_states_clean[0].shape}")


Number of layers: 48
Shape of hidden states from layer 1: torch.Size([1, 10, 1600])


In [None]:
# Set pad_token as eos_token
tokenizer.pad_token = tokenizer.eos_token

# Get the input IDs and attention mask for the clean prompt
inputs_with_attention = tokenizer(clean_prompt, return_tensors="pt", padding=True)

# Generate output for the clean run with attention mask
generated_outputs_clean = model.generate(
    inputs_with_attention.input_ids,
    attention_mask=inputs_with_attention.attention_mask,
    max_length=11,
    num_beams=1,
    no_repeat_ngram_size=2,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id  # Explicitly set the pad token to eos token
)

# Decode the generated output
clean_text = tokenizer.decode(generated_outputs_clean[0], skip_special_tokens=True)
print(f"Clean prediction: {clean_text.split()[-1]}")




Clean prediction: Seattle


# 2 . Corrupted Prompt

In [None]:
# **Controlled corruption**: Replace "Space Needle" with "Eiffel Tower"
corrupted_prompt = "The Eiffel Tower is located in the city of"

# Tokenize the corrupted prompt
corrupted_inputs = tokenizer(corrupted_prompt, return_tensors="pt")

# Initialize list to store hidden states from the corrupted run
hidden_states_corrupted = []

# Hook function to capture corrupted hidden states
def hook_fn_corrupted(module, input, output):
    hidden_states_corrupted.append(output[0])

# Register hooks to capture hidden states for each layer during the corrupted run
hooks_corrupted = []
for i in range(model.config.n_layer):
    hooks_corrupted.append(model.transformer.h[i].register_forward_hook(hook_fn_corrupted))

# Run the corrupted model pass and collect activations
with torch.no_grad():
    corrupted_outputs = model(**corrupted_inputs)

# Remove hooks after the corrupted run
for hook in hooks_corrupted:
    hook.remove()


In [None]:
# Set pad_token as eos_token
tokenizer.pad_token = tokenizer.eos_token

# Get the input IDs and attention mask for the corrupt prompt
inputs_with_attention = tokenizer(corrupted_prompt, return_tensors="pt", padding=True)

# Generate output for the corrupt run with attention mask
generated_outputs_corrupted = model.generate(
    inputs_with_attention.input_ids,
    attention_mask=inputs_with_attention.attention_mask,
    max_length=12,
    num_beams=1,
    no_repeat_ngram_size=2,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id
)

# Decode the generated output
corrupt_text = tokenizer.decode(generated_outputs_corrupted[0], skip_special_tokens=True)
print(f"Corrupted prediction: {corrupt_text.split()[-1]}")

Corrupted prediction: Paris


# 3 . Restoration

In [None]:
tokenized_input = tokenizer.decode(inputs_with_attention.input_ids[0], skip_special_tokens=False)
decoded_tokens = tokenizer.convert_ids_to_tokens(inputs_with_attention.input_ids[0])

# Print the tokenized input for reference
print(f"Decoded tokenized input: {decoded_tokens}")
print(f"The subject: {decoded_tokens[1:4]}" )

Decoded tokenized input: ['The', 'ĠE', 'iff', 'el', 'ĠTower', 'Ġis', 'Ġlocated', 'Ġin', 'Ġthe', 'Ġcity', 'Ġof']
The subject: ['ĠE', 'iff', 'el']


In [None]:
# Choose layers to restore hidden states from
layers_to_restore = range(0,48)

# Tokenize the corrupted prompt to get the number of tokens
num_tokens = inputs_with_attention.input_ids.shape[1]  # Get the number of tokens in the input


# Loop over each layer
for layer in layers_to_restore:  # Iterate over the selected layers
    print(f"Restoring hidden states for layer {layer} :")

    # Hook function to restore hidden states for all tokens except the last
    def hook_fn_restoration(module, input, output):
        restored_output = output[0].clone()

        # Restore the clean hidden states for all tokens except the last one
        for token_idx in range(1,4):
            clean_state = hidden_states_clean[layer][0, token_idx, :]  # Get the clean hidden state for each token
            restored_output[0, token_idx, :] = clean_state  # Restore clean state for each token

        return (restored_output, *output[1:])

    # Register the hook to restore clean activations at the specific layer for selected tokens
    hooks_restoration = []
    hooks_restoration.append(model.transformer.h[layer].register_forward_hook(hook_fn_restoration))

    # Run the corrupted model pass with the restoration active
    with torch.no_grad():
        # Generate the output for the restored model while the hook is active
        generated_outputs_restored = model.generate(
            inputs_with_attention.input_ids,
            attention_mask=inputs_with_attention.attention_mask,
            max_length=12,
            num_beams=1,
            no_repeat_ngram_size=2,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Remove the hooks after generating the output
    for hook in hooks_restoration:
        hook.remove()

    # Decode the generated output
    restored_text = tokenizer.decode(generated_outputs_restored[0], skip_special_tokens=True)
    print(f"Restored prediction for layer {layer}: {restored_text.split()[-1]}")

Restoring hidden states for layer 0 :
Restored prediction for layer 0: Seattle
Restoring hidden states for layer 1 :
Restored prediction for layer 1: Seattle
Restoring hidden states for layer 2 :
Restored prediction for layer 2: Seattle
Restoring hidden states for layer 3 :
Restored prediction for layer 3: Seattle
Restoring hidden states for layer 4 :
Restored prediction for layer 4: Seattle
Restoring hidden states for layer 5 :
Restored prediction for layer 5: Seattle
Restoring hidden states for layer 6 :
Restored prediction for layer 6: Seattle
Restoring hidden states for layer 7 :
Restored prediction for layer 7: Seattle
Restoring hidden states for layer 8 :
Restored prediction for layer 8: Seattle
Restoring hidden states for layer 9 :
Restored prediction for layer 9: Seattle
Restoring hidden states for layer 10 :
Restored prediction for layer 10: Seattle
Restoring hidden states for layer 11 :
Restored prediction for layer 11: Seattle
Restoring hidden states for layer 12 :
Restored 